/[pcre]/code/trunk/study.c
ViewVC logotype

Diff of /code/trunk/study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 27 by nigel, Sat Feb 24 21:38:49 2007 UTC revision 75 by nigel, Sat Feb 24 21:40:37 2007 UTC
# Line 9  the file Tech.Notes for some information Line 9  the file Tech.Notes for some information
9    
10  Written by: Philip Hazel <ph10@cam.ac.uk>  Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12             Copyright (c) 1997-1999 University of Cambridge             Copyright (c) 1997-2004 University of Cambridge
13    
14  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
15  Permission is granted to anyone to use this software for any purpose on any  Redistribution and use in source and binary forms, with or without
16  computer system, and to redistribute it freely, subject to the following  modification, are permitted provided that the following conditions are met:
 restrictions:  
   
 1. This software is distributed in the hope that it will be useful,  
    but WITHOUT ANY WARRANTY; without even the implied warranty of  
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
17    
18  2. The origin of this software must not be misrepresented, either by      * Redistributions of source code must retain the above copyright notice,
19     explicit claim or by omission.        this list of conditions and the following disclaimer.
20    
21  3. Altered versions must be plainly marked as such, and must not be      * Redistributions in binary form must reproduce the above copyright
22     misrepresented as being the original software.        notice, this list of conditions and the following disclaimer in the
23          documentation and/or other materials provided with the distribution.
24    
25        * Neither the name of the University of Cambridge nor the names of its
26          contributors may be used to endorse or promote products derived from
27          this software without specific prior written permission.
28    
29    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
30    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
33    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
34    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
35    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
36    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
37    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
38    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
39    POSSIBILITY OF SUCH DAMAGE.
40  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
41  */  */
42    
# Line 53  Returns:        nothing Line 65  Returns:        nothing
65  */  */
66    
67  static void  static void
68  set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd)  set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)
69  {  {
70  start_bits[c/8] |= (1 << (c&7));  start_bits[c/8] |= (1 << (c&7));
71  if (caseless && (cd->ctypes[c] & ctype_letter) != 0)  if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
# Line 74  Arguments: Line 86  Arguments:
86    code         points to an expression    code         points to an expression
87    start_bits   points to a 32-byte table, initialized to 0    start_bits   points to a 32-byte table, initialized to 0
88    caseless     the current state of the caseless flag    caseless     the current state of the caseless flag
89      utf8         TRUE if in UTF-8 mode
90    cd           the block with char table pointers    cd           the block with char table pointers
91    
92  Returns:       TRUE if table built, FALSE otherwise  Returns:       TRUE if table built, FALSE otherwise
# Line 81  Returns:       TRUE if table built, FALS Line 94  Returns:       TRUE if table built, FALS
94    
95  static BOOL  static BOOL
96  set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,  set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
97    compile_data *cd)    BOOL utf8, compile_data *cd)
98  {  {
99  register int c;  register int c;
100    
# Line 95  volatile int dummy; Line 108  volatile int dummy;
108    
109  do  do
110    {    {
111    const uschar *tcode = code + 3;    const uschar *tcode = code + 1 + LINK_SIZE;
112    BOOL try_next = TRUE;    BOOL try_next = TRUE;
113    
114    while (try_next)    while (try_next)
115      {      {
     try_next = FALSE;  
   
116      /* If a branch starts with a bracket or a positive lookahead assertion,      /* If a branch starts with a bracket or a positive lookahead assertion,
117      recurse to set bits from within them. That's all for this branch. */      recurse to set bits from within them. That's all for this branch. */
118    
119      if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)      if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
120        {        {
121        if (!set_start_bits(tcode, start_bits, caseless, cd))        if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))
122          return FALSE;          return FALSE;
123          try_next = FALSE;
124        }        }
125    
126      else switch(*tcode)      else switch(*tcode)
# Line 116  do Line 128  do
128        default:        default:
129        return FALSE;        return FALSE;
130    
131          /* Skip over callout */
132    
133          case OP_CALLOUT:
134          tcode += 2 + 2*LINK_SIZE;
135          break;
136    
137          /* Skip over extended extraction bracket number */
138    
139          case OP_BRANUMBER:
140          tcode += 3;
141          break;
142    
143        /* Skip over lookbehind and negative lookahead assertions */        /* Skip over lookbehind and negative lookahead assertions */
144    
145        case OP_ASSERT_NOT:        case OP_ASSERT_NOT:
146        case OP_ASSERTBACK:        case OP_ASSERTBACK:
147        case OP_ASSERTBACK_NOT:        case OP_ASSERTBACK_NOT:
148        try_next = TRUE;        do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
149        do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);        tcode += 1+LINK_SIZE;
       tcode += 3;  
150        break;        break;
151    
152        /* Skip over an option setting, changing the caseless flag */        /* Skip over an option setting, changing the caseless flag */
# Line 131  do Line 154  do
154        case OP_OPT:        case OP_OPT:
155        caseless = (tcode[1] & PCRE_CASELESS) != 0;        caseless = (tcode[1] & PCRE_CASELESS) != 0;
156        tcode += 2;        tcode += 2;
       try_next = TRUE;  
157        break;        break;
158    
159        /* BRAZERO does the bracket, but carries on. */        /* BRAZERO does the bracket, but carries on. */
160    
161        case OP_BRAZERO:        case OP_BRAZERO:
162        case OP_BRAMINZERO:        case OP_BRAMINZERO:
163        if (!set_start_bits(++tcode, start_bits, caseless, cd))        if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))
164          return FALSE;          return FALSE;
165        dummy = 1;        dummy = 1;
166        do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);        do tcode += GET(tcode,1); while (*tcode == OP_ALT);
167        tcode += 3;        tcode += 1+LINK_SIZE;
       try_next = TRUE;  
168        break;        break;
169    
170        /* Single-char * or ? sets the bit and tries the next item */        /* Single-char * or ? sets the bit and tries the next item */
# Line 154  do Line 175  do
175        case OP_MINQUERY:        case OP_MINQUERY:
176        set_bit(start_bits, tcode[1], caseless, cd);        set_bit(start_bits, tcode[1], caseless, cd);
177        tcode += 2;        tcode += 2;
178        try_next = TRUE;  #ifdef SUPPORT_UTF8
179          if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
180    #endif
181        break;        break;
182    
183        /* Single-char upto sets the bit and tries the next */        /* Single-char upto sets the bit and tries the next */
# Line 163  do Line 186  do
186        case OP_MINUPTO:        case OP_MINUPTO:
187        set_bit(start_bits, tcode[3], caseless, cd);        set_bit(start_bits, tcode[3], caseless, cd);
188        tcode += 4;        tcode += 4;
189        try_next = TRUE;  #ifdef SUPPORT_UTF8
190          if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
191    #endif
192        break;        break;
193    
194        /* At least one single char sets the bit and stops */        /* At least one single char sets the bit and stops */
195    
196        case OP_EXACT:       /* Fall through */        case OP_EXACT:       /* Fall through */
197        tcode++;        tcode += 2;
   
       case OP_CHARS:       /* Fall through */  
       tcode++;  
198    
199          case OP_CHAR:
200          case OP_CHARNC:
201        case OP_PLUS:        case OP_PLUS:
202        case OP_MINPLUS:        case OP_MINPLUS:
203        set_bit(start_bits, tcode[1], caseless, cd);        set_bit(start_bits, tcode[1], caseless, cd);
204          try_next = FALSE;
205        break;        break;
206    
207        /* Single character type sets the bits and stops */        /* Single character type sets the bits and stops */
# Line 184  do Line 209  do
209        case OP_NOT_DIGIT:        case OP_NOT_DIGIT:
210        for (c = 0; c < 32; c++)        for (c = 0; c < 32; c++)
211          start_bits[c] |= ~cd->cbits[c+cbit_digit];          start_bits[c] |= ~cd->cbits[c+cbit_digit];
212          try_next = FALSE;
213        break;        break;
214    
215        case OP_DIGIT:        case OP_DIGIT:
216        for (c = 0; c < 32; c++)        for (c = 0; c < 32; c++)
217          start_bits[c] |= cd->cbits[c+cbit_digit];          start_bits[c] |= cd->cbits[c+cbit_digit];
218          try_next = FALSE;
219        break;        break;
220    
221        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
222        for (c = 0; c < 32; c++)        for (c = 0; c < 32; c++)
223          start_bits[c] |= ~cd->cbits[c+cbit_space];          start_bits[c] |= ~cd->cbits[c+cbit_space];
224          try_next = FALSE;
225        break;        break;
226    
227        case OP_WHITESPACE:        case OP_WHITESPACE:
228        for (c = 0; c < 32; c++)        for (c = 0; c < 32; c++)
229          start_bits[c] |= cd->cbits[c+cbit_space];          start_bits[c] |= cd->cbits[c+cbit_space];
230          try_next = FALSE;
231        break;        break;
232    
233        case OP_NOT_WORDCHAR:        case OP_NOT_WORDCHAR:
234        for (c = 0; c < 32; c++)        for (c = 0; c < 32; c++)
235          start_bits[c] |= ~(cd->cbits[c] | cd->cbits[c+cbit_word]);          start_bits[c] |= ~cd->cbits[c+cbit_word];
236          try_next = FALSE;
237        break;        break;
238    
239        case OP_WORDCHAR:        case OP_WORDCHAR:
240        for (c = 0; c < 32; c++)        for (c = 0; c < 32; c++)
241          start_bits[c] |= (cd->cbits[c] | cd->cbits[c+cbit_word]);          start_bits[c] |= cd->cbits[c+cbit_word];
242          try_next = FALSE;
243        break;        break;
244    
245        /* One or more character type fudges the pointer and restarts, knowing        /* One or more character type fudges the pointer and restarts, knowing
# Line 217  do Line 248  do
248        case OP_TYPEPLUS:        case OP_TYPEPLUS:
249        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
250        tcode++;        tcode++;
       try_next = TRUE;  
251        break;        break;
252    
253        case OP_TYPEEXACT:        case OP_TYPEEXACT:
254        tcode += 3;        tcode += 3;
       try_next = TRUE;  
255        break;        break;
256    
257        /* Zero or more repeats of character types set the bits and then        /* Zero or more repeats of character types set the bits and then
# Line 238  do Line 267  do
267        case OP_TYPEMINQUERY:        case OP_TYPEMINQUERY:
268        switch(tcode[1])        switch(tcode[1])
269          {          {
270            case OP_ANY:
271            return FALSE;
272    
273          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
274          for (c = 0; c < 32; c++)          for (c = 0; c < 32; c++)
275            start_bits[c] |= ~cd->cbits[c+cbit_digit];            start_bits[c] |= ~cd->cbits[c+cbit_digit];
# Line 260  do Line 292  do
292    
293          case OP_NOT_WORDCHAR:          case OP_NOT_WORDCHAR:
294          for (c = 0; c < 32; c++)          for (c = 0; c < 32; c++)
295            start_bits[c] |= ~(cd->cbits[c] | cd->cbits[c+cbit_word]);            start_bits[c] |= ~cd->cbits[c+cbit_word];
296          break;          break;
297    
298          case OP_WORDCHAR:          case OP_WORDCHAR:
299          for (c = 0; c < 32; c++)          for (c = 0; c < 32; c++)
300            start_bits[c] |= (cd->cbits[c] | cd->cbits[c+cbit_word]);            start_bits[c] |= cd->cbits[c+cbit_word];
301          break;          break;
302          }          }
303    
304        tcode += 2;        tcode += 2;
       try_next = TRUE;  
305        break;        break;
306    
307        /* Character class: set the bits and either carry on or not,        /* Character class where all the information is in a bit map: set the
308        according to the repeat count. */        bits and either carry on or not, according to the repeat count. If it was
309          a negative class, and we are operating with UTF-8 characters, any byte
310          with a value >= 0xc4 is a potentially valid starter because it starts a
311          character with a value > 255. */
312    
313          case OP_NCLASS:
314          if (utf8)
315            {
316            start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
317            memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
318            }
319          /* Fall through */
320    
321        case OP_CLASS:        case OP_CLASS:
322          {          {
323          tcode++;          tcode++;
324          for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];  
325            /* In UTF-8 mode, the bits in a bit map correspond to character
326            values, not to byte values. However, the bit map we are constructing is
327            for byte values. So we have to do a conversion for characters whose
328            value is > 127. In fact, there are only two possible starting bytes for
329            characters in the range 128 - 255. */
330    
331            if (utf8)
332              {
333              for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
334              for (c = 128; c < 256; c++)
335                {
336                if ((tcode[c/8] && (1 << (c&7))) != 0)
337                  {
338                  int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
339                  start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
340                  c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
341                  }
342                }
343              }
344    
345            /* In non-UTF-8 mode, the two bit maps are completely compatible. */
346    
347            else
348              {
349              for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
350              }
351    
352            /* Advance past the bit map, and act on what follows */
353    
354          tcode += 32;          tcode += 32;
355          switch (*tcode)          switch (*tcode)
356            {            {
# Line 288  do Line 359  do
359            case OP_CRQUERY:            case OP_CRQUERY:
360            case OP_CRMINQUERY:            case OP_CRMINQUERY:
361            tcode++;            tcode++;
           try_next = TRUE;  
362            break;            break;
363    
364            case OP_CRRANGE:            case OP_CRRANGE:
365            case OP_CRMINRANGE:            case OP_CRMINRANGE:
366            if (((tcode[1] << 8) + tcode[2]) == 0)            if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
367              {              else try_next = FALSE;
368              tcode += 5;            break;
369              try_next = TRUE;  
370              }            default:
371              try_next = FALSE;
372            break;            break;
373            }            }
374          }          }
375        break; /* End of class handling */        break; /* End of bitmap class handling */
376    
377        }      /* End of switch */        }      /* End of switch */
378      }        /* End of try_next loop */      }        /* End of try_next loop */
379    
380    code += (code[1] << 8) + code[2];   /* Advance to next branch */    code += GET(code, 1);   /* Advance to next branch */
381    }    }
382  while (*code == OP_ALT);  while (*code == OP_ALT);
383  return TRUE;  return TRUE;
# Line 328  Arguments: Line 399  Arguments:
399    errorptr  points to where to place error messages;    errorptr  points to where to place error messages;
400              set NULL unless error              set NULL unless error
401    
402  Returns:    pointer to a pcre_extra block,  Returns:    pointer to a pcre_extra block, with study_data filled in and the
403                  appropriate flag set;
404              NULL on error or if no optimization possible              NULL on error or if no optimization possible
405  */  */
406    
407  pcre_extra *  EXPORT pcre_extra *
408  pcre_study(const pcre *external_re, int options, const char **errorptr)  pcre_study(const pcre *external_re, int options, const char **errorptr)
409  {  {
410  uschar start_bits[32];  uschar start_bits[32];
411  real_pcre_extra *extra;  pcre_extra *extra;
412    pcre_study_data *study;
413    const uschar *tables;
414  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
415    uschar *code = (uschar *)re + re->name_table_offset +
416      (re->name_count * re->name_entry_size);
417  compile_data compile_block;  compile_data compile_block;
418    
419  *errorptr = NULL;  *errorptr = NULL;
# Line 354  if ((options & ~PUBLIC_STUDY_OPTIONS) != Line 430  if ((options & ~PUBLIC_STUDY_OPTIONS) !=
430    return NULL;    return NULL;
431    }    }
432    
433  /* For an anchored pattern, or an unchored pattern that has a first char, or a  /* For an anchored pattern, or an unanchored pattern that has a first char, or
434  multiline pattern that matches only at "line starts", no further processing at  a multiline pattern that matches only at "line starts", no further processing
435  present. */  at present. */
436    
437  if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)  if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
438    return NULL;    return NULL;
439    
440  /* Set the character tables in the block which is passed around */  /* Set the character tables in the block that is passed around */
441    
442  compile_block.lcc = re->tables + lcc_offset;  tables = re->tables;
443  compile_block.fcc = re->tables + fcc_offset;  if (tables == NULL)
444  compile_block.cbits = re->tables + cbits_offset;    (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, &tables);
445  compile_block.ctypes = re->tables + ctypes_offset;  
446    compile_block.lcc = tables + lcc_offset;
447    compile_block.fcc = tables + fcc_offset;
448    compile_block.cbits = tables + cbits_offset;
449    compile_block.ctypes = tables + ctypes_offset;
450    
451  /* See if we can find a fixed set of initial characters for the pattern. */  /* See if we can find a fixed set of initial characters for the pattern. */
452    
453  memset(start_bits, 0, 32 * sizeof(uschar));  memset(start_bits, 0, 32 * sizeof(uschar));
454  if (!set_start_bits(re->code, start_bits, (re->options & PCRE_CASELESS) != 0,  if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
455    &compile_block)) return NULL;    (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;
456    
457  /* Get an "extra" block and put the information therein. */  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
458    the latter, which is pointed to by the former, which may also get additional
459    data set later by the calling program. At the moment, the size of
460    pcre_study_data is fixed. We nevertheless save it in a field for returning via
461    the pcre_fullinfo() function so that if it becomes variable in the future, we
462    don't have to change that code. */
463    
464  extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));  extra = (pcre_extra *)(pcre_malloc)
465      (sizeof(pcre_extra) + sizeof(pcre_study_data));
466    
467  if (extra == NULL)  if (extra == NULL)
468    {    {
# Line 384  if (extra == NULL) Line 470  if (extra == NULL)
470    return NULL;    return NULL;
471    }    }
472    
473  extra->options = PCRE_STUDY_MAPPED;  study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
474  memcpy(extra->start_bits, start_bits, sizeof(start_bits));  extra->flags = PCRE_EXTRA_STUDY_DATA;
475    extra->study_data = study;
476    
477    study->size = sizeof(pcre_study_data);
478    study->options = PCRE_STUDY_MAPPED;
479    memcpy(study->start_bits, start_bits, sizeof(start_bits));
480    
481  return (pcre_extra *)extra;  return extra;
482  }  }
483    
484  /* End of study.c */  /* End of study.c */

Legend:
Removed from v.27  
changed lines
  Added in v.75

  ViewVC Help
Powered by ViewVC 1.1.5