/[pcre]/code/trunk/study.c
ViewVC logotype

Diff of /code/trunk/study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 27 by nigel, Sat Feb 24 21:38:49 2007 UTC revision 71 by nigel, Sat Feb 24 21:40:24 2007 UTC
# Line 9  the file Tech.Notes for some information Line 9  the file Tech.Notes for some information
9    
10  Written by: Philip Hazel <ph10@cam.ac.uk>  Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12             Copyright (c) 1997-1999 University of Cambridge             Copyright (c) 1997-2003 University of Cambridge
13    
14  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
15  Permission is granted to anyone to use this software for any purpose on any  Permission is granted to anyone to use this software for any purpose on any
# Line 25  restrictions: Line 25  restrictions:
25    
26  3. Altered versions must be plainly marked as such, and must not be  3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.     misrepresented as being the original software.
28    
29    4. If PCRE is embedded in any software that is released under the GNU
30       General Purpose Licence (GPL), then the terms of that licence shall
31       supersede any condition above with which it is incompatible.
32  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
33  */  */
34    
# Line 74  Arguments: Line 78  Arguments:
78    code         points to an expression    code         points to an expression
79    start_bits   points to a 32-byte table, initialized to 0    start_bits   points to a 32-byte table, initialized to 0
80    caseless     the current state of the caseless flag    caseless     the current state of the caseless flag
81      utf8         TRUE if in UTF-8 mode
82    cd           the block with char table pointers    cd           the block with char table pointers
83    
84  Returns:       TRUE if table built, FALSE otherwise  Returns:       TRUE if table built, FALSE otherwise
# Line 81  Returns:       TRUE if table built, FALS Line 86  Returns:       TRUE if table built, FALS
86    
87  static BOOL  static BOOL
88  set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,  set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
89    compile_data *cd)    BOOL utf8, compile_data *cd)
90  {  {
91  register int c;  register int c;
92    
# Line 95  volatile int dummy; Line 100  volatile int dummy;
100    
101  do  do
102    {    {
103    const uschar *tcode = code + 3;    const uschar *tcode = code + 1 + LINK_SIZE;
104    BOOL try_next = TRUE;    BOOL try_next = TRUE;
105    
106    while (try_next)    while (try_next)
107      {      {
     try_next = FALSE;  
   
108      /* If a branch starts with a bracket or a positive lookahead assertion,      /* If a branch starts with a bracket or a positive lookahead assertion,
109      recurse to set bits from within them. That's all for this branch. */      recurse to set bits from within them. That's all for this branch. */
110    
111      if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)      if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
112        {        {
113        if (!set_start_bits(tcode, start_bits, caseless, cd))        if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))
114          return FALSE;          return FALSE;
115          try_next = FALSE;
116        }        }
117    
118      else switch(*tcode)      else switch(*tcode)
# Line 116  do Line 120  do
120        default:        default:
121        return FALSE;        return FALSE;
122    
123          /* Skip over callout */
124    
125          case OP_CALLOUT:
126          tcode += 2;
127          break;
128    
129          /* Skip over extended extraction bracket number */
130    
131          case OP_BRANUMBER:
132          tcode += 3;
133          break;
134    
135        /* Skip over lookbehind and negative lookahead assertions */        /* Skip over lookbehind and negative lookahead assertions */
136    
137        case OP_ASSERT_NOT:        case OP_ASSERT_NOT:
138        case OP_ASSERTBACK:        case OP_ASSERTBACK:
139        case OP_ASSERTBACK_NOT:        case OP_ASSERTBACK_NOT:
140        try_next = TRUE;        do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
141        do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);        tcode += 1+LINK_SIZE;
       tcode += 3;  
142        break;        break;
143    
144        /* Skip over an option setting, changing the caseless flag */        /* Skip over an option setting, changing the caseless flag */
# Line 131  do Line 146  do
146        case OP_OPT:        case OP_OPT:
147        caseless = (tcode[1] & PCRE_CASELESS) != 0;        caseless = (tcode[1] & PCRE_CASELESS) != 0;
148        tcode += 2;        tcode += 2;
       try_next = TRUE;  
149        break;        break;
150    
151        /* BRAZERO does the bracket, but carries on. */        /* BRAZERO does the bracket, but carries on. */
152    
153        case OP_BRAZERO:        case OP_BRAZERO:
154        case OP_BRAMINZERO:        case OP_BRAMINZERO:
155        if (!set_start_bits(++tcode, start_bits, caseless, cd))        if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))
156          return FALSE;          return FALSE;
157        dummy = 1;        dummy = 1;
158        do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);        do tcode += GET(tcode,1); while (*tcode == OP_ALT);
159        tcode += 3;        tcode += 1+LINK_SIZE;
       try_next = TRUE;  
160        break;        break;
161    
162        /* Single-char * or ? sets the bit and tries the next item */        /* Single-char * or ? sets the bit and tries the next item */
# Line 154  do Line 167  do
167        case OP_MINQUERY:        case OP_MINQUERY:
168        set_bit(start_bits, tcode[1], caseless, cd);        set_bit(start_bits, tcode[1], caseless, cd);
169        tcode += 2;        tcode += 2;
170        try_next = TRUE;  #ifdef SUPPORT_UTF8
171          if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
172    #endif
173        break;        break;
174    
175        /* Single-char upto sets the bit and tries the next */        /* Single-char upto sets the bit and tries the next */
# Line 163  do Line 178  do
178        case OP_MINUPTO:        case OP_MINUPTO:
179        set_bit(start_bits, tcode[3], caseless, cd);        set_bit(start_bits, tcode[3], caseless, cd);
180        tcode += 4;        tcode += 4;
181        try_next = TRUE;  #ifdef SUPPORT_UTF8
182          if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
183    #endif
184        break;        break;
185    
186        /* At least one single char sets the bit and stops */        /* At least one single char sets the bit and stops */
# Line 177  do Line 194  do
194        case OP_PLUS:        case OP_PLUS:
195        case OP_MINPLUS:        case OP_MINPLUS:
196        set_bit(start_bits, tcode[1], caseless, cd);        set_bit(start_bits, tcode[1], caseless, cd);
197          try_next = FALSE;
198        break;        break;
199    
200        /* Single character type sets the bits and stops */        /* Single character type sets the bits and stops */
# Line 184  do Line 202  do
202        case OP_NOT_DIGIT:        case OP_NOT_DIGIT:
203        for (c = 0; c < 32; c++)        for (c = 0; c < 32; c++)
204          start_bits[c] |= ~cd->cbits[c+cbit_digit];          start_bits[c] |= ~cd->cbits[c+cbit_digit];
205          try_next = FALSE;
206        break;        break;
207    
208        case OP_DIGIT:        case OP_DIGIT:
209        for (c = 0; c < 32; c++)        for (c = 0; c < 32; c++)
210          start_bits[c] |= cd->cbits[c+cbit_digit];          start_bits[c] |= cd->cbits[c+cbit_digit];
211          try_next = FALSE;
212        break;        break;
213    
214        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
215        for (c = 0; c < 32; c++)        for (c = 0; c < 32; c++)
216          start_bits[c] |= ~cd->cbits[c+cbit_space];          start_bits[c] |= ~cd->cbits[c+cbit_space];
217          try_next = FALSE;
218        break;        break;
219    
220        case OP_WHITESPACE:        case OP_WHITESPACE:
221        for (c = 0; c < 32; c++)        for (c = 0; c < 32; c++)
222          start_bits[c] |= cd->cbits[c+cbit_space];          start_bits[c] |= cd->cbits[c+cbit_space];
223          try_next = FALSE;
224        break;        break;
225    
226        case OP_NOT_WORDCHAR:        case OP_NOT_WORDCHAR:
227        for (c = 0; c < 32; c++)        for (c = 0; c < 32; c++)
228          start_bits[c] |= ~(cd->cbits[c] | cd->cbits[c+cbit_word]);          start_bits[c] |= ~cd->cbits[c+cbit_word];
229          try_next = FALSE;
230        break;        break;
231    
232        case OP_WORDCHAR:        case OP_WORDCHAR:
233        for (c = 0; c < 32; c++)        for (c = 0; c < 32; c++)
234          start_bits[c] |= (cd->cbits[c] | cd->cbits[c+cbit_word]);          start_bits[c] |= cd->cbits[c+cbit_word];
235          try_next = FALSE;
236        break;        break;
237    
238        /* One or more character type fudges the pointer and restarts, knowing        /* One or more character type fudges the pointer and restarts, knowing
# Line 217  do Line 241  do
241        case OP_TYPEPLUS:        case OP_TYPEPLUS:
242        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
243        tcode++;        tcode++;
       try_next = TRUE;  
244        break;        break;
245    
246        case OP_TYPEEXACT:        case OP_TYPEEXACT:
247        tcode += 3;        tcode += 3;
       try_next = TRUE;  
248        break;        break;
249    
250        /* Zero or more repeats of character types set the bits and then        /* Zero or more repeats of character types set the bits and then
# Line 260  do Line 282  do
282    
283          case OP_NOT_WORDCHAR:          case OP_NOT_WORDCHAR:
284          for (c = 0; c < 32; c++)          for (c = 0; c < 32; c++)
285            start_bits[c] |= ~(cd->cbits[c] | cd->cbits[c+cbit_word]);            start_bits[c] |= ~cd->cbits[c+cbit_word];
286          break;          break;
287    
288          case OP_WORDCHAR:          case OP_WORDCHAR:
289          for (c = 0; c < 32; c++)          for (c = 0; c < 32; c++)
290            start_bits[c] |= (cd->cbits[c] | cd->cbits[c+cbit_word]);            start_bits[c] |= cd->cbits[c+cbit_word];
291          break;          break;
292          }          }
293    
294        tcode += 2;        tcode += 2;
       try_next = TRUE;  
295        break;        break;
296    
297        /* Character class: set the bits and either carry on or not,        /* Character class where all the information is in a bit map: set the
298        according to the repeat count. */        bits and either carry on or not, according to the repeat count. If it was
299          a negative class, and we are operating with UTF-8 characters, any byte
300          with a value >= 0xc4 is a potentially valid starter because it starts a
301          character with a value > 255. */
302    
303          case OP_NCLASS:
304          if (utf8)
305            {
306            start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
307            memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
308            }
309          /* Fall through */
310    
311        case OP_CLASS:        case OP_CLASS:
312          {          {
313          tcode++;          tcode++;
314          for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];  
315            /* In UTF-8 mode, the bits in a bit map correspond to character
316            values, not to byte values. However, the bit map we are constructing is
317            for byte values. So we have to do a conversion for characters whose
318            value is > 127. In fact, there are only two possible starting bytes for
319            characters in the range 128 - 255. */
320    
321            if (utf8)
322              {
323              for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
324              for (c = 128; c < 256; c++)
325                {
326                if ((tcode[c/8] && (1 << (c&7))) != 0)
327                  {
328                  int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
329                  start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
330                  c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
331                  }
332                }
333              }
334    
335            /* In non-UTF-8 mode, the two bit maps are completely compatible. */
336    
337            else
338              {
339              for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
340              }
341    
342            /* Advance past the bit map, and act on what follows */
343    
344          tcode += 32;          tcode += 32;
345          switch (*tcode)          switch (*tcode)
346            {            {
# Line 288  do Line 349  do
349            case OP_CRQUERY:            case OP_CRQUERY:
350            case OP_CRMINQUERY:            case OP_CRMINQUERY:
351            tcode++;            tcode++;
           try_next = TRUE;  
352            break;            break;
353    
354            case OP_CRRANGE:            case OP_CRRANGE:
355            case OP_CRMINRANGE:            case OP_CRMINRANGE:
356            if (((tcode[1] << 8) + tcode[2]) == 0)            if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
357              {              else try_next = FALSE;
358              tcode += 5;            break;
359              try_next = TRUE;  
360              }            default:
361              try_next = FALSE;
362            break;            break;
363            }            }
364          }          }
365        break; /* End of class handling */        break; /* End of bitmap class handling */
366    
367        }      /* End of switch */        }      /* End of switch */
368      }        /* End of try_next loop */      }        /* End of try_next loop */
369    
370    code += (code[1] << 8) + code[2];   /* Advance to next branch */    code += GET(code, 1);   /* Advance to next branch */
371    }    }
372  while (*code == OP_ALT);  while (*code == OP_ALT);
373  return TRUE;  return TRUE;
# Line 328  Arguments: Line 389  Arguments:
389    errorptr  points to where to place error messages;    errorptr  points to where to place error messages;
390              set NULL unless error              set NULL unless error
391    
392  Returns:    pointer to a pcre_extra block,  Returns:    pointer to a pcre_extra block, with study_data filled in and the
393                  appropriate flag set;
394              NULL on error or if no optimization possible              NULL on error or if no optimization possible
395  */  */
396    
# Line 336  pcre_extra * Line 398  pcre_extra *
398  pcre_study(const pcre *external_re, int options, const char **errorptr)  pcre_study(const pcre *external_re, int options, const char **errorptr)
399  {  {
400  uschar start_bits[32];  uschar start_bits[32];
401  real_pcre_extra *extra;  pcre_extra *extra;
402    pcre_study_data *study;
403  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
404    uschar *code = (uschar *)re + sizeof(real_pcre) +
405      (re->name_count * re->name_entry_size);
406  compile_data compile_block;  compile_data compile_block;
407    
408  *errorptr = NULL;  *errorptr = NULL;
# Line 354  if ((options & ~PUBLIC_STUDY_OPTIONS) != Line 419  if ((options & ~PUBLIC_STUDY_OPTIONS) !=
419    return NULL;    return NULL;
420    }    }
421    
422  /* For an anchored pattern, or an unchored pattern that has a first char, or a  /* For an anchored pattern, or an unanchored pattern that has a first char, or
423  multiline pattern that matches only at "line starts", no further processing at  a multiline pattern that matches only at "line starts", no further processing
424  present. */  at present. */
425    
426  if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)  if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
427    return NULL;    return NULL;
# Line 371  compile_block.ctypes = re->tables + ctyp Line 436  compile_block.ctypes = re->tables + ctyp
436  /* See if we can find a fixed set of initial characters for the pattern. */  /* See if we can find a fixed set of initial characters for the pattern. */
437    
438  memset(start_bits, 0, 32 * sizeof(uschar));  memset(start_bits, 0, 32 * sizeof(uschar));
439  if (!set_start_bits(re->code, start_bits, (re->options & PCRE_CASELESS) != 0,  if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
440    &compile_block)) return NULL;    (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;
441    
442  /* Get an "extra" block and put the information therein. */  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
443    the latter, which is pointed to by the former, which may also get additional
444    data set later by the calling program. At the moment, the size of
445    pcre_study_data is fixed. We nevertheless save it in a field for returning via
446    the pcre_fullinfo() function so that if it becomes variable in the future, we
447    don't have to change that code. */
448    
449  extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));  extra = (pcre_extra *)(pcre_malloc)
450      (sizeof(pcre_extra) + sizeof(pcre_study_data));
451    
452  if (extra == NULL)  if (extra == NULL)
453    {    {
# Line 384  if (extra == NULL) Line 455  if (extra == NULL)
455    return NULL;    return NULL;
456    }    }
457    
458  extra->options = PCRE_STUDY_MAPPED;  study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
459  memcpy(extra->start_bits, start_bits, sizeof(start_bits));  extra->flags = PCRE_EXTRA_STUDY_DATA;
460    extra->study_data = study;
461    
462    study->size = sizeof(pcre_study_data);
463    study->options = PCRE_STUDY_MAPPED;
464    memcpy(study->start_bits, start_bits, sizeof(start_bits));
465    
466  return (pcre_extra *)extra;  return extra;
467  }  }
468    
469  /* End of study.c */  /* End of study.c */

Legend:
Removed from v.27  
changed lines
  Added in v.71

  ViewVC Help
Powered by ViewVC 1.1.5