/[pcre]/code/trunk/study.c
ViewVC logotype

Diff of /code/trunk/study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 7 by nigel, Sat Feb 24 21:38:09 2007 UTC revision 73 by nigel, Sat Feb 24 21:40:30 2007 UTC
# Line 9  the file Tech.Notes for some information Line 9  the file Tech.Notes for some information
9    
10  Written by: Philip Hazel <ph10@cam.ac.uk>  Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12             Copyright (c) 1997 University of Cambridge             Copyright (c) 1997-2003 University of Cambridge
13    
14  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
15  Permission is granted to anyone to use this software for any purpose on any  Permission is granted to anyone to use this software for any purpose on any
# Line 25  restrictions: Line 25  restrictions:
25    
26  3. Altered versions must be plainly marked as such, and must not be  3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.     misrepresented as being the original software.
28    
29    4. If PCRE is embedded in any software that is released under the GNU
30       General Purpose Licence (GPL), then the terms of that licence shall
31       supersede any condition above with which it is incompatible.
32  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
33  */  */
34    
# Line 37  the external pcre header. */ Line 41  the external pcre header. */
41    
42    
43  /*************************************************  /*************************************************
44    *      Set a bit and maybe its alternate case    *
45    *************************************************/
46    
47    /* Given a character, set its bit in the table, and also the bit for the other
48    version of a letter if we are caseless.
49    
50    Arguments:
51      start_bits    points to the bit map
52      c             is the character
53      caseless      the caseless flag
54      cd            the block with char table pointers
55    
56    Returns:        nothing
57    */
58    
59    static void
60    set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd)
61    {
62    start_bits[c/8] |= (1 << (c&7));
63    if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
64      start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
65    }
66    
67    
68    
69    /*************************************************
70  *          Create bitmap of starting chars       *  *          Create bitmap of starting chars       *
71  *************************************************/  *************************************************/
72    
# Line 47  goes by, we may be able to get more clev Line 77  goes by, we may be able to get more clev
77  Arguments:  Arguments:
78    code         points to an expression    code         points to an expression
79    start_bits   points to a 32-byte table, initialized to 0    start_bits   points to a 32-byte table, initialized to 0
80      caseless     the current state of the caseless flag
81      utf8         TRUE if in UTF-8 mode
82      cd           the block with char table pointers
83    
84  Returns:       TRUE if table built, FALSE otherwise  Returns:       TRUE if table built, FALSE otherwise
85  */  */
86    
87  static BOOL  static BOOL
88  set_start_bits(const uschar *code, uschar *start_bits)  set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
89      BOOL utf8, compile_data *cd)
90  {  {
91  register int c;  register int c;
92    
93    /* This next statement and the later reference to dummy are here in order to
94    trick the optimizer of the IBM C compiler for OS/2 into generating correct
95    code. Apparently IBM isn't going to fix the problem, and we would rather not
96    disable optimization (in this module it actually makes a big difference, and
97    the pcre module can use all the optimization it can get). */
98    
99    volatile int dummy;
100    
101  do  do
102    {    {
103    const uschar *tcode = code + 3;    const uschar *tcode = code + 1 + LINK_SIZE;
104    BOOL try_next = TRUE;    BOOL try_next = TRUE;
105    
106    while (try_next)    while (try_next)
107      {      {
108      try_next = FALSE;      /* If a branch starts with a bracket or a positive lookahead assertion,
109        recurse to set bits from within them. That's all for this branch. */
110    
111      if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)      if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
112        {        {
113        if (!set_start_bits(tcode, start_bits)) return FALSE;        if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))
114            return FALSE;
115          try_next = FALSE;
116        }        }
117    
118      else switch(*tcode)      else switch(*tcode)
# Line 75  do Line 120  do
120        default:        default:
121        return FALSE;        return FALSE;
122    
123          /* Skip over callout */
124    
125          case OP_CALLOUT:
126          tcode += 2;
127          break;
128    
129          /* Skip over extended extraction bracket number */
130    
131          case OP_BRANUMBER:
132          tcode += 3;
133          break;
134    
135          /* Skip over lookbehind and negative lookahead assertions */
136    
137          case OP_ASSERT_NOT:
138          case OP_ASSERTBACK:
139          case OP_ASSERTBACK_NOT:
140          do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
141          tcode += 1+LINK_SIZE;
142          break;
143    
144          /* Skip over an option setting, changing the caseless flag */
145    
146          case OP_OPT:
147          caseless = (tcode[1] & PCRE_CASELESS) != 0;
148          tcode += 2;
149          break;
150    
151        /* BRAZERO does the bracket, but carries on. */        /* BRAZERO does the bracket, but carries on. */
152    
153        case OP_BRAZERO:        case OP_BRAZERO:
154        case OP_BRAMINZERO:        case OP_BRAMINZERO:
155        if (!set_start_bits(++tcode, start_bits)) return FALSE;        if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))
156        do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);          return FALSE;
157        tcode += 3;        dummy = 1;
158        try_next = TRUE;        do tcode += GET(tcode,1); while (*tcode == OP_ALT);
159          tcode += 1+LINK_SIZE;
160        break;        break;
161    
162        /* Single-char * or ? sets the bit and tries the next item */        /* Single-char * or ? sets the bit and tries the next item */
# Line 91  do Line 165  do
165        case OP_MINSTAR:        case OP_MINSTAR:
166        case OP_QUERY:        case OP_QUERY:
167        case OP_MINQUERY:        case OP_MINQUERY:
168        start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));        set_bit(start_bits, tcode[1], caseless, cd);
169        tcode += 2;        tcode += 2;
170        try_next = TRUE;  #ifdef SUPPORT_UTF8
171          if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
172    #endif
173        break;        break;
174    
175        /* Single-char upto sets the bit and tries the next */        /* Single-char upto sets the bit and tries the next */
176    
177        case OP_UPTO:        case OP_UPTO:
178        case OP_MINUPTO:        case OP_MINUPTO:
179        start_bits[tcode[3]/8] |= (1 << (tcode[3]&7));        set_bit(start_bits, tcode[3], caseless, cd);
180        tcode += 4;        tcode += 4;
181        try_next = TRUE;  #ifdef SUPPORT_UTF8
182          if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
183    #endif
184        break;        break;
185    
186        /* At least one single char sets the bit and stops */        /* At least one single char sets the bit and stops */
# Line 115  do Line 193  do
193    
194        case OP_PLUS:        case OP_PLUS:
195        case OP_MINPLUS:        case OP_MINPLUS:
196        start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));        set_bit(start_bits, tcode[1], caseless, cd);
197          try_next = FALSE;
198        break;        break;
199    
200        /* Single character type sets the bits and stops */        /* Single character type sets the bits and stops */
201    
202        case OP_NOT_DIGIT:        case OP_NOT_DIGIT:
203        for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];        for (c = 0; c < 32; c++)
204            start_bits[c] |= ~cd->cbits[c+cbit_digit];
205          try_next = FALSE;
206        break;        break;
207    
208        case OP_DIGIT:        case OP_DIGIT:
209        for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];        for (c = 0; c < 32; c++)
210            start_bits[c] |= cd->cbits[c+cbit_digit];
211          try_next = FALSE;
212        break;        break;
213    
214        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
215        for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];        for (c = 0; c < 32; c++)
216            start_bits[c] |= ~cd->cbits[c+cbit_space];
217          try_next = FALSE;
218        break;        break;
219    
220        case OP_WHITESPACE:        case OP_WHITESPACE:
221        for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];        for (c = 0; c < 32; c++)
222            start_bits[c] |= cd->cbits[c+cbit_space];
223          try_next = FALSE;
224        break;        break;
225    
226        case OP_NOT_WORDCHAR:        case OP_NOT_WORDCHAR:
227        for (c = 0; c < 32; c++)        for (c = 0; c < 32; c++)
228          start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);          start_bits[c] |= ~cd->cbits[c+cbit_word];
229          try_next = FALSE;
230        break;        break;
231    
232        case OP_WORDCHAR:        case OP_WORDCHAR:
233        for (c = 0; c < 32; c++)        for (c = 0; c < 32; c++)
234          start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);          start_bits[c] |= cd->cbits[c+cbit_word];
235          try_next = FALSE;
236        break;        break;
237    
238        /* One or more character type fudges the pointer and restarts, knowing        /* One or more character type fudges the pointer and restarts, knowing
# Line 152  do Line 241  do
241        case OP_TYPEPLUS:        case OP_TYPEPLUS:
242        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
243        tcode++;        tcode++;
       try_next = TRUE;  
244        break;        break;
245    
246        case OP_TYPEEXACT:        case OP_TYPEEXACT:
247        tcode += 3;        tcode += 3;
       try_next = TRUE;  
248        break;        break;
249    
250        /* Zero or more repeats of character types set the bits and then        /* Zero or more repeats of character types set the bits and then
# Line 173  do Line 260  do
260        case OP_TYPEMINQUERY:        case OP_TYPEMINQUERY:
261        switch(tcode[1])        switch(tcode[1])
262          {          {
263            case OP_ANY:
264            return FALSE;
265    
266          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
267          for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];          for (c = 0; c < 32; c++)
268              start_bits[c] |= ~cd->cbits[c+cbit_digit];
269          break;          break;
270    
271          case OP_DIGIT:          case OP_DIGIT:
272          for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];          for (c = 0; c < 32; c++)
273              start_bits[c] |= cd->cbits[c+cbit_digit];
274          break;          break;
275    
276          case OP_NOT_WHITESPACE:          case OP_NOT_WHITESPACE:
277          for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];          for (c = 0; c < 32; c++)
278              start_bits[c] |= ~cd->cbits[c+cbit_space];
279          break;          break;
280    
281          case OP_WHITESPACE:          case OP_WHITESPACE:
282          for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];          for (c = 0; c < 32; c++)
283              start_bits[c] |= cd->cbits[c+cbit_space];
284          break;          break;
285    
286          case OP_NOT_WORDCHAR:          case OP_NOT_WORDCHAR:
287          for (c = 0; c < 32; c++)          for (c = 0; c < 32; c++)
288            start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);            start_bits[c] |= ~cd->cbits[c+cbit_word];
289          break;          break;
290    
291          case OP_WORDCHAR:          case OP_WORDCHAR:
292          for (c = 0; c < 32; c++)          for (c = 0; c < 32; c++)
293            start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);            start_bits[c] |= cd->cbits[c+cbit_word];
294          break;          break;
295          }          }
296    
297        tcode += 2;        tcode += 2;
       try_next = TRUE;  
298        break;        break;
299    
300        /* Character class: set the bits and either carry on or not,        /* Character class where all the information is in a bit map: set the
301        according to the repeat count. */        bits and either carry on or not, according to the repeat count. If it was
302          a negative class, and we are operating with UTF-8 characters, any byte
303          with a value >= 0xc4 is a potentially valid starter because it starts a
304          character with a value > 255. */
305    
306          case OP_NCLASS:
307          if (utf8)
308            {
309            start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
310            memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
311            }
312          /* Fall through */
313    
314        case OP_CLASS:        case OP_CLASS:
315          {          {
316          tcode++;          tcode++;
317          for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];  
318            /* In UTF-8 mode, the bits in a bit map correspond to character
319            values, not to byte values. However, the bit map we are constructing is
320            for byte values. So we have to do a conversion for characters whose
321            value is > 127. In fact, there are only two possible starting bytes for
322            characters in the range 128 - 255. */
323    
324            if (utf8)
325              {
326              for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
327              for (c = 128; c < 256; c++)
328                {
329                if ((tcode[c/8] && (1 << (c&7))) != 0)
330                  {
331                  int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
332                  start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
333                  c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
334                  }
335                }
336              }
337    
338            /* In non-UTF-8 mode, the two bit maps are completely compatible. */
339    
340            else
341              {
342              for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
343              }
344    
345            /* Advance past the bit map, and act on what follows */
346    
347          tcode += 32;          tcode += 32;
348          switch (*tcode)          switch (*tcode)
349            {            {
# Line 219  do Line 352  do
352            case OP_CRQUERY:            case OP_CRQUERY:
353            case OP_CRMINQUERY:            case OP_CRMINQUERY:
354            tcode++;            tcode++;
           try_next = TRUE;  
355            break;            break;
356    
357            case OP_CRRANGE:            case OP_CRRANGE:
358            case OP_CRMINRANGE:            case OP_CRMINRANGE:
359            if (((tcode[1] << 8) + tcode[2]) == 0)            if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
360              {              else try_next = FALSE;
361              tcode += 5;            break;
362              try_next = TRUE;  
363              }            default:
364              try_next = FALSE;
365            break;            break;
366            }            }
367          }          }
368        break; /* End of class handling */        break; /* End of bitmap class handling */
369    
370        }      /* End of switch */        }      /* End of switch */
371      }        /* End of try_next loop */      }        /* End of try_next loop */
372    
373    code += (code[1] << 8) + code[2];   /* Advance to next branch */    code += GET(code, 1);   /* Advance to next branch */
374    }    }
375  while (*code == OP_ALT);  while (*code == OP_ALT);
376  return TRUE;  return TRUE;
# Line 259  Arguments: Line 392  Arguments:
392    errorptr  points to where to place error messages;    errorptr  points to where to place error messages;
393              set NULL unless error              set NULL unless error
394    
395  Returns:    pointer to a pcre_extra block,  Returns:    pointer to a pcre_extra block, with study_data filled in and the
396                  appropriate flag set;
397              NULL on error or if no optimization possible              NULL on error or if no optimization possible
398  */  */
399    
400  pcre_extra *  EXPORT pcre_extra *
401  pcre_study(const pcre *external_re, int options, const char **errorptr)  pcre_study(const pcre *external_re, int options, const char **errorptr)
402  {  {
 BOOL caseless;  
403  uschar start_bits[32];  uschar start_bits[32];
404  real_pcre_extra *extra;  pcre_extra *extra;
405    pcre_study_data *study;
406  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
407    uschar *code = (uschar *)re + sizeof(real_pcre) +
408      (re->name_count * re->name_entry_size);
409    compile_data compile_block;
410    
411  *errorptr = NULL;  *errorptr = NULL;
412    
# Line 285  if ((options & ~PUBLIC_STUDY_OPTIONS) != Line 422  if ((options & ~PUBLIC_STUDY_OPTIONS) !=
422    return NULL;    return NULL;
423    }    }
424    
425  /* Caseless can either be from the compiled regex or from options. */  /* For an anchored pattern, or an unanchored pattern that has a first char, or
426    a multiline pattern that matches only at "line starts", no further processing
427  caseless = ((re->options | options) & PCRE_CASELESS) != 0;  at present. */
   
 /* For an anchored pattern, or an unchored pattern that has a first char, or a  
 multiline pattern that matches only at "line starts", no further processing at  
 present. */  
428    
429  if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)  if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
430    return NULL;    return NULL;
431    
432  /* See if we can find a fixed set of initial characters for the pattern. */  /* Set the character tables in the block which is passed around */
433    
434  memset(start_bits, 0, 32 * sizeof(uschar));  compile_block.lcc = re->tables + lcc_offset;
435  if (!set_start_bits(re->code, start_bits)) return NULL;  compile_block.fcc = re->tables + fcc_offset;
436    compile_block.cbits = re->tables + cbits_offset;
437    compile_block.ctypes = re->tables + ctypes_offset;
438    
439  /* If this studying is caseless, scan the created bit map and duplicate the  /* See if we can find a fixed set of initial characters for the pattern. */
 bits for any letters. */  
440    
441  if (caseless)  memset(start_bits, 0, 32 * sizeof(uschar));
442    {  if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
443    register int c;    (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;
   for (c = 0; c < 256; c++)  
     {  
     if ((start_bits[c/8] & (1 << (c&7))) != 0 &&  
         (pcre_ctypes[c] & ctype_letter) != 0)  
       {  
       int d = pcre_fcc[c];  
       start_bits[d/8] |= (1 << (d&7));  
       }  
     }  
   }  
444    
445  /* Get an "extra" block and put the information therein. */  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
446    the latter, which is pointed to by the former, which may also get additional
447    data set later by the calling program. At the moment, the size of
448    pcre_study_data is fixed. We nevertheless save it in a field for returning via
449    the pcre_fullinfo() function so that if it becomes variable in the future, we
450    don't have to change that code. */
451    
452  extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));  extra = (pcre_extra *)(pcre_malloc)
453      (sizeof(pcre_extra) + sizeof(pcre_study_data));
454    
455  if (extra == NULL)  if (extra == NULL)
456    {    {
# Line 328  if (extra == NULL) Line 458  if (extra == NULL)
458    return NULL;    return NULL;
459    }    }
460    
461  extra->options = PCRE_STUDY_MAPPED | (caseless? PCRE_STUDY_CASELESS : 0);  study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
462  memcpy(extra->start_bits, start_bits, sizeof(start_bits));  extra->flags = PCRE_EXTRA_STUDY_DATA;
463    extra->study_data = study;
464    
465    study->size = sizeof(pcre_study_data);
466    study->options = PCRE_STUDY_MAPPED;
467    memcpy(study->start_bits, start_bits, sizeof(start_bits));
468    
469  return (pcre_extra *)extra;  return extra;
470  }  }
471    
472  /* End of study.c */  /* End of study.c */

Legend:
Removed from v.7  
changed lines
  Added in v.73

  ViewVC Help
Powered by ViewVC 1.1.5