/[pcre]/code/tags/pcre-3.6/study.c
ViewVC logotype

Diff of /code/tags/pcre-3.6/study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 15 by nigel, Sat Feb 24 21:38:25 2007 UTC revision 53 by nigel, Sat Feb 24 21:39:42 2007 UTC
# Line 9  the file Tech.Notes for some information Line 9  the file Tech.Notes for some information
9    
10  Written by: Philip Hazel <ph10@cam.ac.uk>  Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12             Copyright (c) 1998 University of Cambridge             Copyright (c) 1997-2001 University of Cambridge
13    
14  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
15  Permission is granted to anyone to use this software for any purpose on any  Permission is granted to anyone to use this software for any purpose on any
# Line 25  restrictions: Line 25  restrictions:
25    
26  3. Altered versions must be plainly marked as such, and must not be  3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.     misrepresented as being the original software.
28    
29    4. If PCRE is embedded in any software that is released under the GNU
30       General Purpose Licence (GPL), then the terms of that licence shall
31       supersede any condition above with which it is incompatible.
32  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
33  */  */
34    
# Line 37  the external pcre header. */ Line 41  the external pcre header. */
41    
42    
43  /*************************************************  /*************************************************
44    *      Set a bit and maybe its alternate case    *
45    *************************************************/
46    
47    /* Given a character, set its bit in the table, and also the bit for the other
48    version of a letter if we are caseless.
49    
50    Arguments:
51      start_bits    points to the bit map
52      c             is the character
53      caseless      the caseless flag
54      cd            the block with char table pointers
55    
56    Returns:        nothing
57    */
58    
59    static void
60    set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd)
61    {
62    start_bits[c/8] |= (1 << (c&7));
63    if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
64      start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
65    }
66    
67    
68    
69    /*************************************************
70  *          Create bitmap of starting chars       *  *          Create bitmap of starting chars       *
71  *************************************************/  *************************************************/
72    
# Line 47  goes by, we may be able to get more clev Line 77  goes by, we may be able to get more clev
77  Arguments:  Arguments:
78    code         points to an expression    code         points to an expression
79    start_bits   points to a 32-byte table, initialized to 0    start_bits   points to a 32-byte table, initialized to 0
80      caseless     the current state of the caseless flag
81      cd           the block with char table pointers
82    
83  Returns:       TRUE if table built, FALSE otherwise  Returns:       TRUE if table built, FALSE otherwise
84  */  */
85    
86  static BOOL  static BOOL
87  set_start_bits(const uschar *code, uschar *start_bits)  set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
88      compile_data *cd)
89  {  {
90  register int c;  register int c;
91    
92    /* This next statement and the later reference to dummy are here in order to
93    trick the optimizer of the IBM C compiler for OS/2 into generating correct
94    code. Apparently IBM isn't going to fix the problem, and we would rather not
95    disable optimization (in this module it actually makes a big difference, and
96    the pcre module can use all the optimization it can get). */
97    
98    volatile int dummy;
99    
100  do  do
101    {    {
102    const uschar *tcode = code + 3;    const uschar *tcode = code + 3;
# Line 63  do Line 104  do
104    
105    while (try_next)    while (try_next)
106      {      {
107      try_next = FALSE;      /* If a branch starts with a bracket or a positive lookahead assertion,
108        recurse to set bits from within them. That's all for this branch. */
109    
110      if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)      if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
111        {        {
112        if (!set_start_bits(tcode, start_bits)) return FALSE;        if (!set_start_bits(tcode, start_bits, caseless, cd))
113            return FALSE;
114          try_next = FALSE;
115        }        }
116    
117      else switch(*tcode)      else switch(*tcode)
# Line 75  do Line 119  do
119        default:        default:
120        return FALSE;        return FALSE;
121    
122          /* Skip over extended extraction bracket number */
123    
124          case OP_BRANUMBER:
125          tcode += 3;
126          break;
127    
128          /* Skip over lookbehind and negative lookahead assertions */
129    
130          case OP_ASSERT_NOT:
131          case OP_ASSERTBACK:
132          case OP_ASSERTBACK_NOT:
133          do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
134          tcode += 3;
135          break;
136    
137          /* Skip over an option setting, changing the caseless flag */
138    
139          case OP_OPT:
140          caseless = (tcode[1] & PCRE_CASELESS) != 0;
141          tcode += 2;
142          break;
143    
144        /* BRAZERO does the bracket, but carries on. */        /* BRAZERO does the bracket, but carries on. */
145    
146        case OP_BRAZERO:        case OP_BRAZERO:
147        case OP_BRAMINZERO:        case OP_BRAMINZERO:
148        if (!set_start_bits(++tcode, start_bits)) return FALSE;        if (!set_start_bits(++tcode, start_bits, caseless, cd))
149            return FALSE;
150          dummy = 1;
151        do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);        do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
152        tcode += 3;        tcode += 3;
       try_next = TRUE;  
153        break;        break;
154    
155        /* Single-char * or ? sets the bit and tries the next item */        /* Single-char * or ? sets the bit and tries the next item */
# Line 91  do Line 158  do
158        case OP_MINSTAR:        case OP_MINSTAR:
159        case OP_QUERY:        case OP_QUERY:
160        case OP_MINQUERY:        case OP_MINQUERY:
161        start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));        set_bit(start_bits, tcode[1], caseless, cd);
162        tcode += 2;        tcode += 2;
       try_next = TRUE;  
163        break;        break;
164    
165        /* Single-char upto sets the bit and tries the next */        /* Single-char upto sets the bit and tries the next */
166    
167        case OP_UPTO:        case OP_UPTO:
168        case OP_MINUPTO:        case OP_MINUPTO:
169        start_bits[tcode[3]/8] |= (1 << (tcode[3]&7));        set_bit(start_bits, tcode[3], caseless, cd);
170        tcode += 4;        tcode += 4;
       try_next = TRUE;  
171        break;        break;
172    
173        /* At least one single char sets the bit and stops */        /* At least one single char sets the bit and stops */
# Line 115  do Line 180  do
180    
181        case OP_PLUS:        case OP_PLUS:
182        case OP_MINPLUS:        case OP_MINPLUS:
183        start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));        set_bit(start_bits, tcode[1], caseless, cd);
184          try_next = FALSE;
185        break;        break;
186    
187        /* Single character type sets the bits and stops */        /* Single character type sets the bits and stops */
188    
189        case OP_NOT_DIGIT:        case OP_NOT_DIGIT:
190        for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];        for (c = 0; c < 32; c++)
191            start_bits[c] |= ~cd->cbits[c+cbit_digit];
192          try_next = FALSE;
193        break;        break;
194    
195        case OP_DIGIT:        case OP_DIGIT:
196        for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];        for (c = 0; c < 32; c++)
197            start_bits[c] |= cd->cbits[c+cbit_digit];
198          try_next = FALSE;
199        break;        break;
200    
201        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
202        for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];        for (c = 0; c < 32; c++)
203            start_bits[c] |= ~cd->cbits[c+cbit_space];
204          try_next = FALSE;
205        break;        break;
206    
207        case OP_WHITESPACE:        case OP_WHITESPACE:
208        for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];        for (c = 0; c < 32; c++)
209            start_bits[c] |= cd->cbits[c+cbit_space];
210          try_next = FALSE;
211        break;        break;
212    
213        case OP_NOT_WORDCHAR:        case OP_NOT_WORDCHAR:
214        for (c = 0; c < 32; c++)        for (c = 0; c < 32; c++)
215          start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);          start_bits[c] |= ~cd->cbits[c+cbit_word];
216          try_next = FALSE;
217        break;        break;
218    
219        case OP_WORDCHAR:        case OP_WORDCHAR:
220        for (c = 0; c < 32; c++)        for (c = 0; c < 32; c++)
221          start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);          start_bits[c] |= cd->cbits[c+cbit_word];
222          try_next = FALSE;
223        break;        break;
224    
225        /* One or more character type fudges the pointer and restarts, knowing        /* One or more character type fudges the pointer and restarts, knowing
# Line 152  do Line 228  do
228        case OP_TYPEPLUS:        case OP_TYPEPLUS:
229        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
230        tcode++;        tcode++;
       try_next = TRUE;  
231        break;        break;
232    
233        case OP_TYPEEXACT:        case OP_TYPEEXACT:
234        tcode += 3;        tcode += 3;
       try_next = TRUE;  
235        break;        break;
236    
237        /* Zero or more repeats of character types set the bits and then        /* Zero or more repeats of character types set the bits and then
# Line 174  do Line 248  do
248        switch(tcode[1])        switch(tcode[1])
249          {          {
250          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
251          for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];          for (c = 0; c < 32; c++)
252              start_bits[c] |= ~cd->cbits[c+cbit_digit];
253          break;          break;
254    
255          case OP_DIGIT:          case OP_DIGIT:
256          for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];          for (c = 0; c < 32; c++)
257              start_bits[c] |= cd->cbits[c+cbit_digit];
258          break;          break;
259    
260          case OP_NOT_WHITESPACE:          case OP_NOT_WHITESPACE:
261          for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];          for (c = 0; c < 32; c++)
262              start_bits[c] |= ~cd->cbits[c+cbit_space];
263          break;          break;
264    
265          case OP_WHITESPACE:          case OP_WHITESPACE:
266          for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];          for (c = 0; c < 32; c++)
267              start_bits[c] |= cd->cbits[c+cbit_space];
268          break;          break;
269    
270          case OP_NOT_WORDCHAR:          case OP_NOT_WORDCHAR:
271          for (c = 0; c < 32; c++)          for (c = 0; c < 32; c++)
272            start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);            start_bits[c] |= ~cd->cbits[c+cbit_word];
273          break;          break;
274    
275          case OP_WORDCHAR:          case OP_WORDCHAR:
276          for (c = 0; c < 32; c++)          for (c = 0; c < 32; c++)
277            start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);            start_bits[c] |= cd->cbits[c+cbit_word];
278          break;          break;
279          }          }
280    
281        tcode += 2;        tcode += 2;
       try_next = TRUE;  
282        break;        break;
283    
284        /* Character class: set the bits and either carry on or not,        /* Character class: set the bits and either carry on or not,
285        according to the repeat count. */        according to the repeat count. */
286    
287        case OP_CLASS:        case OP_CLASS:
       case OP_NEGCLASS:  
288          {          {
289          tcode++;          tcode++;
290          for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];          for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
# Line 220  do Line 296  do
296            case OP_CRQUERY:            case OP_CRQUERY:
297            case OP_CRMINQUERY:            case OP_CRMINQUERY:
298            tcode++;            tcode++;
           try_next = TRUE;  
299            break;            break;
300    
301            case OP_CRRANGE:            case OP_CRRANGE:
302            case OP_CRMINRANGE:            case OP_CRMINRANGE:
303            if (((tcode[1] << 8) + tcode[2]) == 0)            if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
304              {              else try_next = FALSE;
305              tcode += 5;            break;
306              try_next = TRUE;  
307              }            default:
308              try_next = FALSE;
309            break;            break;
310            }            }
311          }          }
# Line 267  Returns:    pointer to a pcre_extra bloc Line 343  Returns:    pointer to a pcre_extra bloc
343  pcre_extra *  pcre_extra *
344  pcre_study(const pcre *external_re, int options, const char **errorptr)  pcre_study(const pcre *external_re, int options, const char **errorptr)
345  {  {
 BOOL caseless;  
346  uschar start_bits[32];  uschar start_bits[32];
347  real_pcre_extra *extra;  real_pcre_extra *extra;
348  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
349    compile_data compile_block;
350    
351  *errorptr = NULL;  *errorptr = NULL;
352    
# Line 286  if ((options & ~PUBLIC_STUDY_OPTIONS) != Line 362  if ((options & ~PUBLIC_STUDY_OPTIONS) !=
362    return NULL;    return NULL;
363    }    }
364    
 /* Caseless can either be from the compiled regex or from options. */  
   
 caseless = ((re->options | options) & PCRE_CASELESS) != 0;  
   
365  /* For an anchored pattern, or an unchored pattern that has a first char, or a  /* For an anchored pattern, or an unchored pattern that has a first char, or a
366  multiline pattern that matches only at "line starts", no further processing at  multiline pattern that matches only at "line starts", no further processing at
367  present. */  present. */
# Line 297  present. */ Line 369  present. */
369  if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)  if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
370    return NULL;    return NULL;
371    
372  /* See if we can find a fixed set of initial characters for the pattern. */  /* Set the character tables in the block which is passed around */
373    
374  memset(start_bits, 0, 32 * sizeof(uschar));  compile_block.lcc = re->tables + lcc_offset;
375  if (!set_start_bits(re->code, start_bits)) return NULL;  compile_block.fcc = re->tables + fcc_offset;
376    compile_block.cbits = re->tables + cbits_offset;
377    compile_block.ctypes = re->tables + ctypes_offset;
378    
379  /* If this studying is caseless, scan the created bit map and duplicate the  /* See if we can find a fixed set of initial characters for the pattern. */
 bits for any letters. */  
380    
381  if (caseless)  memset(start_bits, 0, 32 * sizeof(uschar));
382    {  if (!set_start_bits(re->code, start_bits, (re->options & PCRE_CASELESS) != 0,
383    register int c;    &compile_block)) return NULL;
   for (c = 0; c < 256; c++)  
     {  
     if ((start_bits[c/8] & (1 << (c&7))) != 0 &&  
         (pcre_ctypes[c] & ctype_letter) != 0)  
       {  
       int d = pcre_fcc[c];  
       start_bits[d/8] |= (1 << (d&7));  
       }  
     }  
   }  
384    
385  /* Get an "extra" block and put the information therein. */  /* Get an "extra" block and put the information therein. */
386    
# Line 329  if (extra == NULL) Line 392  if (extra == NULL)
392    return NULL;    return NULL;
393    }    }
394    
395  extra->options = PCRE_STUDY_MAPPED | (caseless? PCRE_STUDY_CASELESS : 0);  extra->options = PCRE_STUDY_MAPPED;
396  memcpy(extra->start_bits, start_bits, sizeof(start_bits));  memcpy(extra->start_bits, start_bits, sizeof(start_bits));
397    
398  return (pcre_extra *)extra;  return (pcre_extra *)extra;

Legend:
Removed from v.15  
changed lines
  Added in v.53

  ViewVC Help
Powered by ViewVC 1.1.5