/[pcre]/code/trunk/study.c
ViewVC logotype

Diff of /code/trunk/study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 7 by nigel, Sat Feb 24 21:38:09 2007 UTC revision 75 by nigel, Sat Feb 24 21:40:37 2007 UTC
# Line 9  the file Tech.Notes for some information Line 9  the file Tech.Notes for some information
9    
10  Written by: Philip Hazel <ph10@cam.ac.uk>  Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12             Copyright (c) 1997 University of Cambridge             Copyright (c) 1997-2004 University of Cambridge
13    
14  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
15  Permission is granted to anyone to use this software for any purpose on any  Redistribution and use in source and binary forms, with or without
16  computer system, and to redistribute it freely, subject to the following  modification, are permitted provided that the following conditions are met:
 restrictions:  
   
 1. This software is distributed in the hope that it will be useful,  
    but WITHOUT ANY WARRANTY; without even the implied warranty of  
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
17    
18  2. The origin of this software must not be misrepresented, either by      * Redistributions of source code must retain the above copyright notice,
19     explicit claim or by omission.        this list of conditions and the following disclaimer.
20    
21  3. Altered versions must be plainly marked as such, and must not be      * Redistributions in binary form must reproduce the above copyright
22     misrepresented as being the original software.        notice, this list of conditions and the following disclaimer in the
23          documentation and/or other materials provided with the distribution.
24    
25        * Neither the name of the University of Cambridge nor the names of its
26          contributors may be used to endorse or promote products derived from
27          this software without specific prior written permission.
28    
29    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
30    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
33    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
34    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
35    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
36    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
37    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
38    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
39    POSSIBILITY OF SUCH DAMAGE.
40  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
41  */  */
42    
# Line 37  the external pcre header. */ Line 49  the external pcre header. */
49    
50    
51  /*************************************************  /*************************************************
52    *      Set a bit and maybe its alternate case    *
53    *************************************************/
54    
55    /* Given a character, set its bit in the table, and also the bit for the other
56    version of a letter if we are caseless.
57    
58    Arguments:
59      start_bits    points to the bit map
60      c             is the character
61      caseless      the caseless flag
62      cd            the block with char table pointers
63    
64    Returns:        nothing
65    */
66    
67    static void
68    set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)
69    {
70    start_bits[c/8] |= (1 << (c&7));
71    if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
72      start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
73    }
74    
75    
76    
77    /*************************************************
78  *          Create bitmap of starting chars       *  *          Create bitmap of starting chars       *
79  *************************************************/  *************************************************/
80    
# Line 47  goes by, we may be able to get more clev Line 85  goes by, we may be able to get more clev
85  Arguments:  Arguments:
86    code         points to an expression    code         points to an expression
87    start_bits   points to a 32-byte table, initialized to 0    start_bits   points to a 32-byte table, initialized to 0
88      caseless     the current state of the caseless flag
89      utf8         TRUE if in UTF-8 mode
90      cd           the block with char table pointers
91    
92  Returns:       TRUE if table built, FALSE otherwise  Returns:       TRUE if table built, FALSE otherwise
93  */  */
94    
95  static BOOL  static BOOL
96  set_start_bits(const uschar *code, uschar *start_bits)  set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
97      BOOL utf8, compile_data *cd)
98  {  {
99  register int c;  register int c;
100    
101    /* This next statement and the later reference to dummy are here in order to
102    trick the optimizer of the IBM C compiler for OS/2 into generating correct
103    code. Apparently IBM isn't going to fix the problem, and we would rather not
104    disable optimization (in this module it actually makes a big difference, and
105    the pcre module can use all the optimization it can get). */
106    
107    volatile int dummy;
108    
109  do  do
110    {    {
111    const uschar *tcode = code + 3;    const uschar *tcode = code + 1 + LINK_SIZE;
112    BOOL try_next = TRUE;    BOOL try_next = TRUE;
113    
114    while (try_next)    while (try_next)
115      {      {
116      try_next = FALSE;      /* If a branch starts with a bracket or a positive lookahead assertion,
117        recurse to set bits from within them. That's all for this branch. */
118    
119      if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)      if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
120        {        {
121        if (!set_start_bits(tcode, start_bits)) return FALSE;        if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))
122            return FALSE;
123          try_next = FALSE;
124        }        }
125    
126      else switch(*tcode)      else switch(*tcode)
# Line 75  do Line 128  do
128        default:        default:
129        return FALSE;        return FALSE;
130    
131          /* Skip over callout */
132    
133          case OP_CALLOUT:
134          tcode += 2 + 2*LINK_SIZE;
135          break;
136    
137          /* Skip over extended extraction bracket number */
138    
139          case OP_BRANUMBER:
140          tcode += 3;
141          break;
142    
143          /* Skip over lookbehind and negative lookahead assertions */
144    
145          case OP_ASSERT_NOT:
146          case OP_ASSERTBACK:
147          case OP_ASSERTBACK_NOT:
148          do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
149          tcode += 1+LINK_SIZE;
150          break;
151    
152          /* Skip over an option setting, changing the caseless flag */
153    
154          case OP_OPT:
155          caseless = (tcode[1] & PCRE_CASELESS) != 0;
156          tcode += 2;
157          break;
158    
159        /* BRAZERO does the bracket, but carries on. */        /* BRAZERO does the bracket, but carries on. */
160    
161        case OP_BRAZERO:        case OP_BRAZERO:
162        case OP_BRAMINZERO:        case OP_BRAMINZERO:
163        if (!set_start_bits(++tcode, start_bits)) return FALSE;        if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))
164        do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);          return FALSE;
165        tcode += 3;        dummy = 1;
166        try_next = TRUE;        do tcode += GET(tcode,1); while (*tcode == OP_ALT);
167          tcode += 1+LINK_SIZE;
168        break;        break;
169    
170        /* Single-char * or ? sets the bit and tries the next item */        /* Single-char * or ? sets the bit and tries the next item */
# Line 91  do Line 173  do
173        case OP_MINSTAR:        case OP_MINSTAR:
174        case OP_QUERY:        case OP_QUERY:
175        case OP_MINQUERY:        case OP_MINQUERY:
176        start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));        set_bit(start_bits, tcode[1], caseless, cd);
177        tcode += 2;        tcode += 2;
178        try_next = TRUE;  #ifdef SUPPORT_UTF8
179          if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
180    #endif
181        break;        break;
182    
183        /* Single-char upto sets the bit and tries the next */        /* Single-char upto sets the bit and tries the next */
184    
185        case OP_UPTO:        case OP_UPTO:
186        case OP_MINUPTO:        case OP_MINUPTO:
187        start_bits[tcode[3]/8] |= (1 << (tcode[3]&7));        set_bit(start_bits, tcode[3], caseless, cd);
188        tcode += 4;        tcode += 4;
189        try_next = TRUE;  #ifdef SUPPORT_UTF8
190          if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
191    #endif
192        break;        break;
193    
194        /* At least one single char sets the bit and stops */        /* At least one single char sets the bit and stops */
195    
196        case OP_EXACT:       /* Fall through */        case OP_EXACT:       /* Fall through */
197        tcode++;        tcode += 2;
   
       case OP_CHARS:       /* Fall through */  
       tcode++;  
198    
199          case OP_CHAR:
200          case OP_CHARNC:
201        case OP_PLUS:        case OP_PLUS:
202        case OP_MINPLUS:        case OP_MINPLUS:
203        start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));        set_bit(start_bits, tcode[1], caseless, cd);
204          try_next = FALSE;
205        break;        break;
206    
207        /* Single character type sets the bits and stops */        /* Single character type sets the bits and stops */
208    
209        case OP_NOT_DIGIT:        case OP_NOT_DIGIT:
210        for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];        for (c = 0; c < 32; c++)
211            start_bits[c] |= ~cd->cbits[c+cbit_digit];
212          try_next = FALSE;
213        break;        break;
214    
215        case OP_DIGIT:        case OP_DIGIT:
216        for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];        for (c = 0; c < 32; c++)
217            start_bits[c] |= cd->cbits[c+cbit_digit];
218          try_next = FALSE;
219        break;        break;
220    
221        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
222        for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];        for (c = 0; c < 32; c++)
223            start_bits[c] |= ~cd->cbits[c+cbit_space];
224          try_next = FALSE;
225        break;        break;
226    
227        case OP_WHITESPACE:        case OP_WHITESPACE:
228        for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];        for (c = 0; c < 32; c++)
229            start_bits[c] |= cd->cbits[c+cbit_space];
230          try_next = FALSE;
231        break;        break;
232    
233        case OP_NOT_WORDCHAR:        case OP_NOT_WORDCHAR:
234        for (c = 0; c < 32; c++)        for (c = 0; c < 32; c++)
235          start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);          start_bits[c] |= ~cd->cbits[c+cbit_word];
236          try_next = FALSE;
237        break;        break;
238    
239        case OP_WORDCHAR:        case OP_WORDCHAR:
240        for (c = 0; c < 32; c++)        for (c = 0; c < 32; c++)
241          start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);          start_bits[c] |= cd->cbits[c+cbit_word];
242          try_next = FALSE;
243        break;        break;
244    
245        /* One or more character type fudges the pointer and restarts, knowing        /* One or more character type fudges the pointer and restarts, knowing
# Line 152  do Line 248  do
248        case OP_TYPEPLUS:        case OP_TYPEPLUS:
249        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
250        tcode++;        tcode++;
       try_next = TRUE;  
251        break;        break;
252    
253        case OP_TYPEEXACT:        case OP_TYPEEXACT:
254        tcode += 3;        tcode += 3;
       try_next = TRUE;  
255        break;        break;
256    
257        /* Zero or more repeats of character types set the bits and then        /* Zero or more repeats of character types set the bits and then
# Line 173  do Line 267  do
267        case OP_TYPEMINQUERY:        case OP_TYPEMINQUERY:
268        switch(tcode[1])        switch(tcode[1])
269          {          {
270            case OP_ANY:
271            return FALSE;
272    
273          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
274          for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];          for (c = 0; c < 32; c++)
275              start_bits[c] |= ~cd->cbits[c+cbit_digit];
276          break;          break;
277    
278          case OP_DIGIT:          case OP_DIGIT:
279          for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];          for (c = 0; c < 32; c++)
280              start_bits[c] |= cd->cbits[c+cbit_digit];
281          break;          break;
282    
283          case OP_NOT_WHITESPACE:          case OP_NOT_WHITESPACE:
284          for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];          for (c = 0; c < 32; c++)
285              start_bits[c] |= ~cd->cbits[c+cbit_space];
286          break;          break;
287    
288          case OP_WHITESPACE:          case OP_WHITESPACE:
289          for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];          for (c = 0; c < 32; c++)
290              start_bits[c] |= cd->cbits[c+cbit_space];
291          break;          break;
292    
293          case OP_NOT_WORDCHAR:          case OP_NOT_WORDCHAR:
294          for (c = 0; c < 32; c++)          for (c = 0; c < 32; c++)
295            start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);            start_bits[c] |= ~cd->cbits[c+cbit_word];
296          break;          break;
297    
298          case OP_WORDCHAR:          case OP_WORDCHAR:
299          for (c = 0; c < 32; c++)          for (c = 0; c < 32; c++)
300            start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);            start_bits[c] |= cd->cbits[c+cbit_word];
301          break;          break;
302          }          }
303    
304        tcode += 2;        tcode += 2;
       try_next = TRUE;  
305        break;        break;
306    
307        /* Character class: set the bits and either carry on or not,        /* Character class where all the information is in a bit map: set the
308        according to the repeat count. */        bits and either carry on or not, according to the repeat count. If it was
309          a negative class, and we are operating with UTF-8 characters, any byte
310          with a value >= 0xc4 is a potentially valid starter because it starts a
311          character with a value > 255. */
312    
313          case OP_NCLASS:
314          if (utf8)
315            {
316            start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
317            memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
318            }
319          /* Fall through */
320    
321        case OP_CLASS:        case OP_CLASS:
322          {          {
323          tcode++;          tcode++;
324          for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];  
325            /* In UTF-8 mode, the bits in a bit map correspond to character
326            values, not to byte values. However, the bit map we are constructing is
327            for byte values. So we have to do a conversion for characters whose
328            value is > 127. In fact, there are only two possible starting bytes for
329            characters in the range 128 - 255. */
330    
331            if (utf8)
332              {
333              for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
334              for (c = 128; c < 256; c++)
335                {
336                if ((tcode[c/8] && (1 << (c&7))) != 0)
337                  {
338                  int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
339                  start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
340                  c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
341                  }
342                }
343              }
344    
345            /* In non-UTF-8 mode, the two bit maps are completely compatible. */
346    
347            else
348              {
349              for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
350              }
351    
352            /* Advance past the bit map, and act on what follows */
353    
354          tcode += 32;          tcode += 32;
355          switch (*tcode)          switch (*tcode)
356            {            {
# Line 219  do Line 359  do
359            case OP_CRQUERY:            case OP_CRQUERY:
360            case OP_CRMINQUERY:            case OP_CRMINQUERY:
361            tcode++;            tcode++;
           try_next = TRUE;  
362            break;            break;
363    
364            case OP_CRRANGE:            case OP_CRRANGE:
365            case OP_CRMINRANGE:            case OP_CRMINRANGE:
366            if (((tcode[1] << 8) + tcode[2]) == 0)            if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
367              {              else try_next = FALSE;
368              tcode += 5;            break;
369              try_next = TRUE;  
370              }            default:
371              try_next = FALSE;
372            break;            break;
373            }            }
374          }          }
375        break; /* End of class handling */        break; /* End of bitmap class handling */
376    
377        }      /* End of switch */        }      /* End of switch */
378      }        /* End of try_next loop */      }        /* End of try_next loop */
379    
380    code += (code[1] << 8) + code[2];   /* Advance to next branch */    code += GET(code, 1);   /* Advance to next branch */
381    }    }
382  while (*code == OP_ALT);  while (*code == OP_ALT);
383  return TRUE;  return TRUE;
# Line 259  Arguments: Line 399  Arguments:
399    errorptr  points to where to place error messages;    errorptr  points to where to place error messages;
400              set NULL unless error              set NULL unless error
401    
402  Returns:    pointer to a pcre_extra block,  Returns:    pointer to a pcre_extra block, with study_data filled in and the
403                  appropriate flag set;
404              NULL on error or if no optimization possible              NULL on error or if no optimization possible
405  */  */
406    
407  pcre_extra *  EXPORT pcre_extra *
408  pcre_study(const pcre *external_re, int options, const char **errorptr)  pcre_study(const pcre *external_re, int options, const char **errorptr)
409  {  {
 BOOL caseless;  
410  uschar start_bits[32];  uschar start_bits[32];
411  real_pcre_extra *extra;  pcre_extra *extra;
412    pcre_study_data *study;
413    const uschar *tables;
414  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
415    uschar *code = (uschar *)re + re->name_table_offset +
416      (re->name_count * re->name_entry_size);
417    compile_data compile_block;
418    
419  *errorptr = NULL;  *errorptr = NULL;
420    
# Line 285  if ((options & ~PUBLIC_STUDY_OPTIONS) != Line 430  if ((options & ~PUBLIC_STUDY_OPTIONS) !=
430    return NULL;    return NULL;
431    }    }
432    
433  /* Caseless can either be from the compiled regex or from options. */  /* For an anchored pattern, or an unanchored pattern that has a first char, or
434    a multiline pattern that matches only at "line starts", no further processing
435  caseless = ((re->options | options) & PCRE_CASELESS) != 0;  at present. */
   
 /* For an anchored pattern, or an unchored pattern that has a first char, or a  
 multiline pattern that matches only at "line starts", no further processing at  
 present. */  
436    
437  if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)  if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
438    return NULL;    return NULL;
439    
440  /* See if we can find a fixed set of initial characters for the pattern. */  /* Set the character tables in the block that is passed around */
441    
442  memset(start_bits, 0, 32 * sizeof(uschar));  tables = re->tables;
443  if (!set_start_bits(re->code, start_bits)) return NULL;  if (tables == NULL)
444      (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, &tables);
445    
446    compile_block.lcc = tables + lcc_offset;
447    compile_block.fcc = tables + fcc_offset;
448    compile_block.cbits = tables + cbits_offset;
449    compile_block.ctypes = tables + ctypes_offset;
450    
451  /* If this studying is caseless, scan the created bit map and duplicate the  /* See if we can find a fixed set of initial characters for the pattern. */
 bits for any letters. */  
452    
453  if (caseless)  memset(start_bits, 0, 32 * sizeof(uschar));
454    {  if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
455    register int c;    (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;
   for (c = 0; c < 256; c++)  
     {  
     if ((start_bits[c/8] & (1 << (c&7))) != 0 &&  
         (pcre_ctypes[c] & ctype_letter) != 0)  
       {  
       int d = pcre_fcc[c];  
       start_bits[d/8] |= (1 << (d&7));  
       }  
     }  
   }  
456    
457  /* Get an "extra" block and put the information therein. */  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
458    the latter, which is pointed to by the former, which may also get additional
459    data set later by the calling program. At the moment, the size of
460    pcre_study_data is fixed. We nevertheless save it in a field for returning via
461    the pcre_fullinfo() function so that if it becomes variable in the future, we
462    don't have to change that code. */
463    
464  extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));  extra = (pcre_extra *)(pcre_malloc)
465      (sizeof(pcre_extra) + sizeof(pcre_study_data));
466    
467  if (extra == NULL)  if (extra == NULL)
468    {    {
# Line 328  if (extra == NULL) Line 470  if (extra == NULL)
470    return NULL;    return NULL;
471    }    }
472    
473  extra->options = PCRE_STUDY_MAPPED | (caseless? PCRE_STUDY_CASELESS : 0);  study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
474  memcpy(extra->start_bits, start_bits, sizeof(start_bits));  extra->flags = PCRE_EXTRA_STUDY_DATA;
475    extra->study_data = study;
476    
477    study->size = sizeof(pcre_study_data);
478    study->options = PCRE_STUDY_MAPPED;
479    memcpy(study->start_bits, start_bits, sizeof(start_bits));
480    
481  return (pcre_extra *)extra;  return extra;
482  }  }
483    
484  /* End of study.c */  /* End of study.c */

Legend:
Removed from v.7  
changed lines
  Added in v.75

  ViewVC Help
Powered by ViewVC 1.1.5