/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 91 by nigel, Sat Feb 24 21:41:34 2007 UTC revision 199 by ph10, Tue Jul 31 14:39:09 2007 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2006 University of Cambridge             Copyright (c) 1997-2007 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting functions. */  supporting functions. */
43    
44    
45    #ifdef HAVE_CONFIG_H
46    #include <config.h>
47    #endif
48    
49  #include "pcre_internal.h"  #include "pcre_internal.h"
50    
51    
52    /* Returns from set_start_bits() */
53    
54    enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
55    
56    
57  /*************************************************  /*************************************************
58  *      Set a bit and maybe its alternate case    *  *      Set a bit and maybe its alternate case    *
59  *************************************************/  *************************************************/
# Line 72  if (caseless && (cd->ctypes[c] & ctype_l Line 81  if (caseless && (cd->ctypes[c] & ctype_l
81    
82    
83  /*************************************************  /*************************************************
84  *          Create bitmap of starting chars       *  *          Create bitmap of starting bytes       *
85  *************************************************/  *************************************************/
86    
87  /* This function scans a compiled unanchored expression and attempts to build a  /* This function scans a compiled unanchored expression recursively and
88  bitmap of the set of initial characters. If it can't, it returns FALSE. As time  attempts to build a bitmap of the set of possible starting bytes. As time goes
89  goes by, we may be able to get more clever at doing this.  by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
90    useful for parenthesized groups in patterns such as (a*)b where the group
91    provides some optional starting bytes but scanning must continue at the outer
92    level to find at least one mandatory byte. At the outermost level, this
93    function fails unless the result is SSB_DONE.
94    
95  Arguments:  Arguments:
96    code         points to an expression    code         points to an expression
# Line 86  Arguments: Line 99  Arguments:
99    utf8         TRUE if in UTF-8 mode    utf8         TRUE if in UTF-8 mode
100    cd           the block with char table pointers    cd           the block with char table pointers
101    
102  Returns:       TRUE if table built, FALSE otherwise  Returns:       SSB_FAIL     => Failed to find any starting bytes
103                   SSB_DONE     => Found mandatory starting bytes
104                   SSB_CONTINUE => Found optional starting bytes
105  */  */
106    
107  static BOOL  static int
108  set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,  set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
109    BOOL utf8, compile_data *cd)    BOOL utf8, compile_data *cd)
110  {  {
111  register int c;  register int c;
112    int yield = SSB_DONE;
113    
114  #if 0  #if 0
115  /* ========================================================================= */  /* ========================================================================= */
# Line 114  volatile int dummy; Line 130  volatile int dummy;
130    
131  do  do
132    {    {
133    const uschar *tcode = code + 1 + LINK_SIZE;    const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
134    BOOL try_next = TRUE;    BOOL try_next = TRUE;
135    
136    while (try_next)    while (try_next)    /* Loop for items in this branch */
137      {      {
138      /* If a branch starts with a bracket or a positive lookahead assertion,      int rc;
139      recurse to set bits from within them. That's all for this branch. */      switch(*tcode)
   
     if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)  
140        {        {
141        if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))        /* Fail if we reach something we don't understand */
         return FALSE;  
       try_next = FALSE;  
       }  
142    
     else switch(*tcode)  
       {  
143        default:        default:
144        return FALSE;        return SSB_FAIL;
145    
146        /* Skip over callout */        /* If we hit a bracket or a positive lookahead assertion, recurse to set
147          bits from within the subpattern. If it can't find anything, we have to
148          give up. If it finds some mandatory character(s), we are done for this
149          branch. Otherwise, carry on scanning after the subpattern. */
150    
151          case OP_BRA:
152          case OP_SBRA:
153          case OP_CBRA:
154          case OP_SCBRA:
155          case OP_ONCE:
156          case OP_ASSERT:
157          rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
158          if (rc == SSB_FAIL) return SSB_FAIL;
159          if (rc == SSB_DONE) try_next = FALSE; else
160            {
161            do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
162            tcode += 1 + LINK_SIZE;
163            }
164          break;
165    
166        case OP_CALLOUT:        /* If we hit ALT or KET, it means we haven't found anything mandatory in
167        tcode += 2 + 2*LINK_SIZE;        this branch, though we might have found something optional. For ALT, we
168          continue with the next alternative, but we have to arrange that the final
169          result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
170          return SSB_CONTINUE: if this is the top level, that indicates failure,
171          but after a nested subpattern, it causes scanning to continue. */
172    
173          case OP_ALT:
174          yield = SSB_CONTINUE;
175          try_next = FALSE;
176        break;        break;
177    
178        /* Skip over extended extraction bracket number */        case OP_KET:
179          case OP_KETRMAX:
180          case OP_KETRMIN:
181          return SSB_CONTINUE;
182    
183        case OP_BRANUMBER:        /* Skip over callout */
184        tcode += 3;  
185          case OP_CALLOUT:
186          tcode += 2 + 2*LINK_SIZE;
187        break;        break;
188    
189        /* Skip over lookbehind and negative lookahead assertions */        /* Skip over lookbehind and negative lookahead assertions */
# Line 152  do Line 192  do
192        case OP_ASSERTBACK:        case OP_ASSERTBACK:
193        case OP_ASSERTBACK_NOT:        case OP_ASSERTBACK_NOT:
194        do tcode += GET(tcode, 1); while (*tcode == OP_ALT);        do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
195        tcode += 1+LINK_SIZE;        tcode += 1 + LINK_SIZE;
196        break;        break;
197    
198        /* Skip over an option setting, changing the caseless flag */        /* Skip over an option setting, changing the caseless flag */
# Line 166  do Line 206  do
206    
207        case OP_BRAZERO:        case OP_BRAZERO:
208        case OP_BRAMINZERO:        case OP_BRAMINZERO:
209        if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))        if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
210          return FALSE;          return SSB_FAIL;
211  /* =========================================================================  /* =========================================================================
212        See the comment at the head of this function concerning the next line,        See the comment at the head of this function concerning the next line,
213        which was an old fudge for the benefit of OS/2.        which was an old fudge for the benefit of OS/2.
214        dummy = 1;        dummy = 1;
215    ========================================================================= */    ========================================================================= */
216        do tcode += GET(tcode,1); while (*tcode == OP_ALT);        do tcode += GET(tcode,1); while (*tcode == OP_ALT);
217        tcode += 1+LINK_SIZE;        tcode += 1 + LINK_SIZE;
218        break;        break;
219    
220        /* Single-char * or ? sets the bit and tries the next item */        /* Single-char * or ? sets the bit and tries the next item */
221    
222        case OP_STAR:        case OP_STAR:
223        case OP_MINSTAR:        case OP_MINSTAR:
224          case OP_POSSTAR:
225        case OP_QUERY:        case OP_QUERY:
226        case OP_MINQUERY:        case OP_MINQUERY:
227          case OP_POSQUERY:
228        set_bit(start_bits, tcode[1], caseless, cd);        set_bit(start_bits, tcode[1], caseless, cd);
229        tcode += 2;        tcode += 2;
230  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
231        if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;        if (utf8 && tcode[-1] >= 0xc0)
232            tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
233  #endif  #endif
234        break;        break;
235    
# Line 194  do Line 237  do
237    
238        case OP_UPTO:        case OP_UPTO:
239        case OP_MINUPTO:        case OP_MINUPTO:
240          case OP_POSUPTO:
241        set_bit(start_bits, tcode[3], caseless, cd);        set_bit(start_bits, tcode[3], caseless, cd);
242        tcode += 4;        tcode += 4;
243  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
244        if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;        if (utf8 && tcode[-1] >= 0xc0)
245            tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
246  #endif  #endif
247        break;        break;
248    
# Line 210  do Line 255  do
255        case OP_CHARNC:        case OP_CHARNC:
256        case OP_PLUS:        case OP_PLUS:
257        case OP_MINPLUS:        case OP_MINPLUS:
258          case OP_POSPLUS:
259        set_bit(start_bits, tcode[1], caseless, cd);        set_bit(start_bits, tcode[1], caseless, cd);
260        try_next = FALSE;        try_next = FALSE;
261        break;        break;
# Line 283  do Line 329  do
329    
330        case OP_TYPEUPTO:        case OP_TYPEUPTO:
331        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
332          case OP_TYPEPOSUPTO:
333        tcode += 2;               /* Fall through */        tcode += 2;               /* Fall through */
334    
335        case OP_TYPESTAR:        case OP_TYPESTAR:
336        case OP_TYPEMINSTAR:        case OP_TYPEMINSTAR:
337          case OP_TYPEPOSSTAR:
338        case OP_TYPEQUERY:        case OP_TYPEQUERY:
339        case OP_TYPEMINQUERY:        case OP_TYPEMINQUERY:
340          case OP_TYPEPOSQUERY:
341        switch(tcode[1])        switch(tcode[1])
342          {          {
343          case OP_ANY:          case OP_ANY:
344          return FALSE;          return SSB_FAIL;
345    
346          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
347          for (c = 0; c < 32; c++)          for (c = 0; c < 32; c++)
# Line 349  do Line 398  do
398        character with a value > 255. */        character with a value > 255. */
399    
400        case OP_NCLASS:        case OP_NCLASS:
401    #ifdef SUPPORT_UTF8
402        if (utf8)        if (utf8)
403          {          {
404          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
405          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
406          }          }
407    #endif
408        /* Fall through */        /* Fall through */
409    
410        case OP_CLASS:        case OP_CLASS:
# Line 366  do Line 417  do
417          value is > 127. In fact, there are only two possible starting bytes for          value is > 127. In fact, there are only two possible starting bytes for
418          characters in the range 128 - 255. */          characters in the range 128 - 255. */
419    
420    #ifdef SUPPORT_UTF8
421          if (utf8)          if (utf8)
422            {            {
423            for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];            for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
# Line 383  do Line 435  do
435          /* In non-UTF-8 mode, the two bit maps are completely compatible. */          /* In non-UTF-8 mode, the two bit maps are completely compatible. */
436    
437          else          else
438    #endif
439            {            {
440            for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];            for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
441            }            }
# Line 418  do Line 471  do
471    code += GET(code, 1);   /* Advance to next branch */    code += GET(code, 1);   /* Advance to next branch */
472    }    }
473  while (*code == OP_ALT);  while (*code == OP_ALT);
474  return TRUE;  return yield;
475  }  }
476    
477    
# Line 442  Returns:    pointer to a pcre_extra bloc Line 495  Returns:    pointer to a pcre_extra bloc
495              NULL on error or if no optimization possible              NULL on error or if no optimization possible
496  */  */
497    
498  PCRE_DATA_SCOPE pcre_extra *  PCRE_EXP_DEFN pcre_extra *
499  pcre_study(const pcre *external_re, int options, const char **errorptr)  pcre_study(const pcre *external_re, int options, const char **errorptr)
500  {  {
501  uschar start_bits[32];  uschar start_bits[32];
# Line 492  compile_block.ctypes = tables + ctypes_o Line 545  compile_block.ctypes = tables + ctypes_o
545  /* See if we can find a fixed set of initial characters for the pattern. */  /* See if we can find a fixed set of initial characters for the pattern. */
546    
547  memset(start_bits, 0, 32 * sizeof(uschar));  memset(start_bits, 0, 32 * sizeof(uschar));
548  if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,  if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
549    (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;    (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;
550    
551  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
552  the latter, which is pointed to by the former, which may also get additional  the latter, which is pointed to by the former, which may also get additional

Legend:
Removed from v.91  
changed lines
  Added in v.199

  ViewVC Help
Powered by ViewVC 1.1.5