/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 85 by nigel, Sat Feb 24 21:41:13 2007 UTC revision 455 by ph10, Sat Sep 26 19:12:32 2009 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2005 University of Cambridge             Copyright (c) 1997-2009 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting functions. */  supporting functions. */
43    
44    
45    #ifdef HAVE_CONFIG_H
46    #include "config.h"
47    #endif
48    
49  #include "pcre_internal.h"  #include "pcre_internal.h"
50    
51    
52    /* Returns from set_start_bits() */
53    
54    enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
55    
56    
57    
58    /*************************************************
59    *   Find the minimum subject length for a group  *
60    *************************************************/
61    
62    /* Scan a parenthesized group and compute the minimum length of subject that
63    is needed to match it. This is a lower bound; it does not mean there is a
64    string of that length that matches. In UTF8 mode, the result is in characters
65    rather than bytes.
66    
67    Arguments:
68      code       pointer to start of group (the bracket)
69      startcode  pointer to start of the whole pattern
70      options    the compiling options
71    
72    Returns:   the minimum length
73               -1 if \C was encountered
74               -2 internal error (missing capturing bracket)
75    */
76    
77    static int
78    find_minlength(const uschar *code, const uschar *startcode, int options)
79    {
80    int length = -1;
81    BOOL utf8 = (options & PCRE_UTF8) != 0;
82    BOOL had_recurse = FALSE;
83    register int branchlength = 0;
84    register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
85    
86    if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2;
87    
88    /* Scan along the opcodes for this branch. If we get to the end of the
89    branch, check the length against that of the other branches. */
90    
91    for (;;)
92      {
93      int d, min;
94      uschar *cs, *ce;
95      register int op = *cc;
96    
97      switch (op)
98        {
99        case OP_CBRA:
100        case OP_SCBRA:
101        case OP_BRA:
102        case OP_SBRA:
103        case OP_ONCE:
104        case OP_COND:
105        case OP_SCOND:
106        d = find_minlength(cc, startcode, options);
107        if (d < 0) return d;
108        branchlength += d;
109        do cc += GET(cc, 1); while (*cc == OP_ALT);
110        cc += 1 + LINK_SIZE;
111        break;
112    
113        /* Reached end of a branch; if it's a ket it is the end of a nested
114        call. If it's ALT it is an alternation in a nested call. If it is
115        END it's the end of the outer call. All can be handled by the same code. */
116    
117        case OP_ALT:
118        case OP_KET:
119        case OP_KETRMAX:
120        case OP_KETRMIN:
121        case OP_END:
122        if (length < 0 || (!had_recurse && branchlength < length))
123          length = branchlength;
124        if (*cc != OP_ALT) return length;
125        cc += 1 + LINK_SIZE;
126        branchlength = 0;
127        had_recurse = FALSE;
128        break;
129    
130        /* Skip over assertive subpatterns */
131    
132        case OP_ASSERT:
133        case OP_ASSERT_NOT:
134        case OP_ASSERTBACK:
135        case OP_ASSERTBACK_NOT:
136        do cc += GET(cc, 1); while (*cc == OP_ALT);
137        /* Fall through */
138    
139        /* Skip over things that don't match chars */
140    
141        case OP_REVERSE:
142        case OP_CREF:
143        case OP_RREF:
144        case OP_DEF:
145        case OP_OPT:
146        case OP_CALLOUT:
147        case OP_SOD:
148        case OP_SOM:
149        case OP_EOD:
150        case OP_EODN:
151        case OP_CIRC:
152        case OP_DOLL:
153        case OP_NOT_WORD_BOUNDARY:
154        case OP_WORD_BOUNDARY:
155        cc += _pcre_OP_lengths[*cc];
156        break;
157    
158        /* Skip over a subpattern that has a {0} or {0,x} quantifier */
159    
160        case OP_BRAZERO:
161        case OP_BRAMINZERO:
162        case OP_SKIPZERO:
163        cc += _pcre_OP_lengths[*cc];
164        do cc += GET(cc, 1); while (*cc == OP_ALT);
165        cc += 1 + LINK_SIZE;
166        break;
167    
168        /* Handle literal characters and + repetitions */
169    
170        case OP_CHAR:
171        case OP_CHARNC:
172        case OP_NOT:
173        case OP_PLUS:
174        case OP_MINPLUS:
175        case OP_POSPLUS:
176        case OP_NOTPLUS:
177        case OP_NOTMINPLUS:
178        case OP_NOTPOSPLUS:
179        branchlength++;
180        cc += 2;
181    #ifdef SUPPORT_UTF8
182        if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
183    #endif
184        break;
185    
186        case OP_TYPEPLUS:
187        case OP_TYPEMINPLUS:
188        case OP_TYPEPOSPLUS:
189        branchlength++;
190        cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
191        break;
192    
193        /* Handle exact repetitions. The count is already in characters, but we
194        need to skip over a multibyte character in UTF8 mode.  */
195    
196        case OP_EXACT:
197        case OP_NOTEXACT:
198        branchlength += GET2(cc,1);
199        cc += 4;
200    #ifdef SUPPORT_UTF8
201        if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
202    #endif
203        break;
204    
205        case OP_TYPEEXACT:
206        branchlength += GET2(cc,1);
207        cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
208        break;
209    
210        /* Handle single-char non-literal matchers */
211    
212        case OP_PROP:
213        case OP_NOTPROP:
214        cc += 2;
215        /* Fall through */
216    
217        case OP_NOT_DIGIT:
218        case OP_DIGIT:
219        case OP_NOT_WHITESPACE:
220        case OP_WHITESPACE:
221        case OP_NOT_WORDCHAR:
222        case OP_WORDCHAR:
223        case OP_ANY:
224        case OP_ALLANY:
225        case OP_EXTUNI:
226        case OP_HSPACE:
227        case OP_NOT_HSPACE:
228        case OP_VSPACE:
229        case OP_NOT_VSPACE:
230        branchlength++;
231        cc++;
232        break;
233    
234        /* "Any newline" might match two characters */
235    
236        case OP_ANYNL:
237        branchlength += 2;
238        cc++;
239        break;
240    
241        /* The single-byte matcher means we can't proceed in UTF-8 mode */
242    
243        case OP_ANYBYTE:
244    #ifdef SUPPORT_UTF8
245        if (utf8) return -1;
246    #endif
247        branchlength++;
248        cc++;
249        break;
250    
251        /* For repeated character types, we have to test for \p and \P, which have
252        an extra two bytes of parameters. */
253    
254        case OP_TYPESTAR:
255        case OP_TYPEMINSTAR:
256        case OP_TYPEQUERY:
257        case OP_TYPEMINQUERY:
258        case OP_TYPEPOSSTAR:
259        case OP_TYPEPOSQUERY:
260        if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
261        cc += _pcre_OP_lengths[op];
262        break;
263    
264        case OP_TYPEUPTO:
265        case OP_TYPEMINUPTO:
266        case OP_TYPEPOSUPTO:
267        if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
268        cc += _pcre_OP_lengths[op];
269        break;
270    
271        /* Check a class for variable quantification */
272    
273    #ifdef SUPPORT_UTF8
274        case OP_XCLASS:
275        cc += GET(cc, 1) - 33;
276        /* Fall through */
277    #endif
278    
279        case OP_CLASS:
280        case OP_NCLASS:
281        cc += 33;
282    
283        switch (*cc)
284          {
285          case OP_CRPLUS:
286          case OP_CRMINPLUS:
287          branchlength++;
288          /* Fall through */
289    
290          case OP_CRSTAR:
291          case OP_CRMINSTAR:
292          case OP_CRQUERY:
293          case OP_CRMINQUERY:
294          cc++;
295          break;
296    
297          case OP_CRRANGE:
298          case OP_CRMINRANGE:
299          branchlength += GET2(cc,1);
300          cc += 5;
301          break;
302    
303          default:
304          branchlength++;
305          break;
306          }
307        break;
308    
309        /* Backreferences and subroutine calls are treated in the same way: we find
310        the minimum length for the subpattern. A recursion, however, causes an
311        a flag to be set that causes the length of this branch to be ignored. The
312        logic is that a recursion can only make sense if there is another
313        alternation that stops the recursing. That will provide the minimum length
314        (when no recursion happens). A backreference within the group that it is
315        referencing behaves in the same way. */
316    
317        case OP_REF:
318        ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
319        if (cs == NULL) return -2;
320        do ce += GET(ce, 1); while (*ce == OP_ALT);
321        if (cc > cs && cc < ce)
322          {
323          d = 0;
324          had_recurse = TRUE;
325          }
326        else d = find_minlength(cs, startcode, options);
327        cc += 3;
328    
329        /* Handle repeated back references */
330    
331        switch (*cc)
332          {
333          case OP_CRSTAR:
334          case OP_CRMINSTAR:
335          case OP_CRQUERY:
336          case OP_CRMINQUERY:
337          min = 0;
338          cc++;
339          break;
340    
341          case OP_CRRANGE:
342          case OP_CRMINRANGE:
343          min = GET2(cc, 1);
344          cc += 5;
345          break;
346    
347          default:
348          min = 1;
349          break;
350          }
351    
352        branchlength += min * d;
353        break;
354    
355        case OP_RECURSE:
356        cs = ce = (uschar *)startcode + GET(cc, 1);
357        if (cs == NULL) return -2;
358        do ce += GET(ce, 1); while (*ce == OP_ALT);
359        if (cc > cs && cc < ce)
360          had_recurse = TRUE;
361        else
362          branchlength += find_minlength(cs, startcode, options);
363        cc += 1 + LINK_SIZE;
364        break;
365    
366        /* Anything else does not or need not match a character. We can get the
367        item's length from the table, but for those that can match zero occurrences
368        of a character, we must take special action for UTF-8 characters. */
369    
370        case OP_UPTO:
371        case OP_NOTUPTO:
372        case OP_MINUPTO:
373        case OP_NOTMINUPTO:
374        case OP_POSUPTO:
375        case OP_STAR:
376        case OP_MINSTAR:
377        case OP_NOTMINSTAR:
378        case OP_POSSTAR:
379        case OP_NOTPOSSTAR:
380        case OP_QUERY:
381        case OP_MINQUERY:
382        case OP_NOTMINQUERY:
383        case OP_POSQUERY:
384        case OP_NOTPOSQUERY:
385        cc += _pcre_OP_lengths[op];
386    #ifdef SUPPORT_UTF8
387        if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
388    #endif
389        break;
390    
391        /* For the record, these are the opcodes that are matched by "default":
392        OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
393        OP_THEN. */
394    
395        default:
396        cc += _pcre_OP_lengths[op];
397        break;
398        }
399      }
400    /* Control never gets here */
401    }
402    
403    
404    
405  /*************************************************  /*************************************************
406  *      Set a bit and maybe its alternate case    *  *      Set a bit and maybe its alternate case    *
407  *************************************************/  *************************************************/
# Line 72  if (caseless && (cd->ctypes[c] & ctype_l Line 429  if (caseless && (cd->ctypes[c] & ctype_l
429    
430    
431  /*************************************************  /*************************************************
432  *          Create bitmap of starting chars       *  *          Create bitmap of starting bytes       *
433  *************************************************/  *************************************************/
434    
435  /* This function scans a compiled unanchored expression and attempts to build a  /* This function scans a compiled unanchored expression recursively and
436  bitmap of the set of initial characters. If it can't, it returns FALSE. As time  attempts to build a bitmap of the set of possible starting bytes. As time goes
437  goes by, we may be able to get more clever at doing this.  by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
438    useful for parenthesized groups in patterns such as (a*)b where the group
439    provides some optional starting bytes but scanning must continue at the outer
440    level to find at least one mandatory byte. At the outermost level, this
441    function fails unless the result is SSB_DONE.
442    
443  Arguments:  Arguments:
444    code         points to an expression    code         points to an expression
# Line 86  Arguments: Line 447  Arguments:
447    utf8         TRUE if in UTF-8 mode    utf8         TRUE if in UTF-8 mode
448    cd           the block with char table pointers    cd           the block with char table pointers
449    
450  Returns:       TRUE if table built, FALSE otherwise  Returns:       SSB_FAIL     => Failed to find any starting bytes
451                   SSB_DONE     => Found mandatory starting bytes
452                   SSB_CONTINUE => Found optional starting bytes
453  */  */
454    
455  static BOOL  static int
456  set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,  set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
457    BOOL utf8, compile_data *cd)    BOOL utf8, compile_data *cd)
458  {  {
459  register int c;  register int c;
460    int yield = SSB_DONE;
461    
462    #if 0
463    /* ========================================================================= */
464    /* The following comment and code was inserted in January 1999. In May 2006,
465    when it was observed to cause compiler warnings about unused values, I took it
466    out again. If anybody is still using OS/2, they will have to put it back
467    manually. */
468    
469  /* This next statement and the later reference to dummy are here in order to  /* This next statement and the later reference to dummy are here in order to
470  trick the optimizer of the IBM C compiler for OS/2 into generating correct  trick the optimizer of the IBM C compiler for OS/2 into generating correct
# Line 102  disable optimization (in this module it Line 473  disable optimization (in this module it
473  the pcre module can use all the optimization it can get). */  the pcre module can use all the optimization it can get). */
474    
475  volatile int dummy;  volatile int dummy;
476    /* ========================================================================= */
477    #endif
478    
479  do  do
480    {    {
481    const uschar *tcode = code + 1 + LINK_SIZE;    const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
482    BOOL try_next = TRUE;    BOOL try_next = TRUE;
483    
484    while (try_next)    while (try_next)    /* Loop for items in this branch */
485      {      {
486      /* If a branch starts with a bracket or a positive lookahead assertion,      int rc;
487      recurse to set bits from within them. That's all for this branch. */      switch(*tcode)
   
     if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)  
488        {        {
489        if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))        /* Fail if we reach something we don't understand */
         return FALSE;  
       try_next = FALSE;  
       }  
490    
     else switch(*tcode)  
       {  
491        default:        default:
492        return FALSE;        return SSB_FAIL;
493    
494        /* Skip over callout */        /* If we hit a bracket or a positive lookahead assertion, recurse to set
495          bits from within the subpattern. If it can't find anything, we have to
496          give up. If it finds some mandatory character(s), we are done for this
497          branch. Otherwise, carry on scanning after the subpattern. */
498    
499          case OP_BRA:
500          case OP_SBRA:
501          case OP_CBRA:
502          case OP_SCBRA:
503          case OP_ONCE:
504          case OP_ASSERT:
505          rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
506          if (rc == SSB_FAIL) return SSB_FAIL;
507          if (rc == SSB_DONE) try_next = FALSE; else
508            {
509            do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
510            tcode += 1 + LINK_SIZE;
511            }
512          break;
513    
514        case OP_CALLOUT:        /* If we hit ALT or KET, it means we haven't found anything mandatory in
515        tcode += 2 + 2*LINK_SIZE;        this branch, though we might have found something optional. For ALT, we
516          continue with the next alternative, but we have to arrange that the final
517          result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
518          return SSB_CONTINUE: if this is the top level, that indicates failure,
519          but after a nested subpattern, it causes scanning to continue. */
520    
521          case OP_ALT:
522          yield = SSB_CONTINUE;
523          try_next = FALSE;
524        break;        break;
525    
526        /* Skip over extended extraction bracket number */        case OP_KET:
527          case OP_KETRMAX:
528          case OP_KETRMIN:
529          return SSB_CONTINUE;
530    
531        case OP_BRANUMBER:        /* Skip over callout */
532        tcode += 3;  
533          case OP_CALLOUT:
534          tcode += 2 + 2*LINK_SIZE;
535        break;        break;
536    
537        /* Skip over lookbehind and negative lookahead assertions */        /* Skip over lookbehind and negative lookahead assertions */
# Line 143  do Line 540  do
540        case OP_ASSERTBACK:        case OP_ASSERTBACK:
541        case OP_ASSERTBACK_NOT:        case OP_ASSERTBACK_NOT:
542        do tcode += GET(tcode, 1); while (*tcode == OP_ALT);        do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
543        tcode += 1+LINK_SIZE;        tcode += 1 + LINK_SIZE;
544        break;        break;
545    
546        /* Skip over an option setting, changing the caseless flag */        /* Skip over an option setting, changing the caseless flag */
# Line 157  do Line 554  do
554    
555        case OP_BRAZERO:        case OP_BRAZERO:
556        case OP_BRAMINZERO:        case OP_BRAMINZERO:
557        if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))        if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
558          return FALSE;          return SSB_FAIL;
559    /* =========================================================================
560          See the comment at the head of this function concerning the next line,
561          which was an old fudge for the benefit of OS/2.
562        dummy = 1;        dummy = 1;
563      ========================================================================= */
564        do tcode += GET(tcode,1); while (*tcode == OP_ALT);        do tcode += GET(tcode,1); while (*tcode == OP_ALT);
565        tcode += 1+LINK_SIZE;        tcode += 1 + LINK_SIZE;
566          break;
567    
568          /* SKIPZERO skips the bracket. */
569    
570          case OP_SKIPZERO:
571          tcode++;
572          do tcode += GET(tcode,1); while (*tcode == OP_ALT);
573          tcode += 1 + LINK_SIZE;
574        break;        break;
575    
576        /* Single-char * or ? sets the bit and tries the next item */        /* Single-char * or ? sets the bit and tries the next item */
577    
578        case OP_STAR:        case OP_STAR:
579        case OP_MINSTAR:        case OP_MINSTAR:
580          case OP_POSSTAR:
581        case OP_QUERY:        case OP_QUERY:
582        case OP_MINQUERY:        case OP_MINQUERY:
583          case OP_POSQUERY:
584        set_bit(start_bits, tcode[1], caseless, cd);        set_bit(start_bits, tcode[1], caseless, cd);
585        tcode += 2;        tcode += 2;
586  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
587        if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;        if (utf8 && tcode[-1] >= 0xc0)
588            tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
589  #endif  #endif
590        break;        break;
591    
# Line 181  do Line 593  do
593    
594        case OP_UPTO:        case OP_UPTO:
595        case OP_MINUPTO:        case OP_MINUPTO:
596          case OP_POSUPTO:
597        set_bit(start_bits, tcode[3], caseless, cd);        set_bit(start_bits, tcode[3], caseless, cd);
598        tcode += 4;        tcode += 4;
599  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
600        if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;        if (utf8 && tcode[-1] >= 0xc0)
601            tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
602  #endif  #endif
603        break;        break;
604    
# Line 197  do Line 611  do
611        case OP_CHARNC:        case OP_CHARNC:
612        case OP_PLUS:        case OP_PLUS:
613        case OP_MINPLUS:        case OP_MINPLUS:
614          case OP_POSPLUS:
615        set_bit(start_bits, tcode[1], caseless, cd);        set_bit(start_bits, tcode[1], caseless, cd);
616        try_next = FALSE;        try_next = FALSE;
617        break;        break;
# Line 215  do Line 630  do
630        try_next = FALSE;        try_next = FALSE;
631        break;        break;
632    
633          /* The cbit_space table has vertical tab as whitespace; we have to
634          discard it. */
635    
636        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
637        for (c = 0; c < 32; c++)        for (c = 0; c < 32; c++)
638          start_bits[c] |= ~cd->cbits[c+cbit_space];          {
639            int d = cd->cbits[c+cbit_space];
640            if (c == 1) d &= ~0x08;
641            start_bits[c] |= ~d;
642            }
643        try_next = FALSE;        try_next = FALSE;
644        break;        break;
645    
646          /* The cbit_space table has vertical tab as whitespace; we have to
647          discard it. */
648    
649        case OP_WHITESPACE:        case OP_WHITESPACE:
650        for (c = 0; c < 32; c++)        for (c = 0; c < 32; c++)
651          start_bits[c] |= cd->cbits[c+cbit_space];          {
652            int d = cd->cbits[c+cbit_space];
653            if (c == 1) d &= ~0x08;
654            start_bits[c] |= d;
655            }
656        try_next = FALSE;        try_next = FALSE;
657        break;        break;
658    
# Line 256  do Line 685  do
685    
686        case OP_TYPEUPTO:        case OP_TYPEUPTO:
687        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
688          case OP_TYPEPOSUPTO:
689        tcode += 2;               /* Fall through */        tcode += 2;               /* Fall through */
690    
691        case OP_TYPESTAR:        case OP_TYPESTAR:
692        case OP_TYPEMINSTAR:        case OP_TYPEMINSTAR:
693          case OP_TYPEPOSSTAR:
694        case OP_TYPEQUERY:        case OP_TYPEQUERY:
695        case OP_TYPEMINQUERY:        case OP_TYPEMINQUERY:
696          case OP_TYPEPOSQUERY:
697        switch(tcode[1])        switch(tcode[1])
698          {          {
699          case OP_ANY:          case OP_ANY:
700          return FALSE;          case OP_ALLANY:
701            return SSB_FAIL;
702    
703          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
704          for (c = 0; c < 32; c++)          for (c = 0; c < 32; c++)
# Line 277  do Line 710  do
710            start_bits[c] |= cd->cbits[c+cbit_digit];            start_bits[c] |= cd->cbits[c+cbit_digit];
711          break;          break;
712    
713            /* The cbit_space table has vertical tab as whitespace; we have to
714            discard it. */
715    
716          case OP_NOT_WHITESPACE:          case OP_NOT_WHITESPACE:
717          for (c = 0; c < 32; c++)          for (c = 0; c < 32; c++)
718            start_bits[c] |= ~cd->cbits[c+cbit_space];            {
719              int d = cd->cbits[c+cbit_space];
720              if (c == 1) d &= ~0x08;
721              start_bits[c] |= ~d;
722              }
723          break;          break;
724    
725            /* The cbit_space table has vertical tab as whitespace; we have to
726            discard it. */
727    
728          case OP_WHITESPACE:          case OP_WHITESPACE:
729          for (c = 0; c < 32; c++)          for (c = 0; c < 32; c++)
730            start_bits[c] |= cd->cbits[c+cbit_space];            {
731              int d = cd->cbits[c+cbit_space];
732              if (c == 1) d &= ~0x08;
733              start_bits[c] |= d;
734              }
735          break;          break;
736    
737          case OP_NOT_WORDCHAR:          case OP_NOT_WORDCHAR:
# Line 308  do Line 755  do
755        character with a value > 255. */        character with a value > 255. */
756    
757        case OP_NCLASS:        case OP_NCLASS:
758    #ifdef SUPPORT_UTF8
759        if (utf8)        if (utf8)
760          {          {
761          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
762          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
763          }          }
764    #endif
765        /* Fall through */        /* Fall through */
766    
767        case OP_CLASS:        case OP_CLASS:
# Line 325  do Line 774  do
774          value is > 127. In fact, there are only two possible starting bytes for          value is > 127. In fact, there are only two possible starting bytes for
775          characters in the range 128 - 255. */          characters in the range 128 - 255. */
776    
777    #ifdef SUPPORT_UTF8
778          if (utf8)          if (utf8)
779            {            {
780            for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];            for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
# Line 342  do Line 792  do
792          /* In non-UTF-8 mode, the two bit maps are completely compatible. */          /* In non-UTF-8 mode, the two bit maps are completely compatible. */
793    
794          else          else
795    #endif
796            {            {
797            for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];            for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
798            }            }
# Line 377  do Line 828  do
828    code += GET(code, 1);   /* Advance to next branch */    code += GET(code, 1);   /* Advance to next branch */
829    }    }
830  while (*code == OP_ALT);  while (*code == OP_ALT);
831  return TRUE;  return yield;
832  }  }
833    
834    
# Line 397  Arguments: Line 848  Arguments:
848              set NULL unless error              set NULL unless error
849    
850  Returns:    pointer to a pcre_extra block, with study_data filled in and the  Returns:    pointer to a pcre_extra block, with study_data filled in and the
851                appropriate flag set;                appropriate flags set;
852              NULL on error or if no optimization possible              NULL on error or if no optimization possible
853  */  */
854    
855  PCRE_EXPORT pcre_extra *  PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
856  pcre_study(const pcre *external_re, int options, const char **errorptr)  pcre_study(const pcre *external_re, int options, const char **errorptr)
857  {  {
858    int min;
859    BOOL bits_set = FALSE;
860  uschar start_bits[32];  uschar start_bits[32];
861  pcre_extra *extra;  pcre_extra *extra;
862  pcre_study_data *study;  pcre_study_data *study;
863  const uschar *tables;  const uschar *tables;
864  const real_pcre *re = (const real_pcre *)external_re;  uschar *code;
 uschar *code = (uschar *)re + re->name_table_offset +  
   (re->name_count * re->name_entry_size);  
865  compile_data compile_block;  compile_data compile_block;
866    const real_pcre *re = (const real_pcre *)external_re;
867    
868  *errorptr = NULL;  *errorptr = NULL;
869    
# Line 427  if ((options & ~PUBLIC_STUDY_OPTIONS) != Line 879  if ((options & ~PUBLIC_STUDY_OPTIONS) !=
879    return NULL;    return NULL;
880    }    }
881    
882    code = (uschar *)re + re->name_table_offset +
883      (re->name_count * re->name_entry_size);
884    
885  /* For an anchored pattern, or an unanchored pattern that has a first char, or  /* For an anchored pattern, or an unanchored pattern that has a first char, or
886  a multiline pattern that matches only at "line starts", no further processing  a multiline pattern that matches only at "line starts", there is no point in
887  at present. */  seeking a list of starting bytes. */
888    
889  if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)  if ((re->options & PCRE_ANCHORED) == 0 &&
890    return NULL;      (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
891      {
892      /* Set the character tables in the block that is passed around */
893    
894      tables = re->tables;
895      if (tables == NULL)
896        (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
897        (void *)(&tables));
898    
899      compile_block.lcc = tables + lcc_offset;
900      compile_block.fcc = tables + fcc_offset;
901      compile_block.cbits = tables + cbits_offset;
902      compile_block.ctypes = tables + ctypes_offset;
903    
904      /* See if we can find a fixed set of initial characters for the pattern. */
905    
906      memset(start_bits, 0, 32 * sizeof(uschar));
907      bits_set = set_start_bits(code, start_bits,
908        (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,
909        &compile_block) == SSB_DONE;
910      }
911    
912    /* Find the minimum length of subject string. */
913    
914    min = find_minlength(code, code, re->options);
915    
916  /* Set the character tables in the block that is passed around */  /* Return NULL if no optimization is possible. */
917    
918  tables = re->tables;  if (!bits_set && min < 0) return NULL;
 if (tables == NULL)  
   (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,  
   (void *)(&tables));  
   
 compile_block.lcc = tables + lcc_offset;  
 compile_block.fcc = tables + fcc_offset;  
 compile_block.cbits = tables + cbits_offset;  
 compile_block.ctypes = tables + ctypes_offset;  
   
 /* See if we can find a fixed set of initial characters for the pattern. */  
   
 memset(start_bits, 0, 32 * sizeof(uschar));  
 if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,  
   (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;  
919    
920  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
921  the latter, which is pointed to by the former, which may also get additional  the latter, which is pointed to by the former, which may also get additional
# Line 473  extra->flags = PCRE_EXTRA_STUDY_DATA; Line 938  extra->flags = PCRE_EXTRA_STUDY_DATA;
938  extra->study_data = study;  extra->study_data = study;
939    
940  study->size = sizeof(pcre_study_data);  study->size = sizeof(pcre_study_data);
941  study->options = PCRE_STUDY_MAPPED;  study->flags = 0;
942  memcpy(study->start_bits, start_bits, sizeof(start_bits));  
943    if (bits_set)
944      {
945      study->flags |= PCRE_STUDY_MAPPED;
946      memcpy(study->start_bits, start_bits, sizeof(start_bits));
947      }
948    
949    if (min >= 0)
950      {
951      study->flags |= PCRE_STUDY_MINLEN;
952      study->minlength = min;
953      }
954    
955  return extra;  return extra;
956  }  }

Legend:
Removed from v.85  
changed lines
  Added in v.455

  ViewVC Help
Powered by ViewVC 1.1.5