/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 107 by ph10, Wed Mar 7 11:02:28 2007 UTC revision 613 by ph10, Sat Jul 2 16:59:52 2011 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2006 University of Cambridge             Copyright (c) 1997-2010 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting functions. */  supporting functions. */
43    
44    
45    #ifdef HAVE_CONFIG_H
46    #include "config.h"
47    #endif
48    
49  #include "pcre_internal.h"  #include "pcre_internal.h"
50    
51    #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
52    
53  /* Returns from set_start_bits() */  /* Returns from set_start_bits() */
54    
55  enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };  enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
56    
57    
58    
59    /*************************************************
60    *   Find the minimum subject length for a group  *
61    *************************************************/
62    
63    /* Scan a parenthesized group and compute the minimum length of subject that
64    is needed to match it. This is a lower bound; it does not mean there is a
65    string of that length that matches. In UTF8 mode, the result is in characters
66    rather than bytes.
67    
68    Arguments:
69      code        pointer to start of group (the bracket)
70      startcode   pointer to start of the whole pattern
71      options     the compiling options
72      had_accept  pointer to flag for (*ACCEPT) encountered
73    
74    Returns:   the minimum length
75               -1 if \C was encountered
76               -2 internal error (missing capturing bracket)
77               -3 internal error (opcode not listed)
78    */
79    
80    static int
81    find_minlength(const uschar *code, const uschar *startcode, int options,
82      BOOL *had_accept_ptr)
83    {
84    int length = -1;
85    BOOL utf8 = (options & PCRE_UTF8) != 0;
86    BOOL had_recurse = FALSE;
87    register int branchlength = 0;
88    register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
89    
90    if (*code == OP_CBRA || *code == OP_SCBRA ||
91        *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += 2;
92    
93    /* Scan along the opcodes for this branch. If we get to the end of the
94    branch, check the length against that of the other branches. */
95    
96    for (;;)
97      {
98      int d, min;
99      uschar *cs, *ce;
100      register int op = *cc;
101    
102      switch (op)
103        {
104        case OP_COND:
105        case OP_SCOND:
106    
107        /* If there is only one branch in a condition, the implied branch has zero
108        length, so we don't add anything. This covers the DEFINE "condition"
109        automatically. */
110    
111        cs = cc + GET(cc, 1);
112        if (*cs != OP_ALT)
113          {
114          cc = cs + 1 + LINK_SIZE;
115          break;
116          }
117    
118        /* Otherwise we can fall through and treat it the same as any other
119        subpattern. */
120    
121        case OP_CBRA:
122        case OP_SCBRA:
123        case OP_BRA:
124        case OP_SBRA:
125        case OP_CBRAPOS:
126        case OP_SCBRAPOS:
127        case OP_BRAPOS:
128        case OP_SBRAPOS:
129        case OP_ONCE:
130        d = find_minlength(cc, startcode, options, had_accept_ptr);
131        if (d < 0) return d;
132        branchlength += d;
133        if (*had_accept_ptr) return branchlength;
134        do cc += GET(cc, 1); while (*cc == OP_ALT);
135        cc += 1 + LINK_SIZE;
136        break;
137    
138        /* Reached end of a branch; if it's a ket it is the end of a nested
139        call. If it's ALT it is an alternation in a nested call. If it is END it's
140        the end of the outer call. All can be handled by the same code. If it is
141        ACCEPT, it is essentially the same as END, but we set a flag so that
142        counting stops. */
143    
144        case OP_ACCEPT:
145        case OP_ASSERT_ACCEPT:
146        *had_accept_ptr = TRUE;
147        /* Fall through */
148        case OP_ALT:
149        case OP_KET:
150        case OP_KETRMAX:
151        case OP_KETRMIN:
152        case OP_KETRPOS:
153        case OP_END:
154        if (length < 0 || (!had_recurse && branchlength < length))
155          length = branchlength;
156        if (op != OP_ALT) return length;
157        cc += 1 + LINK_SIZE;
158        branchlength = 0;
159        had_recurse = FALSE;
160        break;
161    
162        /* Skip over assertive subpatterns */
163    
164        case OP_ASSERT:
165        case OP_ASSERT_NOT:
166        case OP_ASSERTBACK:
167        case OP_ASSERTBACK_NOT:
168        do cc += GET(cc, 1); while (*cc == OP_ALT);
169        /* Fall through */
170    
171        /* Skip over things that don't match chars */
172    
173        case OP_REVERSE:
174        case OP_CREF:
175        case OP_NCREF:
176        case OP_RREF:
177        case OP_NRREF:
178        case OP_DEF:
179        case OP_CALLOUT:
180        case OP_SOD:
181        case OP_SOM:
182        case OP_EOD:
183        case OP_EODN:
184        case OP_CIRC:
185        case OP_CIRCM:
186        case OP_DOLL:
187        case OP_DOLLM:
188        case OP_NOT_WORD_BOUNDARY:
189        case OP_WORD_BOUNDARY:
190        cc += _pcre_OP_lengths[*cc];
191        break;
192    
193        /* Skip over a subpattern that has a {0} or {0,x} quantifier */
194    
195        case OP_BRAZERO:
196        case OP_BRAMINZERO:
197        case OP_BRAPOSZERO:
198        case OP_SKIPZERO:
199        cc += _pcre_OP_lengths[*cc];
200        do cc += GET(cc, 1); while (*cc == OP_ALT);
201        cc += 1 + LINK_SIZE;
202        break;
203    
204        /* Handle literal characters and + repetitions */
205    
206        case OP_CHAR:
207        case OP_CHARI:
208        case OP_NOT:
209        case OP_NOTI:
210        case OP_PLUS:
211        case OP_PLUSI:
212        case OP_MINPLUS:
213        case OP_MINPLUSI:
214        case OP_POSPLUS:
215        case OP_POSPLUSI:
216        case OP_NOTPLUS:
217        case OP_NOTPLUSI:
218        case OP_NOTMINPLUS:
219        case OP_NOTMINPLUSI:
220        case OP_NOTPOSPLUS:
221        case OP_NOTPOSPLUSI:
222        branchlength++;
223        cc += 2;
224    #ifdef SUPPORT_UTF8
225        if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
226    #endif
227        break;
228    
229        case OP_TYPEPLUS:
230        case OP_TYPEMINPLUS:
231        case OP_TYPEPOSPLUS:
232        branchlength++;
233        cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
234        break;
235    
236        /* Handle exact repetitions. The count is already in characters, but we
237        need to skip over a multibyte character in UTF8 mode.  */
238    
239        case OP_EXACT:
240        case OP_EXACTI:
241        case OP_NOTEXACT:
242        case OP_NOTEXACTI:
243        branchlength += GET2(cc,1);
244        cc += 4;
245    #ifdef SUPPORT_UTF8
246        if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
247    #endif
248        break;
249    
250        case OP_TYPEEXACT:
251        branchlength += GET2(cc,1);
252        cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
253        break;
254    
255        /* Handle single-char non-literal matchers */
256    
257        case OP_PROP:
258        case OP_NOTPROP:
259        cc += 2;
260        /* Fall through */
261    
262        case OP_NOT_DIGIT:
263        case OP_DIGIT:
264        case OP_NOT_WHITESPACE:
265        case OP_WHITESPACE:
266        case OP_NOT_WORDCHAR:
267        case OP_WORDCHAR:
268        case OP_ANY:
269        case OP_ALLANY:
270        case OP_EXTUNI:
271        case OP_HSPACE:
272        case OP_NOT_HSPACE:
273        case OP_VSPACE:
274        case OP_NOT_VSPACE:
275        branchlength++;
276        cc++;
277        break;
278    
279        /* "Any newline" might match two characters, but it also might match just
280        one. */
281    
282        case OP_ANYNL:
283        branchlength += 1;
284        cc++;
285        break;
286    
287        /* The single-byte matcher means we can't proceed in UTF-8 mode */
288    
289        case OP_ANYBYTE:
290    #ifdef SUPPORT_UTF8
291        if (utf8) return -1;
292    #endif
293        branchlength++;
294        cc++;
295        break;
296    
297        /* For repeated character types, we have to test for \p and \P, which have
298        an extra two bytes of parameters. */
299    
300        case OP_TYPESTAR:
301        case OP_TYPEMINSTAR:
302        case OP_TYPEQUERY:
303        case OP_TYPEMINQUERY:
304        case OP_TYPEPOSSTAR:
305        case OP_TYPEPOSQUERY:
306        if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
307        cc += _pcre_OP_lengths[op];
308        break;
309    
310        case OP_TYPEUPTO:
311        case OP_TYPEMINUPTO:
312        case OP_TYPEPOSUPTO:
313        if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
314        cc += _pcre_OP_lengths[op];
315        break;
316    
317        /* Check a class for variable quantification */
318    
319    #ifdef SUPPORT_UTF8
320        case OP_XCLASS:
321        cc += GET(cc, 1) - 33;
322        /* Fall through */
323    #endif
324    
325        case OP_CLASS:
326        case OP_NCLASS:
327        cc += 33;
328    
329        switch (*cc)
330          {
331          case OP_CRPLUS:
332          case OP_CRMINPLUS:
333          branchlength++;
334          /* Fall through */
335    
336          case OP_CRSTAR:
337          case OP_CRMINSTAR:
338          case OP_CRQUERY:
339          case OP_CRMINQUERY:
340          cc++;
341          break;
342    
343          case OP_CRRANGE:
344          case OP_CRMINRANGE:
345          branchlength += GET2(cc,1);
346          cc += 5;
347          break;
348    
349          default:
350          branchlength++;
351          break;
352          }
353        break;
354    
355        /* Backreferences and subroutine calls are treated in the same way: we find
356        the minimum length for the subpattern. A recursion, however, causes an
357        a flag to be set that causes the length of this branch to be ignored. The
358        logic is that a recursion can only make sense if there is another
359        alternation that stops the recursing. That will provide the minimum length
360        (when no recursion happens). A backreference within the group that it is
361        referencing behaves in the same way.
362    
363        If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
364        matches an empty string (by default it causes a matching failure), so in
365        that case we must set the minimum length to zero. */
366    
367        case OP_REF:
368        case OP_REFI:
369        if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
370          {
371          ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
372          if (cs == NULL) return -2;
373          do ce += GET(ce, 1); while (*ce == OP_ALT);
374          if (cc > cs && cc < ce)
375            {
376            d = 0;
377            had_recurse = TRUE;
378            }
379          else
380            {
381            d = find_minlength(cs, startcode, options, had_accept_ptr);
382            *had_accept_ptr = FALSE;
383            }
384          }
385        else d = 0;
386        cc += 3;
387    
388        /* Handle repeated back references */
389    
390        switch (*cc)
391          {
392          case OP_CRSTAR:
393          case OP_CRMINSTAR:
394          case OP_CRQUERY:
395          case OP_CRMINQUERY:
396          min = 0;
397          cc++;
398          break;
399    
400          case OP_CRPLUS:
401          case OP_CRMINPLUS:
402          min = 1;
403          cc++;
404          break;
405    
406          case OP_CRRANGE:
407          case OP_CRMINRANGE:
408          min = GET2(cc, 1);
409          cc += 5;
410          break;
411    
412          default:
413          min = 1;
414          break;
415          }
416    
417        branchlength += min * d;
418        break;
419    
420        case OP_RECURSE:
421        cs = ce = (uschar *)startcode + GET(cc, 1);
422        if (cs == NULL) return -2;
423        do ce += GET(ce, 1); while (*ce == OP_ALT);
424        if (cc > cs && cc < ce)
425          had_recurse = TRUE;
426        else
427          {
428          branchlength += find_minlength(cs, startcode, options, had_accept_ptr);
429          *had_accept_ptr = FALSE;
430          }
431        cc += 1 + LINK_SIZE;
432        break;
433    
434        /* Anything else does not or need not match a character. We can get the
435        item's length from the table, but for those that can match zero occurrences
436        of a character, we must take special action for UTF-8 characters. As it
437        happens, the "NOT" versions of these opcodes are used at present only for
438        ASCII characters, so they could be omitted from this list. However, in
439        future that may change, so we include them here so as not to leave a
440        gotcha for a future maintainer. */
441    
442        case OP_UPTO:
443        case OP_UPTOI:
444        case OP_NOTUPTO:
445        case OP_NOTUPTOI:
446        case OP_MINUPTO:
447        case OP_MINUPTOI:
448        case OP_NOTMINUPTO:
449        case OP_NOTMINUPTOI:
450        case OP_POSUPTO:
451        case OP_POSUPTOI:
452        case OP_NOTPOSUPTO:
453        case OP_NOTPOSUPTOI:
454    
455        case OP_STAR:
456        case OP_STARI:
457        case OP_NOTSTAR:
458        case OP_NOTSTARI:
459        case OP_MINSTAR:
460        case OP_MINSTARI:
461        case OP_NOTMINSTAR:
462        case OP_NOTMINSTARI:
463        case OP_POSSTAR:
464        case OP_POSSTARI:
465        case OP_NOTPOSSTAR:
466        case OP_NOTPOSSTARI:
467    
468        case OP_QUERY:
469        case OP_QUERYI:
470        case OP_NOTQUERY:
471        case OP_NOTQUERYI:
472        case OP_MINQUERY:
473        case OP_MINQUERYI:
474        case OP_NOTMINQUERY:
475        case OP_NOTMINQUERYI:
476        case OP_POSQUERY:
477        case OP_POSQUERYI:
478        case OP_NOTPOSQUERY:
479        case OP_NOTPOSQUERYI:
480    
481        cc += _pcre_OP_lengths[op];
482    #ifdef SUPPORT_UTF8
483        if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
484    #endif
485        break;
486    
487        /* Skip these, but we need to add in the name length. */
488    
489        case OP_MARK:
490        case OP_PRUNE_ARG:
491        case OP_SKIP_ARG:
492        cc += _pcre_OP_lengths[op] + cc[1];
493        break;
494    
495        case OP_THEN_ARG:
496        cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
497        break;
498    
499        /* The remaining opcodes are just skipped over. */
500    
501        case OP_CLOSE:
502        case OP_COMMIT:
503        case OP_FAIL:
504        case OP_PRUNE:
505        case OP_SET_SOM:
506        case OP_SKIP:
507        case OP_THEN:
508        cc += _pcre_OP_lengths[op];
509        break;
510    
511        /* This should not occur: we list all opcodes explicitly so that when
512        new ones get added they are properly considered. */
513    
514        default:
515        return -3;
516        }
517      }
518    /* Control never gets here */
519    }
520    
521    
522    
523  /*************************************************  /*************************************************
524  *      Set a bit and maybe its alternate case    *  *      Set a bit and maybe its alternate case    *
525  *************************************************/  *************************************************/
526    
527  /* Given a character, set its bit in the table, and also the bit for the other  /* Given a character, set its first byte's bit in the table, and also the
528  version of a letter if we are caseless.  corresponding bit for the other version of a letter if we are caseless. In
529    UTF-8 mode, for characters greater than 127, we can only do the caseless thing
530    when Unicode property support is available.
531    
532  Arguments:  Arguments:
533    start_bits    points to the bit map    start_bits    points to the bit map
534    c             is the character    p             points to the character
535    caseless      the caseless flag    caseless      the caseless flag
536    cd            the block with char table pointers    cd            the block with char table pointers
537      utf8          TRUE for UTF-8 mode
538    
539    Returns:        pointer after the character
540    */
541    
542    static const uschar *
543    set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
544      compile_data *cd, BOOL utf8)
545    {
546    unsigned int c = *p;
547    
548    SET_BIT(c);
549    
550    #ifdef SUPPORT_UTF8
551    if (utf8 && c > 127)
552      {
553      GETCHARINC(c, p);
554    #ifdef SUPPORT_UCP
555      if (caseless)
556        {
557        uschar buff[8];
558        c = UCD_OTHERCASE(c);
559        (void)_pcre_ord2utf8(c, buff);
560        SET_BIT(buff[0]);
561        }
562    #endif
563      return p;
564      }
565    #endif
566    
567    /* Not UTF-8 mode, or character is less than 127. */
568    
569    if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
570    return p + 1;
571    }
572    
573    
574    
575    /*************************************************
576    *     Set bits for a positive character type     *
577    *************************************************/
578    
579    /* This function sets starting bits for a character type. In UTF-8 mode, we can
580    only do a direct setting for bytes less than 128, as otherwise there can be
581    confusion with bytes in the middle of UTF-8 characters. In a "traditional"
582    environment, the tables will only recognize ASCII characters anyway, but in at
583    least one Windows environment, some higher bytes bits were set in the tables.
584    So we deal with that case by considering the UTF-8 encoding.
585    
586    Arguments:
587      start_bits     the starting bitmap
588      cbit type      the type of character wanted
589      table_limit    32 for non-UTF-8; 16 for UTF-8
590      cd             the block with char table pointers
591    
592  Returns:        nothing  Returns:         nothing
593  */  */
594    
595  static void  static void
596  set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)  set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
597      compile_data *cd)
598  {  {
599  start_bits[c/8] |= (1 << (c&7));  register int c;
600  if (caseless && (cd->ctypes[c] & ctype_letter) != 0)  for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
601    start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));  if (table_limit == 32) return;
602    for (c = 128; c < 256; c++)
603      {
604      if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
605        {
606        uschar buff[8];
607        (void)_pcre_ord2utf8(c, buff);
608        SET_BIT(buff[0]);
609        }
610      }
611    }
612    
613    
614    /*************************************************
615    *     Set bits for a negative character type     *
616    *************************************************/
617    
618    /* This function sets starting bits for a negative character type such as \D.
619    In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
620    otherwise there can be confusion with bytes in the middle of UTF-8 characters.
621    Unlike in the positive case, where we can set appropriate starting bits for
622    specific high-valued UTF-8 characters, in this case we have to set the bits for
623    all high-valued characters. The lowest is 0xc2, but we overkill by starting at
624    0xc0 (192) for simplicity.
625    
626    Arguments:
627      start_bits     the starting bitmap
628      cbit type      the type of character wanted
629      table_limit    32 for non-UTF-8; 16 for UTF-8
630      cd             the block with char table pointers
631    
632    Returns:         nothing
633    */
634    
635    static void
636    set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
637      compile_data *cd)
638    {
639    register int c;
640    for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
641    if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
642  }  }
643    
644    
# Line 91  function fails unless the result is SSB_ Line 658  function fails unless the result is SSB_
658  Arguments:  Arguments:
659    code         points to an expression    code         points to an expression
660    start_bits   points to a 32-byte table, initialized to 0    start_bits   points to a 32-byte table, initialized to 0
   caseless     the current state of the caseless flag  
661    utf8         TRUE if in UTF-8 mode    utf8         TRUE if in UTF-8 mode
662    cd           the block with char table pointers    cd           the block with char table pointers
663    
664  Returns:       SSB_FAIL     => Failed to find any starting bytes  Returns:       SSB_FAIL     => Failed to find any starting bytes
665                 SSB_DONE     => Found mandatory starting bytes                 SSB_DONE     => Found mandatory starting bytes
666                 SSB_CONTINUE => Found optional starting bytes                 SSB_CONTINUE => Found optional starting bytes
667                   SSB_UNKNOWN  => Hit an unrecognized opcode
668  */  */
669    
670  static int  static int
671  set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,  set_start_bits(const uschar *code, uschar *start_bits, BOOL utf8,
672    BOOL utf8, compile_data *cd)    compile_data *cd)
673  {  {
674  register int c;  register int c;
675  int yield = SSB_DONE;  int yield = SSB_DONE;
676    int table_limit = utf8? 16:32;
677    
678  #if 0  #if 0
679  /* ========================================================================= */  /* ========================================================================= */
# Line 126  volatile int dummy; Line 694  volatile int dummy;
694    
695  do  do
696    {    {
   const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;  
697    BOOL try_next = TRUE;    BOOL try_next = TRUE;
698      const uschar *tcode = code + 1 + LINK_SIZE;
699    
700      if (*code == OP_CBRA || *code == OP_SCBRA ||
701          *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += 2;
702    
703    while (try_next)    /* Loop for items in this branch */    while (try_next)    /* Loop for items in this branch */
704      {      {
705      int rc;      int rc;
706    
707      switch(*tcode)      switch(*tcode)
708        {        {
709        /* Fail if we reach something we don't understand */        /* If we reach something we don't understand, it means a new opcode has
710          been created that hasn't been added to this code. Hopefully this problem
711          will be discovered during testing. */
712    
713        default:        default:
714          return SSB_UNKNOWN;
715    
716          /* Fail for a valid opcode that implies no starting bits. */
717    
718          case OP_ACCEPT:
719          case OP_ASSERT_ACCEPT:
720          case OP_ALLANY:
721          case OP_ANY:
722          case OP_ANYBYTE:
723          case OP_CIRC:
724          case OP_CIRCM:
725          case OP_CLOSE:
726          case OP_COMMIT:
727          case OP_COND:
728          case OP_CREF:
729          case OP_DEF:
730          case OP_DOLL:
731          case OP_DOLLM:
732          case OP_END:
733          case OP_EOD:
734          case OP_EODN:
735          case OP_EXTUNI:
736          case OP_FAIL:
737          case OP_MARK:
738          case OP_NCREF:
739          case OP_NOT:
740          case OP_NOTEXACT:
741          case OP_NOTEXACTI:
742          case OP_NOTI:
743          case OP_NOTMINPLUS:
744          case OP_NOTMINPLUSI:
745          case OP_NOTMINQUERY:
746          case OP_NOTMINQUERYI:
747          case OP_NOTMINSTAR:
748          case OP_NOTMINSTARI:
749          case OP_NOTMINUPTO:
750          case OP_NOTMINUPTOI:
751          case OP_NOTPLUS:
752          case OP_NOTPLUSI:
753          case OP_NOTPOSPLUS:
754          case OP_NOTPOSPLUSI:
755          case OP_NOTPOSQUERY:
756          case OP_NOTPOSQUERYI:
757          case OP_NOTPOSSTAR:
758          case OP_NOTPOSSTARI:
759          case OP_NOTPOSUPTO:
760          case OP_NOTPOSUPTOI:
761          case OP_NOTPROP:
762          case OP_NOTQUERY:
763          case OP_NOTQUERYI:
764          case OP_NOTSTAR:
765          case OP_NOTSTARI:
766          case OP_NOTUPTO:
767          case OP_NOTUPTOI:
768          case OP_NOT_HSPACE:
769          case OP_NOT_VSPACE:
770          case OP_NOT_WORD_BOUNDARY:
771          case OP_NRREF:
772          case OP_PROP:
773          case OP_PRUNE:
774          case OP_PRUNE_ARG:
775          case OP_RECURSE:
776          case OP_REF:
777          case OP_REFI:
778          case OP_REVERSE:
779          case OP_RREF:
780          case OP_SCOND:
781          case OP_SET_SOM:
782          case OP_SKIP:
783          case OP_SKIP_ARG:
784          case OP_SOD:
785          case OP_SOM:
786          case OP_THEN:
787          case OP_THEN_ARG:
788          case OP_WORD_BOUNDARY:
789          case OP_XCLASS:
790        return SSB_FAIL;        return SSB_FAIL;
791    
792        /* If we hit a bracket or a positive lookahead assertion, recurse to set        /* If we hit a bracket or a positive lookahead assertion, recurse to set
# Line 148  do Line 798  do
798        case OP_SBRA:        case OP_SBRA:
799        case OP_CBRA:        case OP_CBRA:
800        case OP_SCBRA:        case OP_SCBRA:
801          case OP_BRAPOS:
802          case OP_SBRAPOS:
803          case OP_CBRAPOS:
804          case OP_SCBRAPOS:
805        case OP_ONCE:        case OP_ONCE:
806        case OP_ASSERT:        case OP_ASSERT:
807        rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);        rc = set_start_bits(tcode, start_bits, utf8, cd);
808        if (rc == SSB_FAIL) return SSB_FAIL;        if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
809        if (rc == SSB_DONE) try_next = FALSE; else        if (rc == SSB_DONE) try_next = FALSE; else
810          {          {
811          do tcode += GET(tcode, 1); while (*tcode == OP_ALT);          do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
# Line 174  do Line 828  do
828        case OP_KET:        case OP_KET:
829        case OP_KETRMAX:        case OP_KETRMAX:
830        case OP_KETRMIN:        case OP_KETRMIN:
831          case OP_KETRPOS:
832        return SSB_CONTINUE;        return SSB_CONTINUE;
833    
834        /* Skip over callout */        /* Skip over callout */
# Line 191  do Line 846  do
846        tcode += 1 + LINK_SIZE;        tcode += 1 + LINK_SIZE;
847        break;        break;
848    
       /* Skip over an option setting, changing the caseless flag */  
   
       case OP_OPT:  
       caseless = (tcode[1] & PCRE_CASELESS) != 0;  
       tcode += 2;  
       break;  
   
849        /* BRAZERO does the bracket, but carries on. */        /* BRAZERO does the bracket, but carries on. */
850    
851        case OP_BRAZERO:        case OP_BRAZERO:
852        case OP_BRAMINZERO:        case OP_BRAMINZERO:
853        if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)        case OP_BRAPOSZERO:
854          return SSB_FAIL;        rc = set_start_bits(++tcode, start_bits, utf8, cd);
855          if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
856  /* =========================================================================  /* =========================================================================
857        See the comment at the head of this function concerning the next line,        See the comment at the head of this function concerning the next line,
858        which was an old fudge for the benefit of OS/2.        which was an old fudge for the benefit of OS/2.
# Line 213  do Line 862  do
862        tcode += 1 + LINK_SIZE;        tcode += 1 + LINK_SIZE;
863        break;        break;
864    
865          /* SKIPZERO skips the bracket. */
866    
867          case OP_SKIPZERO:
868          tcode++;
869          do tcode += GET(tcode,1); while (*tcode == OP_ALT);
870          tcode += 1 + LINK_SIZE;
871          break;
872    
873        /* Single-char * or ? sets the bit and tries the next item */        /* Single-char * or ? sets the bit and tries the next item */
874    
875        case OP_STAR:        case OP_STAR:
# Line 221  do Line 878  do
878        case OP_QUERY:        case OP_QUERY:
879        case OP_MINQUERY:        case OP_MINQUERY:
880        case OP_POSQUERY:        case OP_POSQUERY:
881        set_bit(start_bits, tcode[1], caseless, cd);        tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
882        tcode += 2;        break;
883  #ifdef SUPPORT_UTF8  
884        if (utf8 && tcode[-1] >= 0xc0)        case OP_STARI:
885          tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];        case OP_MINSTARI:
886  #endif        case OP_POSSTARI:
887          case OP_QUERYI:
888          case OP_MINQUERYI:
889          case OP_POSQUERYI:
890          tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
891        break;        break;
892    
893        /* Single-char upto sets the bit and tries the next */        /* Single-char upto sets the bit and tries the next */
# Line 234  do Line 895  do
895        case OP_UPTO:        case OP_UPTO:
896        case OP_MINUPTO:        case OP_MINUPTO:
897        case OP_POSUPTO:        case OP_POSUPTO:
898        set_bit(start_bits, tcode[3], caseless, cd);        tcode = set_table_bit(start_bits, tcode + 3, FALSE, cd, utf8);
899        tcode += 4;        break;
900  #ifdef SUPPORT_UTF8  
901        if (utf8 && tcode[-1] >= 0xc0)        case OP_UPTOI:
902          tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];        case OP_MINUPTOI:
903  #endif        case OP_POSUPTOI:
904          tcode = set_table_bit(start_bits, tcode + 3, TRUE, cd, utf8);
905        break;        break;
906    
907        /* At least one single char sets the bit and stops */        /* At least one single char sets the bit and stops */
908    
909        case OP_EXACT:       /* Fall through */        case OP_EXACT:
910        tcode += 2;        tcode += 2;
911          /* Fall through */
912        case OP_CHAR:        case OP_CHAR:
       case OP_CHARNC:  
913        case OP_PLUS:        case OP_PLUS:
914        case OP_MINPLUS:        case OP_MINPLUS:
915        case OP_POSPLUS:        case OP_POSPLUS:
916        set_bit(start_bits, tcode[1], caseless, cd);        (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
917          try_next = FALSE;
918          break;
919    
920          case OP_EXACTI:
921          tcode += 2;
922          /* Fall through */
923          case OP_CHARI:
924          case OP_PLUSI:
925          case OP_MINPLUSI:
926          case OP_POSPLUSI:
927          (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
928          try_next = FALSE;
929          break;
930    
931          /* Special spacing and line-terminating items. These recognize specific
932          lists of characters. The difference between VSPACE and ANYNL is that the
933          latter can match the two-character CRLF sequence, but that is not
934          relevant for finding the first character, so their code here is
935          identical. */
936    
937          case OP_HSPACE:
938          SET_BIT(0x09);
939          SET_BIT(0x20);
940          if (utf8)
941            {
942            SET_BIT(0xC2);  /* For U+00A0 */
943            SET_BIT(0xE1);  /* For U+1680, U+180E */
944            SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
945            SET_BIT(0xE3);  /* For U+3000 */
946            }
947          else SET_BIT(0xA0);
948        try_next = FALSE;        try_next = FALSE;
949        break;        break;
950    
951        /* Single character type sets the bits and stops */        case OP_ANYNL:
952          case OP_VSPACE:
953          SET_BIT(0x0A);
954          SET_BIT(0x0B);
955          SET_BIT(0x0C);
956          SET_BIT(0x0D);
957          if (utf8)
958            {
959            SET_BIT(0xC2);  /* For U+0085 */
960            SET_BIT(0xE2);  /* For U+2028, U+2029 */
961            }
962          else SET_BIT(0x85);
963          try_next = FALSE;
964          break;
965    
966          /* Single character types set the bits and stop. Note that if PCRE_UCP
967          is set, we do not see these op codes because \d etc are converted to
968          properties. Therefore, these apply in the case when only characters less
969          than 256 are recognized to match the types. */
970    
971        case OP_NOT_DIGIT:        case OP_NOT_DIGIT:
972        for (c = 0; c < 32; c++)        set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
         start_bits[c] |= ~cd->cbits[c+cbit_digit];  
973        try_next = FALSE;        try_next = FALSE;
974        break;        break;
975    
976        case OP_DIGIT:        case OP_DIGIT:
977        for (c = 0; c < 32; c++)        set_type_bits(start_bits, cbit_digit, table_limit, cd);
         start_bits[c] |= cd->cbits[c+cbit_digit];  
978        try_next = FALSE;        try_next = FALSE;
979        break;        break;
980    
981        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to
982        discard it. */        ensure it is set as not whitespace. */
983    
984        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
985        for (c = 0; c < 32; c++)        set_nottype_bits(start_bits, cbit_space, table_limit, cd);
986          {        start_bits[1] |= 0x08;
         int d = cd->cbits[c+cbit_space];  
         if (c == 1) d &= ~0x08;  
         start_bits[c] |= ~d;  
         }  
987        try_next = FALSE;        try_next = FALSE;
988        break;        break;
989    
990        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to
991        discard it. */        not set it from the table. */
992    
993        case OP_WHITESPACE:        case OP_WHITESPACE:
994        for (c = 0; c < 32; c++)        c = start_bits[1];    /* Save in case it was already set */
995          {        set_type_bits(start_bits, cbit_space, table_limit, cd);
996          int d = cd->cbits[c+cbit_space];        start_bits[1] = (start_bits[1] & ~0x08) | c;
         if (c == 1) d &= ~0x08;  
         start_bits[c] |= d;  
         }  
997        try_next = FALSE;        try_next = FALSE;
998        break;        break;
999    
1000        case OP_NOT_WORDCHAR:        case OP_NOT_WORDCHAR:
1001        for (c = 0; c < 32; c++)        set_nottype_bits(start_bits, cbit_word, table_limit, cd);
         start_bits[c] |= ~cd->cbits[c+cbit_word];  
1002        try_next = FALSE;        try_next = FALSE;
1003        break;        break;
1004    
1005        case OP_WORDCHAR:        case OP_WORDCHAR:
1006        for (c = 0; c < 32; c++)        set_type_bits(start_bits, cbit_word, table_limit, cd);
         start_bits[c] |= cd->cbits[c+cbit_word];  
1007        try_next = FALSE;        try_next = FALSE;
1008        break;        break;
1009    
# Line 313  do Line 1012  do
1012    
1013        case OP_TYPEPLUS:        case OP_TYPEPLUS:
1014        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
1015          case OP_TYPEPOSPLUS:
1016        tcode++;        tcode++;
1017        break;        break;
1018    
# Line 336  do Line 1036  do
1036        case OP_TYPEPOSQUERY:        case OP_TYPEPOSQUERY:
1037        switch(tcode[1])        switch(tcode[1])
1038          {          {
1039            default:
1040          case OP_ANY:          case OP_ANY:
1041            case OP_ALLANY:
1042          return SSB_FAIL;          return SSB_FAIL;
1043    
1044            case OP_HSPACE:
1045            SET_BIT(0x09);
1046            SET_BIT(0x20);
1047            if (utf8)
1048              {
1049              SET_BIT(0xC2);  /* For U+00A0 */
1050              SET_BIT(0xE1);  /* For U+1680, U+180E */
1051              SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1052              SET_BIT(0xE3);  /* For U+3000 */
1053              }
1054            else SET_BIT(0xA0);
1055            break;
1056    
1057            case OP_ANYNL:
1058            case OP_VSPACE:
1059            SET_BIT(0x0A);
1060            SET_BIT(0x0B);
1061            SET_BIT(0x0C);
1062            SET_BIT(0x0D);
1063            if (utf8)
1064              {
1065              SET_BIT(0xC2);  /* For U+0085 */
1066              SET_BIT(0xE2);  /* For U+2028, U+2029 */
1067              }
1068            else SET_BIT(0x85);
1069            break;
1070    
1071          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
1072          for (c = 0; c < 32; c++)          set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
           start_bits[c] |= ~cd->cbits[c+cbit_digit];  
1073          break;          break;
1074    
1075          case OP_DIGIT:          case OP_DIGIT:
1076          for (c = 0; c < 32; c++)          set_type_bits(start_bits, cbit_digit, table_limit, cd);
           start_bits[c] |= cd->cbits[c+cbit_digit];  
1077          break;          break;
1078    
1079          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we have to
1080          discard it. */          ensure it gets set as not whitespace. */
1081    
1082          case OP_NOT_WHITESPACE:          case OP_NOT_WHITESPACE:
1083          for (c = 0; c < 32; c++)          set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1084            {          start_bits[1] |= 0x08;
           int d = cd->cbits[c+cbit_space];  
           if (c == 1) d &= ~0x08;  
           start_bits[c] |= ~d;  
           }  
1085          break;          break;
1086    
1087          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we have to
1088          discard it. */          avoid setting it. */
1089    
1090          case OP_WHITESPACE:          case OP_WHITESPACE:
1091          for (c = 0; c < 32; c++)          c = start_bits[1];    /* Save in case it was already set */
1092            {          set_type_bits(start_bits, cbit_space, table_limit, cd);
1093            int d = cd->cbits[c+cbit_space];          start_bits[1] = (start_bits[1] & ~0x08) | c;
           if (c == 1) d &= ~0x08;  
           start_bits[c] |= d;  
           }  
1094          break;          break;
1095    
1096          case OP_NOT_WORDCHAR:          case OP_NOT_WORDCHAR:
1097          for (c = 0; c < 32; c++)          set_nottype_bits(start_bits, cbit_word, table_limit, cd);
           start_bits[c] |= ~cd->cbits[c+cbit_word];  
1098          break;          break;
1099    
1100          case OP_WORDCHAR:          case OP_WORDCHAR:
1101          for (c = 0; c < 32; c++)          set_type_bits(start_bits, cbit_word, table_limit, cd);
           start_bits[c] |= cd->cbits[c+cbit_word];  
1102          break;          break;
1103          }          }
1104    
# Line 394  do Line 1112  do
1112        character with a value > 255. */        character with a value > 255. */
1113    
1114        case OP_NCLASS:        case OP_NCLASS:
1115  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1116        if (utf8)        if (utf8)
1117          {          {
1118          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */          start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
1119          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */          memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
1120          }          }
1121  #endif  #endif
1122        /* Fall through */        /* Fall through */
1123    
1124        case OP_CLASS:        case OP_CLASS:
# Line 431  do Line 1149  do
1149          /* In non-UTF-8 mode, the two bit maps are completely compatible. */          /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1150    
1151          else          else
1152  #endif  #endif
1153            {            {
1154            for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];            for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
1155            }            }
1156    
1157          /* Advance past the bit map, and act on what follows */          /* Advance past the bit map, and act on what follows. For a zero
1158            minimum repeat, continue; otherwise stop processing. */
1159    
1160          tcode += 32;          tcode += 32;
1161          switch (*tcode)          switch (*tcode)
# Line 453  do Line 1172  do
1172            if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;            if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
1173              else try_next = FALSE;              else try_next = FALSE;
1174            break;            break;
1175    
1176            default:            default:
1177            try_next = FALSE;            try_next = FALSE;
1178            break;            break;
# Line 472  return yield; Line 1191  return yield;
1191    
1192    
1193    
1194    
1195    
1196  /*************************************************  /*************************************************
1197  *          Study a compiled expression           *  *          Study a compiled expression           *
1198  *************************************************/  *************************************************/
# Line 487  Arguments: Line 1208  Arguments:
1208              set NULL unless error              set NULL unless error
1209    
1210  Returns:    pointer to a pcre_extra block, with study_data filled in and the  Returns:    pointer to a pcre_extra block, with study_data filled in and the
1211                appropriate flag set;                appropriate flags set;
1212              NULL on error or if no optimization possible              NULL on error or if no optimization possible
1213  */  */
1214    
1215  PCRE_DATA_SCOPE pcre_extra *  PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1216  pcre_study(const pcre *external_re, int options, const char **errorptr)  pcre_study(const pcre *external_re, int options, const char **errorptr)
1217  {  {
1218    int min;
1219    BOOL bits_set = FALSE;
1220    BOOL had_accept = FALSE;
1221  uschar start_bits[32];  uschar start_bits[32];
1222  pcre_extra *extra;  pcre_extra *extra;
1223  pcre_study_data *study;  pcre_study_data *study;
# Line 520  code = (uschar *)re + re->name_table_off Line 1244  code = (uschar *)re + re->name_table_off
1244    (re->name_count * re->name_entry_size);    (re->name_count * re->name_entry_size);
1245    
1246  /* For an anchored pattern, or an unanchored pattern that has a first char, or  /* For an anchored pattern, or an unanchored pattern that has a first char, or
1247  a multiline pattern that matches only at "line starts", no further processing  a multiline pattern that matches only at "line starts", there is no point in
1248  at present. */  seeking a list of starting bytes. */
1249    
1250  if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)  if ((re->options & PCRE_ANCHORED) == 0 &&
1251    return NULL;      (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1252      {
1253      int rc;
1254    
1255  /* Set the character tables in the block that is passed around */    /* Set the character tables in the block that is passed around */
1256    
1257  tables = re->tables;    tables = re->tables;
1258  if (tables == NULL)    if (tables == NULL)
1259    (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,      (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1260    (void *)(&tables));      (void *)(&tables));
1261    
1262  compile_block.lcc = tables + lcc_offset;    compile_block.lcc = tables + lcc_offset;
1263  compile_block.fcc = tables + fcc_offset;    compile_block.fcc = tables + fcc_offset;
1264  compile_block.cbits = tables + cbits_offset;    compile_block.cbits = tables + cbits_offset;
1265  compile_block.ctypes = tables + ctypes_offset;    compile_block.ctypes = tables + ctypes_offset;
1266    
1267  /* See if we can find a fixed set of initial characters for the pattern. */    /* See if we can find a fixed set of initial characters for the pattern. */
1268    
1269  memset(start_bits, 0, 32 * sizeof(uschar));    memset(start_bits, 0, 32 * sizeof(uschar));
1270  if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,    rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1271    (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;      &compile_block);
1272      bits_set = rc == SSB_DONE;
1273      if (rc == SSB_UNKNOWN) *errorptr = "internal error: opcode not recognized";
1274      }
1275    
1276    /* Find the minimum length of subject string. */
1277    
1278    switch(min = find_minlength(code, code, re->options, &had_accept))
1279      {
1280      case -2: *errorptr = "internal error: missing capturing bracket"; break;
1281      case -3: *errorptr = "internal error: opcode not recognized"; break;
1282      default: break;
1283      }
1284    
1285    /* Return NULL if there's been an error or if no optimization is possible. */
1286    
1287    if (*errorptr != NULL || (!bits_set && min < 0)) return NULL;
1288    
1289  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
1290  the latter, which is pointed to by the former, which may also get additional  the latter, which is pointed to by the former, which may also get additional
# Line 565  extra->flags = PCRE_EXTRA_STUDY_DATA; Line 1307  extra->flags = PCRE_EXTRA_STUDY_DATA;
1307  extra->study_data = study;  extra->study_data = study;
1308    
1309  study->size = sizeof(pcre_study_data);  study->size = sizeof(pcre_study_data);
1310  study->options = PCRE_STUDY_MAPPED;  study->flags = 0;
1311  memcpy(study->start_bits, start_bits, sizeof(start_bits));  
1312    if (bits_set)
1313      {
1314      study->flags |= PCRE_STUDY_MAPPED;
1315      memcpy(study->start_bits, start_bits, sizeof(start_bits));
1316      }
1317    
1318    if (min >= 0)
1319      {
1320      study->flags |= PCRE_STUDY_MINLEN;
1321      study->minlength = min;
1322      }
1323    
1324  return extra;  return extra;
1325  }  }

Legend:
Removed from v.107  
changed lines
  Added in v.613

  ViewVC Help
Powered by ViewVC 1.1.5