/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 353 by ph10, Mon Jul 7 15:44:24 2008 UTC revision 611 by ph10, Wed Jun 29 08:49:21 2011 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2008 University of Cambridge             Copyright (c) 1997-2010 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 48  supporting functions. */ Line 48  supporting functions. */
48    
49  #include "pcre_internal.h"  #include "pcre_internal.h"
50    
/* Set the bit for byte value c in the 256-bit map called start_bits, which
must be in scope at the point of use. The argument and the whole expansion are
parenthesized so that an expression argument is evaluated as a unit and the
macro can be used safely as a single statement or inside an if/else. Note that
the argument is evaluated twice, so it must not have side effects. */

#define SET_BIT(c) (start_bits[(c)/8] |= (1 << ((c)&7)))
52    
53  /* Returns from set_start_bits() */  /* Returns from set_start_bits() */
54    
55  enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };  enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
56    
57    
58    
59    /*************************************************
60    *   Find the minimum subject length for a group  *
61    *************************************************/
62    
63    /* Scan a parenthesized group and compute the minimum length of subject that
64    is needed to match it. This is a lower bound; it does not mean there is a
65    string of that length that matches. In UTF8 mode, the result is in characters
66    rather than bytes.
67    
68    Arguments:
69      code       pointer to start of group (the bracket)
70      startcode  pointer to start of the whole pattern
71      options    the compiling options
72    
73    Returns:   the minimum length
74               -1 if \C was encountered
75               -2 internal error (missing capturing bracket)
76               -3 internal error (opcode not listed)
77    */
78    
79    static int
80    find_minlength(const uschar *code, const uschar *startcode, int options)
81    {
82    int length = -1;
83    BOOL utf8 = (options & PCRE_UTF8) != 0;
84    BOOL had_recurse = FALSE;
85    register int branchlength = 0;
86    register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
87    
88    if (*code == OP_CBRA || *code == OP_SCBRA ||
89        *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += 2;
90    
91    /* Scan along the opcodes for this branch. If we get to the end of the
92    branch, check the length against that of the other branches. */
93    
94    for (;;)
95      {
96      int d, min;
97      uschar *cs, *ce;
98      register int op = *cc;
99    
100      switch (op)
101        {
102        case OP_COND:
103        case OP_SCOND:
104    
105        /* If there is only one branch in a condition, the implied branch has zero
106        length, so we don't add anything. This covers the DEFINE "condition"
107        automatically. */
108    
109        cs = cc + GET(cc, 1);
110        if (*cs != OP_ALT)
111          {
112          cc = cs + 1 + LINK_SIZE;
113          break;
114          }
115    
116        /* Otherwise we can fall through and treat it the same as any other
117        subpattern. */
118    
119        case OP_CBRA:
120        case OP_SCBRA:
121        case OP_BRA:
122        case OP_SBRA:
123        case OP_CBRAPOS:
124        case OP_SCBRAPOS:
125        case OP_BRAPOS:
126        case OP_SBRAPOS:
127        case OP_ONCE:
128        d = find_minlength(cc, startcode, options);
129        if (d < 0) return d;
130        branchlength += d;
131        do cc += GET(cc, 1); while (*cc == OP_ALT);
132        cc += 1 + LINK_SIZE;
133        break;
134    
135        /* Reached end of a branch; if it's a ket it is the end of a nested
136        call. If it's ALT it is an alternation in a nested call. If it is
137        END it's the end of the outer call. All can be handled by the same code. */
138    
139        case OP_ALT:
140        case OP_KET:
141        case OP_KETRMAX:
142        case OP_KETRMIN:
143        case OP_KETRPOS:
144        case OP_END:
145        if (length < 0 || (!had_recurse && branchlength < length))
146          length = branchlength;
147        if (*cc != OP_ALT) return length;
148        cc += 1 + LINK_SIZE;
149        branchlength = 0;
150        had_recurse = FALSE;
151        break;
152    
153        /* Skip over assertive subpatterns */
154    
155        case OP_ASSERT:
156        case OP_ASSERT_NOT:
157        case OP_ASSERTBACK:
158        case OP_ASSERTBACK_NOT:
159        do cc += GET(cc, 1); while (*cc == OP_ALT);
160        /* Fall through */
161    
162        /* Skip over things that don't match chars */
163    
164        case OP_REVERSE:
165        case OP_CREF:
166        case OP_NCREF:
167        case OP_RREF:
168        case OP_NRREF:
169        case OP_DEF:
170        case OP_CALLOUT:
171        case OP_SOD:
172        case OP_SOM:
173        case OP_EOD:
174        case OP_EODN:
175        case OP_CIRC:
176        case OP_CIRCM:
177        case OP_DOLL:
178        case OP_DOLLM:
179        case OP_NOT_WORD_BOUNDARY:
180        case OP_WORD_BOUNDARY:
181        cc += _pcre_OP_lengths[*cc];
182        break;
183    
184        /* Skip over a subpattern that has a {0} or {0,x} quantifier */
185    
186        case OP_BRAZERO:
187        case OP_BRAMINZERO:
188        case OP_BRAPOSZERO:
189        case OP_SKIPZERO:
190        cc += _pcre_OP_lengths[*cc];
191        do cc += GET(cc, 1); while (*cc == OP_ALT);
192        cc += 1 + LINK_SIZE;
193        break;
194    
195        /* Handle literal characters and + repetitions */
196    
197        case OP_CHAR:
198        case OP_CHARI:
199        case OP_NOT:
200        case OP_NOTI:
201        case OP_PLUS:
202        case OP_PLUSI:
203        case OP_MINPLUS:
204        case OP_MINPLUSI:
205        case OP_POSPLUS:
206        case OP_POSPLUSI:
207        case OP_NOTPLUS:
208        case OP_NOTPLUSI:
209        case OP_NOTMINPLUS:
210        case OP_NOTMINPLUSI:
211        case OP_NOTPOSPLUS:
212        case OP_NOTPOSPLUSI:
213        branchlength++;
214        cc += 2;
215    #ifdef SUPPORT_UTF8
216        if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
217    #endif
218        break;
219    
220        case OP_TYPEPLUS:
221        case OP_TYPEMINPLUS:
222        case OP_TYPEPOSPLUS:
223        branchlength++;
224        cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
225        break;
226    
227        /* Handle exact repetitions. The count is already in characters, but we
228        need to skip over a multibyte character in UTF8 mode.  */
229    
230        case OP_EXACT:
231        case OP_EXACTI:
232        case OP_NOTEXACT:
233        case OP_NOTEXACTI:
234        branchlength += GET2(cc,1);
235        cc += 4;
236    #ifdef SUPPORT_UTF8
237        if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
238    #endif
239        break;
240    
241        case OP_TYPEEXACT:
242        branchlength += GET2(cc,1);
243        cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
244        break;
245    
246        /* Handle single-char non-literal matchers */
247    
248        case OP_PROP:
249        case OP_NOTPROP:
250        cc += 2;
251        /* Fall through */
252    
253        case OP_NOT_DIGIT:
254        case OP_DIGIT:
255        case OP_NOT_WHITESPACE:
256        case OP_WHITESPACE:
257        case OP_NOT_WORDCHAR:
258        case OP_WORDCHAR:
259        case OP_ANY:
260        case OP_ALLANY:
261        case OP_EXTUNI:
262        case OP_HSPACE:
263        case OP_NOT_HSPACE:
264        case OP_VSPACE:
265        case OP_NOT_VSPACE:
266        branchlength++;
267        cc++;
268        break;
269    
270        /* "Any newline" might match two characters, but it also might match just
271        one. */
272    
273        case OP_ANYNL:
274        branchlength += 1;
275        cc++;
276        break;
277    
278        /* The single-byte matcher means we can't proceed in UTF-8 mode */
279    
280        case OP_ANYBYTE:
281    #ifdef SUPPORT_UTF8
282        if (utf8) return -1;
283    #endif
284        branchlength++;
285        cc++;
286        break;
287    
288        /* For repeated character types, we have to test for \p and \P, which have
289        an extra two bytes of parameters. */
290    
291        case OP_TYPESTAR:
292        case OP_TYPEMINSTAR:
293        case OP_TYPEQUERY:
294        case OP_TYPEMINQUERY:
295        case OP_TYPEPOSSTAR:
296        case OP_TYPEPOSQUERY:
297        if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
298        cc += _pcre_OP_lengths[op];
299        break;
300    
301        case OP_TYPEUPTO:
302        case OP_TYPEMINUPTO:
303        case OP_TYPEPOSUPTO:
304        if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
305        cc += _pcre_OP_lengths[op];
306        break;
307    
308        /* Check a class for variable quantification */
309    
310    #ifdef SUPPORT_UTF8
311        case OP_XCLASS:
312        cc += GET(cc, 1) - 33;
313        /* Fall through */
314    #endif
315    
316        case OP_CLASS:
317        case OP_NCLASS:
318        cc += 33;
319    
320        switch (*cc)
321          {
322          case OP_CRPLUS:
323          case OP_CRMINPLUS:
324          branchlength++;
325          /* Fall through */
326    
327          case OP_CRSTAR:
328          case OP_CRMINSTAR:
329          case OP_CRQUERY:
330          case OP_CRMINQUERY:
331          cc++;
332          break;
333    
334          case OP_CRRANGE:
335          case OP_CRMINRANGE:
336          branchlength += GET2(cc,1);
337          cc += 5;
338          break;
339    
340          default:
341          branchlength++;
342          break;
343          }
344        break;
345    
346        /* Backreferences and subroutine calls are treated in the same way: we find
347        the minimum length for the subpattern. A recursion, however, causes an
348        a flag to be set that causes the length of this branch to be ignored. The
349        logic is that a recursion can only make sense if there is another
350        alternation that stops the recursing. That will provide the minimum length
351        (when no recursion happens). A backreference within the group that it is
352        referencing behaves in the same way.
353    
354        If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
355        matches an empty string (by default it causes a matching failure), so in
356        that case we must set the minimum length to zero. */
357    
358        case OP_REF:
359        case OP_REFI:
360        if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
361          {
362          ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
363          if (cs == NULL) return -2;
364          do ce += GET(ce, 1); while (*ce == OP_ALT);
365          if (cc > cs && cc < ce)
366            {
367            d = 0;
368            had_recurse = TRUE;
369            }
370          else d = find_minlength(cs, startcode, options);
371          }
372        else d = 0;
373        cc += 3;
374    
375        /* Handle repeated back references */
376    
377        switch (*cc)
378          {
379          case OP_CRSTAR:
380          case OP_CRMINSTAR:
381          case OP_CRQUERY:
382          case OP_CRMINQUERY:
383          min = 0;
384          cc++;
385          break;
386    
387          case OP_CRPLUS:
388          case OP_CRMINPLUS:
389          min = 1;
390          cc++;
391          break;
392    
393          case OP_CRRANGE:
394          case OP_CRMINRANGE:
395          min = GET2(cc, 1);
396          cc += 5;
397          break;
398    
399          default:
400          min = 1;
401          break;
402          }
403    
404        branchlength += min * d;
405        break;
406    
407        case OP_RECURSE:
408        cs = ce = (uschar *)startcode + GET(cc, 1);
409        if (cs == NULL) return -2;
410        do ce += GET(ce, 1); while (*ce == OP_ALT);
411        if (cc > cs && cc < ce)
412          had_recurse = TRUE;
413        else
414          branchlength += find_minlength(cs, startcode, options);
415        cc += 1 + LINK_SIZE;
416        break;
417    
418        /* Anything else does not or need not match a character. We can get the
419        item's length from the table, but for those that can match zero occurrences
420        of a character, we must take special action for UTF-8 characters. As it
421        happens, the "NOT" versions of these opcodes are used at present only for
422        ASCII characters, so they could be omitted from this list. However, in
423        future that may change, so we include them here so as not to leave a
424        gotcha for a future maintainer. */
425    
426        case OP_UPTO:
427        case OP_UPTOI:
428        case OP_NOTUPTO:
429        case OP_NOTUPTOI:
430        case OP_MINUPTO:
431        case OP_MINUPTOI:
432        case OP_NOTMINUPTO:
433        case OP_NOTMINUPTOI:
434        case OP_POSUPTO:
435        case OP_POSUPTOI:
436        case OP_NOTPOSUPTO:
437        case OP_NOTPOSUPTOI:
438    
439        case OP_STAR:
440        case OP_STARI:
441        case OP_NOTSTAR:
442        case OP_NOTSTARI:
443        case OP_MINSTAR:
444        case OP_MINSTARI:
445        case OP_NOTMINSTAR:
446        case OP_NOTMINSTARI:
447        case OP_POSSTAR:
448        case OP_POSSTARI:
449        case OP_NOTPOSSTAR:
450        case OP_NOTPOSSTARI:
451    
452        case OP_QUERY:
453        case OP_QUERYI:
454        case OP_NOTQUERY:
455        case OP_NOTQUERYI:
456        case OP_MINQUERY:
457        case OP_MINQUERYI:
458        case OP_NOTMINQUERY:
459        case OP_NOTMINQUERYI:
460        case OP_POSQUERY:
461        case OP_POSQUERYI:
462        case OP_NOTPOSQUERY:
463        case OP_NOTPOSQUERYI:
464    
465        cc += _pcre_OP_lengths[op];
466    #ifdef SUPPORT_UTF8
467        if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
468    #endif
469        break;
470    
471        /* Skip these, but we need to add in the name length. */
472    
473        case OP_MARK:
474        case OP_PRUNE_ARG:
475        case OP_SKIP_ARG:
476        cc += _pcre_OP_lengths[op] + cc[1];
477        break;
478    
479        case OP_THEN_ARG:
480        cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
481        break;
482    
483        /* The remaining opcodes are just skipped over. */
484    
485        case OP_ACCEPT:
486        case OP_CLOSE:
487        case OP_COMMIT:
488        case OP_FAIL:
489        case OP_PRUNE:
490        case OP_SET_SOM:
491        case OP_SKIP:
492        case OP_THEN:
493        cc += _pcre_OP_lengths[op];
494        break;
495    
496        /* This should not occur: we list all opcodes explicitly so that when
497        new ones get added they are properly considered. */
498    
499        default:
500        return -3;
501        }
502      }
503    /* Control never gets here */
504    }
505    
506    
507    
508  /*************************************************  /*************************************************
509  *      Set a bit and maybe its alternate case    *  *      Set a bit and maybe its alternate case    *
510  *************************************************/  *************************************************/
511    
512  /* Given a character, set its bit in the table, and also the bit for the other  /* Given a character, set its first byte's bit in the table, and also the
513  version of a letter if we are caseless.  corresponding bit for the other version of a letter if we are caseless. In
514    UTF-8 mode, for characters greater than 127, we can only do the caseless thing
515    when Unicode property support is available.
516    
517  Arguments:  Arguments:
518    start_bits    points to the bit map    start_bits    points to the bit map
519    c             is the character    p             points to the character
520    caseless      the caseless flag    caseless      the caseless flag
521    cd            the block with char table pointers    cd            the block with char table pointers
522      utf8          TRUE for UTF-8 mode
523    
524    Returns:        pointer after the character
525    */
526    
527    static const uschar *
528    set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
529      compile_data *cd, BOOL utf8)
530    {
531    unsigned int c = *p;
532    
533    SET_BIT(c);
534    
535    #ifdef SUPPORT_UTF8
536    if (utf8 && c > 127)
537      {
538      GETCHARINC(c, p);
539    #ifdef SUPPORT_UCP
540      if (caseless)
541        {
542        uschar buff[8];
543        c = UCD_OTHERCASE(c);
544        (void)_pcre_ord2utf8(c, buff);
545        SET_BIT(buff[0]);
546        }
547    #endif
548      return p;
549      }
550    #endif
551    
552    /* Not UTF-8 mode, or character is less than 127. */
553    
554    if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
555    return p + 1;
556    }
557    
558    
559    
560    /*************************************************
561    *     Set bits for a positive character type     *
562    *************************************************/
563    
564  Returns:        nothing  /* This function sets starting bits for a character type. In UTF-8 mode, we can
565    only do a direct setting for bytes less than 128, as otherwise there can be
566    confusion with bytes in the middle of UTF-8 characters. In a "traditional"
567    environment, the tables will only recognize ASCII characters anyway, but in at
568    least one Windows environment, bits for some higher-valued bytes were set in
568    the tables.
569    So we deal with that case by considering the UTF-8 encoding.
570    
571    Arguments:
572      start_bits     the starting bitmap
573      cbit_type      the type of character wanted
574      table_limit    32 for non-UTF-8; 16 for UTF-8
575      cd             the block with char table pointers
576    
577    Returns:         nothing
578    */
579    
580    static void
581    set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
582      compile_data *cd)
583    {
584    register int c;
585    for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
586    if (table_limit == 32) return;
587    for (c = 128; c < 256; c++)
588      {
589      if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
590        {
591        uschar buff[8];
592        (void)_pcre_ord2utf8(c, buff);
593        SET_BIT(buff[0]);
594        }
595      }
596    }
597    
598    
599    /*************************************************
600    *     Set bits for a negative character type     *
601    *************************************************/
602    
603    /* This function sets starting bits for a negative character type such as \D.
604    In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
605    otherwise there can be confusion with bytes in the middle of UTF-8 characters.
606    Unlike in the positive case, where we can set appropriate starting bits for
607    specific high-valued UTF-8 characters, in this case we have to set the bits for
608    all high-valued characters. The lowest is 0xc2, but we overkill by starting at
609    0xc0 (192) for simplicity.
610    
611    Arguments:
612      start_bits     the starting bitmap
613      cbit_type      the type of character wanted
614      table_limit    32 for non-UTF-8; 16 for UTF-8
615      cd             the block with char table pointers
616    
617    Returns:         nothing
618  */  */
619    
620  static void  static void
621  set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)  set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
622      compile_data *cd)
623  {  {
624  start_bits[c/8] |= (1 << (c&7));  register int c;
625  if (caseless && (cd->ctypes[c] & ctype_letter) != 0)  for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
626    start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));  if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
627  }  }
628    
629    
# Line 95  function fails unless the result is SSB_ Line 643  function fails unless the result is SSB_
643  Arguments:  Arguments:
644    code         points to an expression    code         points to an expression
645    start_bits   points to a 32-byte table, initialized to 0    start_bits   points to a 32-byte table, initialized to 0
   caseless     the current state of the caseless flag  
646    utf8         TRUE if in UTF-8 mode    utf8         TRUE if in UTF-8 mode
647    cd           the block with char table pointers    cd           the block with char table pointers
648    
649  Returns:       SSB_FAIL     => Failed to find any starting bytes  Returns:       SSB_FAIL     => Failed to find any starting bytes
650                 SSB_DONE     => Found mandatory starting bytes                 SSB_DONE     => Found mandatory starting bytes
651                 SSB_CONTINUE => Found optional starting bytes                 SSB_CONTINUE => Found optional starting bytes
652                   SSB_UNKNOWN  => Hit an unrecognized opcode
653  */  */
654    
655  static int  static int
656  set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,  set_start_bits(const uschar *code, uschar *start_bits, BOOL utf8,
657    BOOL utf8, compile_data *cd)    compile_data *cd)
658  {  {
659  register int c;  register int c;
660  int yield = SSB_DONE;  int yield = SSB_DONE;
661    int table_limit = utf8? 16:32;
662    
663  #if 0  #if 0
664  /* ========================================================================= */  /* ========================================================================= */
# Line 130  volatile int dummy; Line 679  volatile int dummy;
679    
680  do  do
681    {    {
   const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;  
682    BOOL try_next = TRUE;    BOOL try_next = TRUE;
683      const uschar *tcode = code + 1 + LINK_SIZE;
684    
685      if (*code == OP_CBRA || *code == OP_SCBRA ||
686          *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += 2;
687    
688    while (try_next)    /* Loop for items in this branch */    while (try_next)    /* Loop for items in this branch */
689      {      {
690      int rc;      int rc;
691      switch(*tcode)      switch(*tcode)
692        {        {
693        /* Fail if we reach something we don't understand */        /* If we reach something we don't understand, it means a new opcode has
694          been created that hasn't been added to this code. Hopefully this problem
695          will be discovered during testing. */
696    
697        default:        default:
698          return SSB_UNKNOWN;
699    
700          /* Fail for a valid opcode that implies no starting bits. */
701    
702          case OP_ACCEPT:
703          case OP_ALLANY:
704          case OP_ANY:
705          case OP_ANYBYTE:
706          case OP_CIRC:
707          case OP_CIRCM:
708          case OP_CLOSE:
709          case OP_COMMIT:
710          case OP_COND:
711          case OP_CREF:
712          case OP_DEF:
713          case OP_DOLL:
714          case OP_DOLLM:
715          case OP_END:
716          case OP_EOD:
717          case OP_EODN:
718          case OP_EXTUNI:
719          case OP_FAIL:
720          case OP_MARK:
721          case OP_NCREF:
722          case OP_NOT:
723          case OP_NOTEXACT:
724          case OP_NOTEXACTI:
725          case OP_NOTI:
726          case OP_NOTMINPLUS:
727          case OP_NOTMINPLUSI:
728          case OP_NOTMINQUERY:
729          case OP_NOTMINQUERYI:
730          case OP_NOTMINSTAR:
731          case OP_NOTMINSTARI:
732          case OP_NOTMINUPTO:
733          case OP_NOTMINUPTOI:
734          case OP_NOTPLUS:
735          case OP_NOTPLUSI:
736          case OP_NOTPOSPLUS:
737          case OP_NOTPOSPLUSI:
738          case OP_NOTPOSQUERY:
739          case OP_NOTPOSQUERYI:
740          case OP_NOTPOSSTAR:
741          case OP_NOTPOSSTARI:
742          case OP_NOTPOSUPTO:
743          case OP_NOTPOSUPTOI:
744          case OP_NOTPROP:
745          case OP_NOTQUERY:
746          case OP_NOTQUERYI:
747          case OP_NOTSTAR:
748          case OP_NOTSTARI:
749          case OP_NOTUPTO:
750          case OP_NOTUPTOI:
751          case OP_NOT_HSPACE:
752          case OP_NOT_VSPACE:
753          case OP_NOT_WORD_BOUNDARY:
754          case OP_NRREF:
755          case OP_PROP:
756          case OP_PRUNE:
757          case OP_PRUNE_ARG:
758          case OP_RECURSE:
759          case OP_REF:
760          case OP_REFI:
761          case OP_REVERSE:
762          case OP_RREF:
763          case OP_SCOND:
764          case OP_SET_SOM:
765          case OP_SKIP:
766          case OP_SKIP_ARG:
767          case OP_SOD:
768          case OP_SOM:
769          case OP_THEN:
770          case OP_THEN_ARG:
771          case OP_WORD_BOUNDARY:
772          case OP_XCLASS:
773        return SSB_FAIL;        return SSB_FAIL;
774    
775        /* If we hit a bracket or a positive lookahead assertion, recurse to set        /* If we hit a bracket or a positive lookahead assertion, recurse to set
# Line 152  do Line 781  do
781        case OP_SBRA:        case OP_SBRA:
782        case OP_CBRA:        case OP_CBRA:
783        case OP_SCBRA:        case OP_SCBRA:
784          case OP_BRAPOS:
785          case OP_SBRAPOS:
786          case OP_CBRAPOS:
787          case OP_SCBRAPOS:
788        case OP_ONCE:        case OP_ONCE:
789        case OP_ASSERT:        case OP_ASSERT:
790        rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);        rc = set_start_bits(tcode, start_bits, utf8, cd);
791        if (rc == SSB_FAIL) return SSB_FAIL;        if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
792        if (rc == SSB_DONE) try_next = FALSE; else        if (rc == SSB_DONE) try_next = FALSE; else
793          {          {
794          do tcode += GET(tcode, 1); while (*tcode == OP_ALT);          do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
# Line 178  do Line 811  do
811        case OP_KET:        case OP_KET:
812        case OP_KETRMAX:        case OP_KETRMAX:
813        case OP_KETRMIN:        case OP_KETRMIN:
814          case OP_KETRPOS:
815        return SSB_CONTINUE;        return SSB_CONTINUE;
816    
817        /* Skip over callout */        /* Skip over callout */
# Line 195  do Line 829  do
829        tcode += 1 + LINK_SIZE;        tcode += 1 + LINK_SIZE;
830        break;        break;
831    
       /* Skip over an option setting, changing the caseless flag */  
   
       case OP_OPT:  
       caseless = (tcode[1] & PCRE_CASELESS) != 0;  
       tcode += 2;  
       break;  
   
832        /* BRAZERO does the bracket, but carries on. */        /* BRAZERO does the bracket, but carries on. */
833    
834        case OP_BRAZERO:        case OP_BRAZERO:
835        case OP_BRAMINZERO:        case OP_BRAMINZERO:
836        if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)        case OP_BRAPOSZERO:
837          return SSB_FAIL;        rc = set_start_bits(++tcode, start_bits, utf8, cd);
838          if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
839  /* =========================================================================  /* =========================================================================
840        See the comment at the head of this function concerning the next line,        See the comment at the head of this function concerning the next line,
841        which was an old fudge for the benefit of OS/2.        which was an old fudge for the benefit of OS/2.
# Line 220  do Line 848  do
848        /* SKIPZERO skips the bracket. */        /* SKIPZERO skips the bracket. */
849    
850        case OP_SKIPZERO:        case OP_SKIPZERO:
851        tcode++;        tcode++;
852        do tcode += GET(tcode,1); while (*tcode == OP_ALT);        do tcode += GET(tcode,1); while (*tcode == OP_ALT);
853        tcode += 1 + LINK_SIZE;        tcode += 1 + LINK_SIZE;
854        break;        break;
# Line 233  do Line 861  do
861        case OP_QUERY:        case OP_QUERY:
862        case OP_MINQUERY:        case OP_MINQUERY:
863        case OP_POSQUERY:        case OP_POSQUERY:
864        set_bit(start_bits, tcode[1], caseless, cd);        tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
865        tcode += 2;        break;
866  #ifdef SUPPORT_UTF8  
867        if (utf8 && tcode[-1] >= 0xc0)        case OP_STARI:
868          tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];        case OP_MINSTARI:
869  #endif        case OP_POSSTARI:
870          case OP_QUERYI:
871          case OP_MINQUERYI:
872          case OP_POSQUERYI:
873          tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
874        break;        break;
875    
876        /* Single-char upto sets the bit and tries the next */        /* Single-char upto sets the bit and tries the next */
# Line 246  do Line 878  do
878        case OP_UPTO:        case OP_UPTO:
879        case OP_MINUPTO:        case OP_MINUPTO:
880        case OP_POSUPTO:        case OP_POSUPTO:
881        set_bit(start_bits, tcode[3], caseless, cd);        tcode = set_table_bit(start_bits, tcode + 3, FALSE, cd, utf8);
882        tcode += 4;        break;
883  #ifdef SUPPORT_UTF8  
884        if (utf8 && tcode[-1] >= 0xc0)        case OP_UPTOI:
885          tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];        case OP_MINUPTOI:
886  #endif        case OP_POSUPTOI:
887          tcode = set_table_bit(start_bits, tcode + 3, TRUE, cd, utf8);
888        break;        break;
889    
890        /* At least one single char sets the bit and stops */        /* At least one single char sets the bit and stops */
891    
892        case OP_EXACT:       /* Fall through */        case OP_EXACT:
893        tcode += 2;        tcode += 2;
894          /* Fall through */
895        case OP_CHAR:        case OP_CHAR:
       case OP_CHARNC:  
896        case OP_PLUS:        case OP_PLUS:
897        case OP_MINPLUS:        case OP_MINPLUS:
898        case OP_POSPLUS:        case OP_POSPLUS:
899        set_bit(start_bits, tcode[1], caseless, cd);        (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
900          try_next = FALSE;
901          break;
902    
903          case OP_EXACTI:
904          tcode += 2;
905          /* Fall through */
906          case OP_CHARI:
907          case OP_PLUSI:
908          case OP_MINPLUSI:
909          case OP_POSPLUSI:
910          (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
911          try_next = FALSE;
912          break;
913    
914          /* Special spacing and line-terminating items. These recognize specific
915          lists of characters. The difference between VSPACE and ANYNL is that the
916          latter can match the two-character CRLF sequence, but that is not
917          relevant for finding the first character, so their code here is
918          identical. */
919    
920          case OP_HSPACE:
921          SET_BIT(0x09);
922          SET_BIT(0x20);
923          if (utf8)
924            {
925            SET_BIT(0xC2);  /* For U+00A0 */
926            SET_BIT(0xE1);  /* For U+1680, U+180E */
927            SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
928            SET_BIT(0xE3);  /* For U+3000 */
929            }
930          else SET_BIT(0xA0);
931          try_next = FALSE;
932          break;
933    
934          case OP_ANYNL:
935          case OP_VSPACE:
936          SET_BIT(0x0A);
937          SET_BIT(0x0B);
938          SET_BIT(0x0C);
939          SET_BIT(0x0D);
940          if (utf8)
941            {
942            SET_BIT(0xC2);  /* For U+0085 */
943            SET_BIT(0xE2);  /* For U+2028, U+2029 */
944            }
945          else SET_BIT(0x85);
946        try_next = FALSE;        try_next = FALSE;
947        break;        break;
948    
949        /* Single character type sets the bits and stops */        /* Single character types set the bits and stop. Note that if PCRE_UCP
950          is set, we do not see these op codes because \d etc are converted to
951          properties. Therefore, these apply in the case when only characters less
952          than 256 are recognized to match the types. */
953    
954        case OP_NOT_DIGIT:        case OP_NOT_DIGIT:
955        for (c = 0; c < 32; c++)        set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
         start_bits[c] |= ~cd->cbits[c+cbit_digit];  
956        try_next = FALSE;        try_next = FALSE;
957        break;        break;
958    
959        case OP_DIGIT:        case OP_DIGIT:
960        for (c = 0; c < 32; c++)        set_type_bits(start_bits, cbit_digit, table_limit, cd);
         start_bits[c] |= cd->cbits[c+cbit_digit];  
961        try_next = FALSE;        try_next = FALSE;
962        break;        break;
963    
964        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to
965        discard it. */        ensure it is set as not whitespace. */
966    
967        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
968        for (c = 0; c < 32; c++)        set_nottype_bits(start_bits, cbit_space, table_limit, cd);
969          {        start_bits[1] |= 0x08;
         int d = cd->cbits[c+cbit_space];  
         if (c == 1) d &= ~0x08;  
         start_bits[c] |= ~d;  
         }  
970        try_next = FALSE;        try_next = FALSE;
971        break;        break;
972    
973        /* The cbit_space table has vertical tab as whitespace; we have to        /* The cbit_space table has vertical tab as whitespace; we have to
974        discard it. */        not set it from the table. */
975    
976        case OP_WHITESPACE:        case OP_WHITESPACE:
977        for (c = 0; c < 32; c++)        c = start_bits[1];    /* Save in case it was already set */
978          {        set_type_bits(start_bits, cbit_space, table_limit, cd);
979          int d = cd->cbits[c+cbit_space];        start_bits[1] = (start_bits[1] & ~0x08) | c;
         if (c == 1) d &= ~0x08;  
         start_bits[c] |= d;  
         }  
980        try_next = FALSE;        try_next = FALSE;
981        break;        break;
982    
983        case OP_NOT_WORDCHAR:        case OP_NOT_WORDCHAR:
984        for (c = 0; c < 32; c++)        set_nottype_bits(start_bits, cbit_word, table_limit, cd);
         start_bits[c] |= ~cd->cbits[c+cbit_word];  
985        try_next = FALSE;        try_next = FALSE;
986        break;        break;
987    
988        case OP_WORDCHAR:        case OP_WORDCHAR:
989        for (c = 0; c < 32; c++)        set_type_bits(start_bits, cbit_word, table_limit, cd);
         start_bits[c] |= cd->cbits[c+cbit_word];  
990        try_next = FALSE;        try_next = FALSE;
991        break;        break;
992    
# Line 325  do Line 995  do
995    
996        case OP_TYPEPLUS:        case OP_TYPEPLUS:
997        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
998          case OP_TYPEPOSPLUS:
999        tcode++;        tcode++;
1000        break;        break;
1001    
# Line 348  do Line 1019  do
1019        case OP_TYPEPOSQUERY:        case OP_TYPEPOSQUERY:
1020        switch(tcode[1])        switch(tcode[1])
1021          {          {
1022            default:
1023          case OP_ANY:          case OP_ANY:
1024          case OP_ALLANY:          case OP_ALLANY:
1025          return SSB_FAIL;          return SSB_FAIL;
1026    
1027            case OP_HSPACE:
1028            SET_BIT(0x09);
1029            SET_BIT(0x20);
1030            if (utf8)
1031              {
1032              SET_BIT(0xC2);  /* For U+00A0 */
1033              SET_BIT(0xE1);  /* For U+1680, U+180E */
1034              SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1035              SET_BIT(0xE3);  /* For U+3000 */
1036              }
1037            else SET_BIT(0xA0);
1038            break;
1039    
1040            case OP_ANYNL:
1041            case OP_VSPACE:
1042            SET_BIT(0x0A);
1043            SET_BIT(0x0B);
1044            SET_BIT(0x0C);
1045            SET_BIT(0x0D);
1046            if (utf8)
1047              {
1048              SET_BIT(0xC2);  /* For U+0085 */
1049              SET_BIT(0xE2);  /* For U+2028, U+2029 */
1050              }
1051            else SET_BIT(0x85);
1052            break;
1053    
1054          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
1055          for (c = 0; c < 32; c++)          set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
           start_bits[c] |= ~cd->cbits[c+cbit_digit];  
1056          break;          break;
1057    
1058          case OP_DIGIT:          case OP_DIGIT:
1059          for (c = 0; c < 32; c++)          set_type_bits(start_bits, cbit_digit, table_limit, cd);
           start_bits[c] |= cd->cbits[c+cbit_digit];  
1060          break;          break;
1061    
1062          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we have to
1063          discard it. */          ensure it gets set as not whitespace. */
1064    
1065          case OP_NOT_WHITESPACE:          case OP_NOT_WHITESPACE:
1066          for (c = 0; c < 32; c++)          set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1067            {          start_bits[1] |= 0x08;
           int d = cd->cbits[c+cbit_space];  
           if (c == 1) d &= ~0x08;  
           start_bits[c] |= ~d;  
           }  
1068          break;          break;
1069    
1070          /* The cbit_space table has vertical tab as whitespace; we have to          /* The cbit_space table has vertical tab as whitespace; we have to
1071          discard it. */          avoid setting it. */
1072    
1073          case OP_WHITESPACE:          case OP_WHITESPACE:
1074          for (c = 0; c < 32; c++)          c = start_bits[1];    /* Save in case it was already set */
1075            {          set_type_bits(start_bits, cbit_space, table_limit, cd);
1076            int d = cd->cbits[c+cbit_space];          start_bits[1] = (start_bits[1] & ~0x08) | c;
           if (c == 1) d &= ~0x08;  
           start_bits[c] |= d;  
           }  
1077          break;          break;
1078    
1079          case OP_NOT_WORDCHAR:          case OP_NOT_WORDCHAR:
1080          for (c = 0; c < 32; c++)          set_nottype_bits(start_bits, cbit_word, table_limit, cd);
           start_bits[c] |= ~cd->cbits[c+cbit_word];  
1081          break;          break;
1082    
1083          case OP_WORDCHAR:          case OP_WORDCHAR:
1084          for (c = 0; c < 32; c++)          set_type_bits(start_bits, cbit_word, table_limit, cd);
           start_bits[c] |= cd->cbits[c+cbit_word];  
1085          break;          break;
1086          }          }
1087    
# Line 449  do Line 1137  do
1137            for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];            for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
1138            }            }
1139    
1140          /* Advance past the bit map, and act on what follows */          /* Advance past the bit map, and act on what follows. For a zero
1141            minimum repeat, continue; otherwise stop processing. */
1142    
1143          tcode += 32;          tcode += 32;
1144          switch (*tcode)          switch (*tcode)
# Line 466  do Line 1155  do
1155            if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;            if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
1156              else try_next = FALSE;              else try_next = FALSE;
1157            break;            break;
1158    
1159            default:            default:
1160            try_next = FALSE;            try_next = FALSE;
1161            break;            break;
# Line 485  return yield; Line 1174  return yield;
1174    
1175    
1176    
1177    
1178    
1179  /*************************************************  /*************************************************
1180  *          Study a compiled expression           *  *          Study a compiled expression           *
1181  *************************************************/  *************************************************/
# Line 500  Arguments: Line 1191  Arguments:
1191              set NULL unless error              set NULL unless error
1192    
1193  Returns:    pointer to a pcre_extra block, with study_data filled in and the  Returns:    pointer to a pcre_extra block, with study_data filled in and the
1194                appropriate flag set;                appropriate flags set;
1195              NULL on error or if no optimization possible              NULL on error or if no optimization possible
1196  */  */
1197    
1198  PCRE_EXP_DEFN pcre_extra *  PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1199  pcre_study(const pcre *external_re, int options, const char **errorptr)  pcre_study(const pcre *external_re, int options, const char **errorptr)
1200  {  {
1201    int min;
1202    BOOL bits_set = FALSE;
1203  uschar start_bits[32];  uschar start_bits[32];
1204  pcre_extra *extra;  pcre_extra *extra;
1205  pcre_study_data *study;  pcre_study_data *study;
# Line 533  code = (uschar *)re + re->name_table_off Line 1226  code = (uschar *)re + re->name_table_off
1226    (re->name_count * re->name_entry_size);    (re->name_count * re->name_entry_size);
1227    
1228  /* For an anchored pattern, or an unanchored pattern that has a first char, or  /* For an anchored pattern, or an unanchored pattern that has a first char, or
1229  a multiline pattern that matches only at "line starts", no further processing  a multiline pattern that matches only at "line starts", there is no point in
1230  at present. */  seeking a list of starting bytes. */
1231    
1232  if ((re->options & PCRE_ANCHORED) != 0 ||  if ((re->options & PCRE_ANCHORED) == 0 &&
1233      (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)      (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1234    return NULL;    {
1235      int rc;
1236    
1237      /* Set the character tables in the block that is passed around */
1238    
1239  /* Set the character tables in the block that is passed around */    tables = re->tables;
1240      if (tables == NULL)
1241        (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1242        (void *)(&tables));
1243    
1244      compile_block.lcc = tables + lcc_offset;
1245      compile_block.fcc = tables + fcc_offset;
1246      compile_block.cbits = tables + cbits_offset;
1247      compile_block.ctypes = tables + ctypes_offset;
1248    
1249      /* See if we can find a fixed set of initial characters for the pattern. */
1250    
1251      memset(start_bits, 0, 32 * sizeof(uschar));
1252      rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1253        &compile_block);
1254      bits_set = rc == SSB_DONE;
1255      if (rc == SSB_UNKNOWN) *errorptr = "internal error: opcode not recognized";
1256      }
1257    
1258  tables = re->tables;  /* Find the minimum length of subject string. */
1259  if (tables == NULL)  
1260    (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,  switch(min = find_minlength(code, code, re->options))
1261    (void *)(&tables));    {
1262      case -2: *errorptr = "internal error: missing capturing bracket"; break;
1263  compile_block.lcc = tables + lcc_offset;    case -3: *errorptr = "internal error: opcode not recognized"; break;
1264  compile_block.fcc = tables + fcc_offset;    default: break;
1265  compile_block.cbits = tables + cbits_offset;    }
1266  compile_block.ctypes = tables + ctypes_offset;  
1267    /* Return NULL if there's been an error or if no optimization is possible. */
1268  /* See if we can find a fixed set of initial characters for the pattern. */  
1269    if (*errorptr != NULL || (!bits_set && min < 0)) return NULL;
 memset(start_bits, 0, 32 * sizeof(uschar));  
 if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,  
   (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;  
1270    
1271  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
1272  the latter, which is pointed to by the former, which may also get additional  the latter, which is pointed to by the former, which may also get additional
# Line 579  extra->flags = PCRE_EXTRA_STUDY_DATA; Line 1289  extra->flags = PCRE_EXTRA_STUDY_DATA;
1289  extra->study_data = study;  extra->study_data = study;
1290    
1291  study->size = sizeof(pcre_study_data);  study->size = sizeof(pcre_study_data);
1292  study->options = PCRE_STUDY_MAPPED;  study->flags = 0;
1293  memcpy(study->start_bits, start_bits, sizeof(start_bits));  
1294    if (bits_set)
1295      {
1296      study->flags |= PCRE_STUDY_MAPPED;
1297      memcpy(study->start_bits, start_bits, sizeof(start_bits));
1298      }
1299    
1300    if (min >= 0)
1301      {
1302      study->flags |= PCRE_STUDY_MINLEN;
1303      study->minlength = min;
1304      }
1305    
1306  return extra;  return extra;
1307  }  }

Legend:
Removed from v.353  
changed lines
  Added in v.611

  ViewVC Help
Powered by ViewVC 1.1.5