/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Diff of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 359 by ph10, Wed Jul 9 16:20:19 2008 UTC revision 469 by ph10, Mon Oct 19 14:38:48 2009 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2008 University of Cambridge             Copyright (c) 1997-2009 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 54  supporting functions. */ Line 54  supporting functions. */
54  enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };  enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
55    
56    
57    
58    /*************************************************
59    *   Find the minimum subject length for a group  *
60    *************************************************/
61    
62    /* Scan a parenthesized group and compute the minimum length of subject that
63    is needed to match it. This is a lower bound; it does not mean there is a
64    string of that length that matches. In UTF8 mode, the result is in characters
65    rather than bytes.
66    
67    Arguments:
68      code       pointer to start of group (the bracket)
69      startcode  pointer to start of the whole pattern
70      options    the compiling options
71    
72    Returns:   the minimum length
73               -1 if \C was encountered
74               -2 internal error (missing capturing bracket)
75    */
76    
77    static int
78    find_minlength(const uschar *code, const uschar *startcode, int options)
79    {
80    int length = -1;
81    BOOL utf8 = (options & PCRE_UTF8) != 0;
82    BOOL had_recurse = FALSE;
83    register int branchlength = 0;
84    register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
85    
86    if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2;
87    
88    /* Scan along the opcodes for this branch. If we get to the end of the
89    branch, check the length against that of the other branches. */
90    
91    for (;;)
92      {
93      int d, min;
94      uschar *cs, *ce;
95      register int op = *cc;
96    
97      switch (op)
98        {
99        case OP_CBRA:
100        case OP_SCBRA:
101        case OP_BRA:
102        case OP_SBRA:
103        case OP_ONCE:
104        case OP_COND:
105        case OP_SCOND:
106        d = find_minlength(cc, startcode, options);
107        if (d < 0) return d;
108        branchlength += d;
109        do cc += GET(cc, 1); while (*cc == OP_ALT);
110        cc += 1 + LINK_SIZE;
111        break;
112    
113        /* Reached end of a branch; if it's a ket it is the end of a nested
114        call. If it's ALT it is an alternation in a nested call. If it is
115        END it's the end of the outer call. All can be handled by the same code. */
116    
117        case OP_ALT:
118        case OP_KET:
119        case OP_KETRMAX:
120        case OP_KETRMIN:
121        case OP_END:
122        if (length < 0 || (!had_recurse && branchlength < length))
123          length = branchlength;
124        if (*cc != OP_ALT) return length;
125        cc += 1 + LINK_SIZE;
126        branchlength = 0;
127        had_recurse = FALSE;
128        break;
129    
130        /* Skip over assertive subpatterns */
131    
132        case OP_ASSERT:
133        case OP_ASSERT_NOT:
134        case OP_ASSERTBACK:
135        case OP_ASSERTBACK_NOT:
136        do cc += GET(cc, 1); while (*cc == OP_ALT);
137        /* Fall through */
138    
139        /* Skip over things that don't match chars */
140    
141        case OP_REVERSE:
142        case OP_CREF:
143        case OP_NCREF:
144        case OP_RREF:
145        case OP_NRREF:
146        case OP_DEF:
147        case OP_OPT:
148        case OP_CALLOUT:
149        case OP_SOD:
150        case OP_SOM:
151        case OP_EOD:
152        case OP_EODN:
153        case OP_CIRC:
154        case OP_DOLL:
155        case OP_NOT_WORD_BOUNDARY:
156        case OP_WORD_BOUNDARY:
157        cc += _pcre_OP_lengths[*cc];
158        break;
159    
160        /* Skip over a subpattern that has a {0} or {0,x} quantifier */
161    
162        case OP_BRAZERO:
163        case OP_BRAMINZERO:
164        case OP_SKIPZERO:
165        cc += _pcre_OP_lengths[*cc];
166        do cc += GET(cc, 1); while (*cc == OP_ALT);
167        cc += 1 + LINK_SIZE;
168        break;
169    
170        /* Handle literal characters and + repetitions */
171    
172        case OP_CHAR:
173        case OP_CHARNC:
174        case OP_NOT:
175        case OP_PLUS:
176        case OP_MINPLUS:
177        case OP_POSPLUS:
178        case OP_NOTPLUS:
179        case OP_NOTMINPLUS:
180        case OP_NOTPOSPLUS:
181        branchlength++;
182        cc += 2;
183    #ifdef SUPPORT_UTF8
184        if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
185    #endif
186        break;
187    
188        case OP_TYPEPLUS:
189        case OP_TYPEMINPLUS:
190        case OP_TYPEPOSPLUS:
191        branchlength++;
192        cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
193        break;
194    
195        /* Handle exact repetitions. The count is already in characters, but we
196        need to skip over a multibyte character in UTF8 mode.  */
197    
198        case OP_EXACT:
199        case OP_NOTEXACT:
200        branchlength += GET2(cc,1);
201        cc += 4;
202    #ifdef SUPPORT_UTF8
203        if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
204    #endif
205        break;
206    
207        case OP_TYPEEXACT:
208        branchlength += GET2(cc,1);
209        cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
210        break;
211    
212        /* Handle single-char non-literal matchers */
213    
214        case OP_PROP:
215        case OP_NOTPROP:
216        cc += 2;
217        /* Fall through */
218    
219        case OP_NOT_DIGIT:
220        case OP_DIGIT:
221        case OP_NOT_WHITESPACE:
222        case OP_WHITESPACE:
223        case OP_NOT_WORDCHAR:
224        case OP_WORDCHAR:
225        case OP_ANY:
226        case OP_ALLANY:
227        case OP_EXTUNI:
228        case OP_HSPACE:
229        case OP_NOT_HSPACE:
230        case OP_VSPACE:
231        case OP_NOT_VSPACE:
232        branchlength++;
233        cc++;
234        break;
235    
236        /* "Any newline" might match two characters */
237    
238        case OP_ANYNL:
239        branchlength += 2;
240        cc++;
241        break;
242    
243        /* The single-byte matcher means we can't proceed in UTF-8 mode */
244    
245        case OP_ANYBYTE:
246    #ifdef SUPPORT_UTF8
247        if (utf8) return -1;
248    #endif
249        branchlength++;
250        cc++;
251        break;
252    
253        /* For repeated character types, we have to test for \p and \P, which have
254        an extra two bytes of parameters. */
255    
256        case OP_TYPESTAR:
257        case OP_TYPEMINSTAR:
258        case OP_TYPEQUERY:
259        case OP_TYPEMINQUERY:
260        case OP_TYPEPOSSTAR:
261        case OP_TYPEPOSQUERY:
262        if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
263        cc += _pcre_OP_lengths[op];
264        break;
265    
266        case OP_TYPEUPTO:
267        case OP_TYPEMINUPTO:
268        case OP_TYPEPOSUPTO:
269        if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
270        cc += _pcre_OP_lengths[op];
271        break;
272    
273        /* Check a class for variable quantification */
274    
275    #ifdef SUPPORT_UTF8
276        case OP_XCLASS:
277        cc += GET(cc, 1) - 33;
278        /* Fall through */
279    #endif
280    
281        case OP_CLASS:
282        case OP_NCLASS:
283        cc += 33;
284    
285        switch (*cc)
286          {
287          case OP_CRPLUS:
288          case OP_CRMINPLUS:
289          branchlength++;
290          /* Fall through */
291    
292          case OP_CRSTAR:
293          case OP_CRMINSTAR:
294          case OP_CRQUERY:
295          case OP_CRMINQUERY:
296          cc++;
297          break;
298    
299          case OP_CRRANGE:
300          case OP_CRMINRANGE:
301          branchlength += GET2(cc,1);
302          cc += 5;
303          break;
304    
305          default:
306          branchlength++;
307          break;
308          }
309        break;
310    
311        /* Backreferences and subroutine calls are treated in the same way: we find
312        the minimum length for the subpattern. A recursion, however, causes an
313        a flag to be set that causes the length of this branch to be ignored. The
314        logic is that a recursion can only make sense if there is another
315        alternation that stops the recursing. That will provide the minimum length
316        (when no recursion happens). A backreference within the group that it is
317        referencing behaves in the same way.
318    
319        If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
320        matches an empty string (by default it causes a matching failure), so in
321        that case we must set the minimum length to zero. */
322    
323        case OP_REF:
324        if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
325          {
326          ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
327          if (cs == NULL) return -2;
328          do ce += GET(ce, 1); while (*ce == OP_ALT);
329          if (cc > cs && cc < ce)
330            {
331            d = 0;
332            had_recurse = TRUE;
333            }
334          else d = find_minlength(cs, startcode, options);
335          }
336        else d = 0;
337        cc += 3;
338    
339        /* Handle repeated back references */
340    
341        switch (*cc)
342          {
343          case OP_CRSTAR:
344          case OP_CRMINSTAR:
345          case OP_CRQUERY:
346          case OP_CRMINQUERY:
347          min = 0;
348          cc++;
349          break;
350    
351          case OP_CRRANGE:
352          case OP_CRMINRANGE:
353          min = GET2(cc, 1);
354          cc += 5;
355          break;
356    
357          default:
358          min = 1;
359          break;
360          }
361    
362        branchlength += min * d;
363        break;
364    
365        case OP_RECURSE:
366        cs = ce = (uschar *)startcode + GET(cc, 1);
367        if (cs == NULL) return -2;
368        do ce += GET(ce, 1); while (*ce == OP_ALT);
369        if (cc > cs && cc < ce)
370          had_recurse = TRUE;
371        else
372          branchlength += find_minlength(cs, startcode, options);
373        cc += 1 + LINK_SIZE;
374        break;
375    
376        /* Anything else does not or need not match a character. We can get the
377        item's length from the table, but for those that can match zero occurrences
378        of a character, we must take special action for UTF-8 characters. */
379    
380        case OP_UPTO:
381        case OP_NOTUPTO:
382        case OP_MINUPTO:
383        case OP_NOTMINUPTO:
384        case OP_POSUPTO:
385        case OP_STAR:
386        case OP_MINSTAR:
387        case OP_NOTMINSTAR:
388        case OP_POSSTAR:
389        case OP_NOTPOSSTAR:
390        case OP_QUERY:
391        case OP_MINQUERY:
392        case OP_NOTMINQUERY:
393        case OP_POSQUERY:
394        case OP_NOTPOSQUERY:
395        cc += _pcre_OP_lengths[op];
396    #ifdef SUPPORT_UTF8
397        if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
398    #endif
399        break;
400    
401        /* For the record, these are the opcodes that are matched by "default":
402        OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
403        OP_THEN. */
404    
405        default:
406        cc += _pcre_OP_lengths[op];
407        break;
408        }
409      }
410    /* Control never gets here */
411    }
412    
413    
414    
415  /*************************************************  /*************************************************
416  *      Set a bit and maybe its alternate case    *  *      Set a bit and maybe its alternate case    *
417  *************************************************/  *************************************************/
# Line 500  Arguments: Line 858  Arguments:
858              set NULL unless error              set NULL unless error
859    
860  Returns:    pointer to a pcre_extra block, with study_data filled in and the  Returns:    pointer to a pcre_extra block, with study_data filled in and the
861                appropriate flag set;                appropriate flags set;
862              NULL on error or if no optimization possible              NULL on error or if no optimization possible
863  */  */
864    
865  PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION  PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
866  pcre_study(const pcre *external_re, int options, const char **errorptr)  pcre_study(const pcre *external_re, int options, const char **errorptr)
867  {  {
868    int min;
869    BOOL bits_set = FALSE;
870  uschar start_bits[32];  uschar start_bits[32];
871  pcre_extra *extra;  pcre_extra *extra;
872  pcre_study_data *study;  pcre_study_data *study;
# Line 533  code = (uschar *)re + re->name_table_off Line 893  code = (uschar *)re + re->name_table_off
893    (re->name_count * re->name_entry_size);    (re->name_count * re->name_entry_size);
894    
895  /* For an anchored pattern, or an unanchored pattern that has a first char, or  /* For an anchored pattern, or an unanchored pattern that has a first char, or
896  a multiline pattern that matches only at "line starts", no further processing  a multiline pattern that matches only at "line starts", there is no point in
897  at present. */  seeking a list of starting bytes. */
898    
899  if ((re->options & PCRE_ANCHORED) != 0 ||  if ((re->options & PCRE_ANCHORED) == 0 &&
900      (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)      (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
901    return NULL;    {
902      /* Set the character tables in the block that is passed around */
903    
904  /* Set the character tables in the block that is passed around */    tables = re->tables;
905      if (tables == NULL)
906        (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
907        (void *)(&tables));
908    
909      compile_block.lcc = tables + lcc_offset;
910      compile_block.fcc = tables + fcc_offset;
911      compile_block.cbits = tables + cbits_offset;
912      compile_block.ctypes = tables + ctypes_offset;
913    
914      /* See if we can find a fixed set of initial characters for the pattern. */
915    
916      memset(start_bits, 0, 32 * sizeof(uschar));
917      bits_set = set_start_bits(code, start_bits,
918        (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,
919        &compile_block) == SSB_DONE;
920      }
921    
922  tables = re->tables;  /* Find the minimum length of subject string. */
923  if (tables == NULL)  
924    (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,  min = find_minlength(code, code, re->options);
925    (void *)(&tables));  
926    /* Return NULL if no optimization is possible. */
927  compile_block.lcc = tables + lcc_offset;  
928  compile_block.fcc = tables + fcc_offset;  if (!bits_set && min < 0) return NULL;
 compile_block.cbits = tables + cbits_offset;  
 compile_block.ctypes = tables + ctypes_offset;  
   
 /* See if we can find a fixed set of initial characters for the pattern. */  
   
 memset(start_bits, 0, 32 * sizeof(uschar));  
 if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,  
   (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;  
929    
930  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in  /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
931  the latter, which is pointed to by the former, which may also get additional  the latter, which is pointed to by the former, which may also get additional
# Line 579  extra->flags = PCRE_EXTRA_STUDY_DATA; Line 948  extra->flags = PCRE_EXTRA_STUDY_DATA;
948  extra->study_data = study;  extra->study_data = study;
949    
950  study->size = sizeof(pcre_study_data);  study->size = sizeof(pcre_study_data);
951  study->options = PCRE_STUDY_MAPPED;  study->flags = 0;
952  memcpy(study->start_bits, start_bits, sizeof(start_bits));  
953    if (bits_set)
954      {
955      study->flags |= PCRE_STUDY_MAPPED;
956      memcpy(study->start_bits, start_bits, sizeof(start_bits));
957      }
958    
959    if (min >= 0)
960      {
961      study->flags |= PCRE_STUDY_MINLEN;
962      study->minlength = min;
963      }
964    
965  return extra;  return extra;
966  }  }

Legend:
Removed from v.359  
changed lines
  Added in v.469

  ViewVC Help
Powered by ViewVC 1.1.5