/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Contents of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 612 - (show annotations)
Sat Jul 2 15:20:59 2011 UTC (4 years, 1 month ago) by ph10
File MIME type: text/plain
File size: 37197 byte(s)
Error occurred while calculating annotation data.
Fix two study bugs concerned with minimum subject lengths; add features to 
pcretest so that all tests can be run with or without study; adjust tests so 
that this happens.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2010 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_study(), along with local
42 supporting functions. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #include "pcre_internal.h"
50
51 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
52
53 /* Returns from set_start_bits() */
54
55 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
56
57
58
59 /*************************************************
60 * Find the minimum subject length for a group *
61 *************************************************/
62
63 /* Scan a parenthesized group and compute the minimum length of subject that
64 is needed to match it. This is a lower bound; it does not mean there is a
65 string of that length that matches. In UTF8 mode, the result is in characters
66 rather than bytes.
67
68 Arguments:
69 code pointer to start of group (the bracket)
70 startcode pointer to start of the whole pattern
71 options the compiling options
72 had_accept pointer to flag for (*ACCEPT) encountered
73
74 Returns: the minimum length
75 -1 if \C was encountered
76 -2 internal error (missing capturing bracket)
77 -3 internal error (opcode not listed)
78 */
79
80 static int
81 find_minlength(const uschar *code, const uschar *startcode, int options,
82 BOOL *had_accept_ptr)
83 {
84 int length = -1;
85 BOOL utf8 = (options & PCRE_UTF8) != 0;
86 BOOL had_recurse = FALSE;
87 register int branchlength = 0;
88 register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
89
90 if (*code == OP_CBRA || *code == OP_SCBRA ||
91 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += 2;
92
93 /* Scan along the opcodes for this branch. If we get to the end of the
94 branch, check the length against that of the other branches. */
95
96 for (;;)
97 {
98 int d, min;
99 uschar *cs, *ce;
100 register int op = *cc;
101
102 switch (op)
103 {
104 case OP_COND:
105 case OP_SCOND:
106
107 /* If there is only one branch in a condition, the implied branch has zero
108 length, so we don't add anything. This covers the DEFINE "condition"
109 automatically. */
110
111 cs = cc + GET(cc, 1);
112 if (*cs != OP_ALT)
113 {
114 cc = cs + 1 + LINK_SIZE;
115 break;
116 }
117
118 /* Otherwise we can fall through and treat it the same as any other
119 subpattern. */
120
121 case OP_CBRA:
122 case OP_SCBRA:
123 case OP_BRA:
124 case OP_SBRA:
125 case OP_CBRAPOS:
126 case OP_SCBRAPOS:
127 case OP_BRAPOS:
128 case OP_SBRAPOS:
129 case OP_ONCE:
130 d = find_minlength(cc, startcode, options, had_accept_ptr);
131 if (d < 0) return d;
132 branchlength += d;
133 if (*had_accept_ptr) return branchlength;
134 do cc += GET(cc, 1); while (*cc == OP_ALT);
135 cc += 1 + LINK_SIZE;
136 break;
137
138 /* Reached end of a branch; if it's a ket it is the end of a nested
139 call. If it's ALT it is an alternation in a nested call. If it is END it's
140 the end of the outer call. All can be handled by the same code. If it is
141 ACCEPT, it is essentially the same as END, but we set a flag so that
142 counting stops. */
143
144 case OP_ACCEPT:
145 *had_accept_ptr = TRUE;
146 /* Fall through */
147 case OP_ALT:
148 case OP_KET:
149 case OP_KETRMAX:
150 case OP_KETRMIN:
151 case OP_KETRPOS:
152 case OP_END:
153 if (length < 0 || (!had_recurse && branchlength < length))
154 length = branchlength;
155 if (op != OP_ALT) return length;
156 cc += 1 + LINK_SIZE;
157 branchlength = 0;
158 had_recurse = FALSE;
159 break;
160
161 /* Skip over assertive subpatterns */
162
163 case OP_ASSERT:
164 case OP_ASSERT_NOT:
165 case OP_ASSERTBACK:
166 case OP_ASSERTBACK_NOT:
167 do cc += GET(cc, 1); while (*cc == OP_ALT);
168 /* Fall through */
169
170 /* Skip over things that don't match chars */
171
172 case OP_REVERSE:
173 case OP_CREF:
174 case OP_NCREF:
175 case OP_RREF:
176 case OP_NRREF:
177 case OP_DEF:
178 case OP_CALLOUT:
179 case OP_SOD:
180 case OP_SOM:
181 case OP_EOD:
182 case OP_EODN:
183 case OP_CIRC:
184 case OP_CIRCM:
185 case OP_DOLL:
186 case OP_DOLLM:
187 case OP_NOT_WORD_BOUNDARY:
188 case OP_WORD_BOUNDARY:
189 cc += _pcre_OP_lengths[*cc];
190 break;
191
192 /* Skip over a subpattern that has a {0} or {0,x} quantifier */
193
194 case OP_BRAZERO:
195 case OP_BRAMINZERO:
196 case OP_BRAPOSZERO:
197 case OP_SKIPZERO:
198 cc += _pcre_OP_lengths[*cc];
199 do cc += GET(cc, 1); while (*cc == OP_ALT);
200 cc += 1 + LINK_SIZE;
201 break;
202
203 /* Handle literal characters and + repetitions */
204
205 case OP_CHAR:
206 case OP_CHARI:
207 case OP_NOT:
208 case OP_NOTI:
209 case OP_PLUS:
210 case OP_PLUSI:
211 case OP_MINPLUS:
212 case OP_MINPLUSI:
213 case OP_POSPLUS:
214 case OP_POSPLUSI:
215 case OP_NOTPLUS:
216 case OP_NOTPLUSI:
217 case OP_NOTMINPLUS:
218 case OP_NOTMINPLUSI:
219 case OP_NOTPOSPLUS:
220 case OP_NOTPOSPLUSI:
221 branchlength++;
222 cc += 2;
223 #ifdef SUPPORT_UTF8
224 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
225 #endif
226 break;
227
228 case OP_TYPEPLUS:
229 case OP_TYPEMINPLUS:
230 case OP_TYPEPOSPLUS:
231 branchlength++;
232 cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
233 break;
234
235 /* Handle exact repetitions. The count is already in characters, but we
236 need to skip over a multibyte character in UTF8 mode. */
237
238 case OP_EXACT:
239 case OP_EXACTI:
240 case OP_NOTEXACT:
241 case OP_NOTEXACTI:
242 branchlength += GET2(cc,1);
243 cc += 4;
244 #ifdef SUPPORT_UTF8
245 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
246 #endif
247 break;
248
249 case OP_TYPEEXACT:
250 branchlength += GET2(cc,1);
251 cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
252 break;
253
254 /* Handle single-char non-literal matchers */
255
256 case OP_PROP:
257 case OP_NOTPROP:
258 cc += 2;
259 /* Fall through */
260
261 case OP_NOT_DIGIT:
262 case OP_DIGIT:
263 case OP_NOT_WHITESPACE:
264 case OP_WHITESPACE:
265 case OP_NOT_WORDCHAR:
266 case OP_WORDCHAR:
267 case OP_ANY:
268 case OP_ALLANY:
269 case OP_EXTUNI:
270 case OP_HSPACE:
271 case OP_NOT_HSPACE:
272 case OP_VSPACE:
273 case OP_NOT_VSPACE:
274 branchlength++;
275 cc++;
276 break;
277
278 /* "Any newline" might match two characters, but it also might match just
279 one. */
280
281 case OP_ANYNL:
282 branchlength += 1;
283 cc++;
284 break;
285
286 /* The single-byte matcher means we can't proceed in UTF-8 mode */
287
288 case OP_ANYBYTE:
289 #ifdef SUPPORT_UTF8
290 if (utf8) return -1;
291 #endif
292 branchlength++;
293 cc++;
294 break;
295
296 /* For repeated character types, we have to test for \p and \P, which have
297 an extra two bytes of parameters. */
298
299 case OP_TYPESTAR:
300 case OP_TYPEMINSTAR:
301 case OP_TYPEQUERY:
302 case OP_TYPEMINQUERY:
303 case OP_TYPEPOSSTAR:
304 case OP_TYPEPOSQUERY:
305 if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
306 cc += _pcre_OP_lengths[op];
307 break;
308
309 case OP_TYPEUPTO:
310 case OP_TYPEMINUPTO:
311 case OP_TYPEPOSUPTO:
312 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
313 cc += _pcre_OP_lengths[op];
314 break;
315
316 /* Check a class for variable quantification */
317
318 #ifdef SUPPORT_UTF8
319 case OP_XCLASS:
320 cc += GET(cc, 1) - 33;
321 /* Fall through */
322 #endif
323
324 case OP_CLASS:
325 case OP_NCLASS:
326 cc += 33;
327
328 switch (*cc)
329 {
330 case OP_CRPLUS:
331 case OP_CRMINPLUS:
332 branchlength++;
333 /* Fall through */
334
335 case OP_CRSTAR:
336 case OP_CRMINSTAR:
337 case OP_CRQUERY:
338 case OP_CRMINQUERY:
339 cc++;
340 break;
341
342 case OP_CRRANGE:
343 case OP_CRMINRANGE:
344 branchlength += GET2(cc,1);
345 cc += 5;
346 break;
347
348 default:
349 branchlength++;
350 break;
351 }
352 break;
353
354 /* Backreferences and subroutine calls are treated in the same way: we find
355 the minimum length for the subpattern. A recursion, however, causes an
356 a flag to be set that causes the length of this branch to be ignored. The
357 logic is that a recursion can only make sense if there is another
358 alternation that stops the recursing. That will provide the minimum length
359 (when no recursion happens). A backreference within the group that it is
360 referencing behaves in the same way.
361
362 If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
363 matches an empty string (by default it causes a matching failure), so in
364 that case we must set the minimum length to zero. */
365
366 case OP_REF:
367 case OP_REFI:
368 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
369 {
370 ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
371 if (cs == NULL) return -2;
372 do ce += GET(ce, 1); while (*ce == OP_ALT);
373 if (cc > cs && cc < ce)
374 {
375 d = 0;
376 had_recurse = TRUE;
377 }
378 else
379 {
380 d = find_minlength(cs, startcode, options, had_accept_ptr);
381 *had_accept_ptr = FALSE;
382 }
383 }
384 else d = 0;
385 cc += 3;
386
387 /* Handle repeated back references */
388
389 switch (*cc)
390 {
391 case OP_CRSTAR:
392 case OP_CRMINSTAR:
393 case OP_CRQUERY:
394 case OP_CRMINQUERY:
395 min = 0;
396 cc++;
397 break;
398
399 case OP_CRPLUS:
400 case OP_CRMINPLUS:
401 min = 1;
402 cc++;
403 break;
404
405 case OP_CRRANGE:
406 case OP_CRMINRANGE:
407 min = GET2(cc, 1);
408 cc += 5;
409 break;
410
411 default:
412 min = 1;
413 break;
414 }
415
416 branchlength += min * d;
417 break;
418
419 case OP_RECURSE:
420 cs = ce = (uschar *)startcode + GET(cc, 1);
421 if (cs == NULL) return -2;
422 do ce += GET(ce, 1); while (*ce == OP_ALT);
423 if (cc > cs && cc < ce)
424 had_recurse = TRUE;
425 else
426 {
427 branchlength += find_minlength(cs, startcode, options, had_accept_ptr);
428 *had_accept_ptr = FALSE;
429 }
430 cc += 1 + LINK_SIZE;
431 break;
432
433 /* Anything else does not or need not match a character. We can get the
434 item's length from the table, but for those that can match zero occurrences
435 of a character, we must take special action for UTF-8 characters. As it
436 happens, the "NOT" versions of these opcodes are used at present only for
437 ASCII characters, so they could be omitted from this list. However, in
438 future that may change, so we include them here so as not to leave a
439 gotcha for a future maintainer. */
440
441 case OP_UPTO:
442 case OP_UPTOI:
443 case OP_NOTUPTO:
444 case OP_NOTUPTOI:
445 case OP_MINUPTO:
446 case OP_MINUPTOI:
447 case OP_NOTMINUPTO:
448 case OP_NOTMINUPTOI:
449 case OP_POSUPTO:
450 case OP_POSUPTOI:
451 case OP_NOTPOSUPTO:
452 case OP_NOTPOSUPTOI:
453
454 case OP_STAR:
455 case OP_STARI:
456 case OP_NOTSTAR:
457 case OP_NOTSTARI:
458 case OP_MINSTAR:
459 case OP_MINSTARI:
460 case OP_NOTMINSTAR:
461 case OP_NOTMINSTARI:
462 case OP_POSSTAR:
463 case OP_POSSTARI:
464 case OP_NOTPOSSTAR:
465 case OP_NOTPOSSTARI:
466
467 case OP_QUERY:
468 case OP_QUERYI:
469 case OP_NOTQUERY:
470 case OP_NOTQUERYI:
471 case OP_MINQUERY:
472 case OP_MINQUERYI:
473 case OP_NOTMINQUERY:
474 case OP_NOTMINQUERYI:
475 case OP_POSQUERY:
476 case OP_POSQUERYI:
477 case OP_NOTPOSQUERY:
478 case OP_NOTPOSQUERYI:
479
480 cc += _pcre_OP_lengths[op];
481 #ifdef SUPPORT_UTF8
482 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
483 #endif
484 break;
485
486 /* Skip these, but we need to add in the name length. */
487
488 case OP_MARK:
489 case OP_PRUNE_ARG:
490 case OP_SKIP_ARG:
491 cc += _pcre_OP_lengths[op] + cc[1];
492 break;
493
494 case OP_THEN_ARG:
495 cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
496 break;
497
498 /* The remaining opcodes are just skipped over. */
499
500 case OP_CLOSE:
501 case OP_COMMIT:
502 case OP_FAIL:
503 case OP_PRUNE:
504 case OP_SET_SOM:
505 case OP_SKIP:
506 case OP_THEN:
507 cc += _pcre_OP_lengths[op];
508 break;
509
510 /* This should not occur: we list all opcodes explicitly so that when
511 new ones get added they are properly considered. */
512
513 default:
514 return -3;
515 }
516 }
517 /* Control never gets here */
518 }
519
520
521
522 /*************************************************
523 * Set a bit and maybe its alternate case *
524 *************************************************/
525
526 /* Given a character, set its first byte's bit in the table, and also the
527 corresponding bit for the other version of a letter if we are caseless. In
528 UTF-8 mode, for characters greater than 127, we can only do the caseless thing
529 when Unicode property support is available.
530
531 Arguments:
532 start_bits points to the bit map
533 p points to the character
534 caseless the caseless flag
535 cd the block with char table pointers
536 utf8 TRUE for UTF-8 mode
537
538 Returns: pointer after the character
539 */
540
541 static const uschar *
542 set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
543 compile_data *cd, BOOL utf8)
544 {
545 unsigned int c = *p;
546
547 SET_BIT(c);
548
549 #ifdef SUPPORT_UTF8
550 if (utf8 && c > 127)
551 {
552 GETCHARINC(c, p);
553 #ifdef SUPPORT_UCP
554 if (caseless)
555 {
556 uschar buff[8];
557 c = UCD_OTHERCASE(c);
558 (void)_pcre_ord2utf8(c, buff);
559 SET_BIT(buff[0]);
560 }
561 #endif
562 return p;
563 }
564 #endif
565
566 /* Not UTF-8 mode, or character is less than 127. */
567
568 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
569 return p + 1;
570 }
571
572
573
574 /*************************************************
575 * Set bits for a positive character type *
576 *************************************************/
577
578 /* This function sets starting bits for a character type. In UTF-8 mode, we can
579 only do a direct setting for bytes less than 128, as otherwise there can be
580 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
581 environment, the tables will only recognize ASCII characters anyway, but in at
582 least one Windows environment, some higher bytes bits were set in the tables.
583 So we deal with that case by considering the UTF-8 encoding.
584
585 Arguments:
586 start_bits the starting bitmap
587 cbit type the type of character wanted
588 table_limit 32 for non-UTF-8; 16 for UTF-8
589 cd the block with char table pointers
590
591 Returns: nothing
592 */
593
594 static void
595 set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
596 compile_data *cd)
597 {
598 register int c;
599 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
600 if (table_limit == 32) return;
601 for (c = 128; c < 256; c++)
602 {
603 if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
604 {
605 uschar buff[8];
606 (void)_pcre_ord2utf8(c, buff);
607 SET_BIT(buff[0]);
608 }
609 }
610 }
611
612
613 /*************************************************
614 * Set bits for a negative character type *
615 *************************************************/
616
617 /* This function sets starting bits for a negative character type such as \D.
618 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
619 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
620 Unlike in the positive case, where we can set appropriate starting bits for
621 specific high-valued UTF-8 characters, in this case we have to set the bits for
622 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
623 0xc0 (192) for simplicity.
624
625 Arguments:
626 start_bits the starting bitmap
627 cbit type the type of character wanted
628 table_limit 32 for non-UTF-8; 16 for UTF-8
629 cd the block with char table pointers
630
631 Returns: nothing
632 */
633
634 static void
635 set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
636 compile_data *cd)
637 {
638 register int c;
639 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
640 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
641 }
642
643
644
645 /*************************************************
646 * Create bitmap of starting bytes *
647 *************************************************/
648
649 /* This function scans a compiled unanchored expression recursively and
650 attempts to build a bitmap of the set of possible starting bytes. As time goes
651 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
652 useful for parenthesized groups in patterns such as (a*)b where the group
653 provides some optional starting bytes but scanning must continue at the outer
654 level to find at least one mandatory byte. At the outermost level, this
655 function fails unless the result is SSB_DONE.
656
657 Arguments:
658 code points to an expression
659 start_bits points to a 32-byte table, initialized to 0
660 utf8 TRUE if in UTF-8 mode
661 cd the block with char table pointers
662
663 Returns: SSB_FAIL => Failed to find any starting bytes
664 SSB_DONE => Found mandatory starting bytes
665 SSB_CONTINUE => Found optional starting bytes
666 SSB_UNKNOWN => Hit an unrecognized opcode
667 */
668
669 static int
670 set_start_bits(const uschar *code, uschar *start_bits, BOOL utf8,
671 compile_data *cd)
672 {
673 register int c;
674 int yield = SSB_DONE;
675 int table_limit = utf8? 16:32;
676
677 #if 0
678 /* ========================================================================= */
679 /* The following comment and code was inserted in January 1999. In May 2006,
680 when it was observed to cause compiler warnings about unused values, I took it
681 out again. If anybody is still using OS/2, they will have to put it back
682 manually. */
683
684 /* This next statement and the later reference to dummy are here in order to
685 trick the optimizer of the IBM C compiler for OS/2 into generating correct
686 code. Apparently IBM isn't going to fix the problem, and we would rather not
687 disable optimization (in this module it actually makes a big difference, and
688 the pcre module can use all the optimization it can get). */
689
690 volatile int dummy;
691 /* ========================================================================= */
692 #endif
693
694 do
695 {
696 BOOL try_next = TRUE;
697 const uschar *tcode = code + 1 + LINK_SIZE;
698
699 if (*code == OP_CBRA || *code == OP_SCBRA ||
700 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += 2;
701
702 while (try_next) /* Loop for items in this branch */
703 {
704 int rc;
705
706 switch(*tcode)
707 {
708 /* If we reach something we don't understand, it means a new opcode has
709 been created that hasn't been added to this code. Hopefully this problem
710 will be discovered during testing. */
711
712 default:
713 return SSB_UNKNOWN;
714
715 /* Fail for a valid opcode that implies no starting bits. */
716
717 case OP_ACCEPT:
718 case OP_ALLANY:
719 case OP_ANY:
720 case OP_ANYBYTE:
721 case OP_CIRC:
722 case OP_CIRCM:
723 case OP_CLOSE:
724 case OP_COMMIT:
725 case OP_COND:
726 case OP_CREF:
727 case OP_DEF:
728 case OP_DOLL:
729 case OP_DOLLM:
730 case OP_END:
731 case OP_EOD:
732 case OP_EODN:
733 case OP_EXTUNI:
734 case OP_FAIL:
735 case OP_MARK:
736 case OP_NCREF:
737 case OP_NOT:
738 case OP_NOTEXACT:
739 case OP_NOTEXACTI:
740 case OP_NOTI:
741 case OP_NOTMINPLUS:
742 case OP_NOTMINPLUSI:
743 case OP_NOTMINQUERY:
744 case OP_NOTMINQUERYI:
745 case OP_NOTMINSTAR:
746 case OP_NOTMINSTARI:
747 case OP_NOTMINUPTO:
748 case OP_NOTMINUPTOI:
749 case OP_NOTPLUS:
750 case OP_NOTPLUSI:
751 case OP_NOTPOSPLUS:
752 case OP_NOTPOSPLUSI:
753 case OP_NOTPOSQUERY:
754 case OP_NOTPOSQUERYI:
755 case OP_NOTPOSSTAR:
756 case OP_NOTPOSSTARI:
757 case OP_NOTPOSUPTO:
758 case OP_NOTPOSUPTOI:
759 case OP_NOTPROP:
760 case OP_NOTQUERY:
761 case OP_NOTQUERYI:
762 case OP_NOTSTAR:
763 case OP_NOTSTARI:
764 case OP_NOTUPTO:
765 case OP_NOTUPTOI:
766 case OP_NOT_HSPACE:
767 case OP_NOT_VSPACE:
768 case OP_NOT_WORD_BOUNDARY:
769 case OP_NRREF:
770 case OP_PROP:
771 case OP_PRUNE:
772 case OP_PRUNE_ARG:
773 case OP_RECURSE:
774 case OP_REF:
775 case OP_REFI:
776 case OP_REVERSE:
777 case OP_RREF:
778 case OP_SCOND:
779 case OP_SET_SOM:
780 case OP_SKIP:
781 case OP_SKIP_ARG:
782 case OP_SOD:
783 case OP_SOM:
784 case OP_THEN:
785 case OP_THEN_ARG:
786 case OP_WORD_BOUNDARY:
787 case OP_XCLASS:
788 return SSB_FAIL;
789
790 /* If we hit a bracket or a positive lookahead assertion, recurse to set
791 bits from within the subpattern. If it can't find anything, we have to
792 give up. If it finds some mandatory character(s), we are done for this
793 branch. Otherwise, carry on scanning after the subpattern. */
794
795 case OP_BRA:
796 case OP_SBRA:
797 case OP_CBRA:
798 case OP_SCBRA:
799 case OP_BRAPOS:
800 case OP_SBRAPOS:
801 case OP_CBRAPOS:
802 case OP_SCBRAPOS:
803 case OP_ONCE:
804 case OP_ASSERT:
805 rc = set_start_bits(tcode, start_bits, utf8, cd);
806 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
807 if (rc == SSB_DONE) try_next = FALSE; else
808 {
809 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
810 tcode += 1 + LINK_SIZE;
811 }
812 break;
813
814 /* If we hit ALT or KET, it means we haven't found anything mandatory in
815 this branch, though we might have found something optional. For ALT, we
816 continue with the next alternative, but we have to arrange that the final
817 result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
818 return SSB_CONTINUE: if this is the top level, that indicates failure,
819 but after a nested subpattern, it causes scanning to continue. */
820
821 case OP_ALT:
822 yield = SSB_CONTINUE;
823 try_next = FALSE;
824 break;
825
826 case OP_KET:
827 case OP_KETRMAX:
828 case OP_KETRMIN:
829 case OP_KETRPOS:
830 return SSB_CONTINUE;
831
832 /* Skip over callout */
833
834 case OP_CALLOUT:
835 tcode += 2 + 2*LINK_SIZE;
836 break;
837
838 /* Skip over lookbehind and negative lookahead assertions */
839
840 case OP_ASSERT_NOT:
841 case OP_ASSERTBACK:
842 case OP_ASSERTBACK_NOT:
843 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
844 tcode += 1 + LINK_SIZE;
845 break;
846
847 /* BRAZERO does the bracket, but carries on. */
848
849 case OP_BRAZERO:
850 case OP_BRAMINZERO:
851 case OP_BRAPOSZERO:
852 rc = set_start_bits(++tcode, start_bits, utf8, cd);
853 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
854 /* =========================================================================
855 See the comment at the head of this function concerning the next line,
856 which was an old fudge for the benefit of OS/2.
857 dummy = 1;
858 ========================================================================= */
859 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
860 tcode += 1 + LINK_SIZE;
861 break;
862
863 /* SKIPZERO skips the bracket. */
864
865 case OP_SKIPZERO:
866 tcode++;
867 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
868 tcode += 1 + LINK_SIZE;
869 break;
870
871 /* Single-char * or ? sets the bit and tries the next item */
872
873 case OP_STAR:
874 case OP_MINSTAR:
875 case OP_POSSTAR:
876 case OP_QUERY:
877 case OP_MINQUERY:
878 case OP_POSQUERY:
879 tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
880 break;
881
882 case OP_STARI:
883 case OP_MINSTARI:
884 case OP_POSSTARI:
885 case OP_QUERYI:
886 case OP_MINQUERYI:
887 case OP_POSQUERYI:
888 tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
889 break;
890
891 /* Single-char upto sets the bit and tries the next */
892
893 case OP_UPTO:
894 case OP_MINUPTO:
895 case OP_POSUPTO:
896 tcode = set_table_bit(start_bits, tcode + 3, FALSE, cd, utf8);
897 break;
898
899 case OP_UPTOI:
900 case OP_MINUPTOI:
901 case OP_POSUPTOI:
902 tcode = set_table_bit(start_bits, tcode + 3, TRUE, cd, utf8);
903 break;
904
905 /* At least one single char sets the bit and stops */
906
907 case OP_EXACT:
908 tcode += 2;
909 /* Fall through */
910 case OP_CHAR:
911 case OP_PLUS:
912 case OP_MINPLUS:
913 case OP_POSPLUS:
914 (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
915 try_next = FALSE;
916 break;
917
918 case OP_EXACTI:
919 tcode += 2;
920 /* Fall through */
921 case OP_CHARI:
922 case OP_PLUSI:
923 case OP_MINPLUSI:
924 case OP_POSPLUSI:
925 (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
926 try_next = FALSE;
927 break;
928
929 /* Special spacing and line-terminating items. These recognize specific
930 lists of characters. The difference between VSPACE and ANYNL is that the
931 latter can match the two-character CRLF sequence, but that is not
932 relevant for finding the first character, so their code here is
933 identical. */
934
935 case OP_HSPACE:
936 SET_BIT(0x09);
937 SET_BIT(0x20);
938 if (utf8)
939 {
940 SET_BIT(0xC2); /* For U+00A0 */
941 SET_BIT(0xE1); /* For U+1680, U+180E */
942 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
943 SET_BIT(0xE3); /* For U+3000 */
944 }
945 else SET_BIT(0xA0);
946 try_next = FALSE;
947 break;
948
949 case OP_ANYNL:
950 case OP_VSPACE:
951 SET_BIT(0x0A);
952 SET_BIT(0x0B);
953 SET_BIT(0x0C);
954 SET_BIT(0x0D);
955 if (utf8)
956 {
957 SET_BIT(0xC2); /* For U+0085 */
958 SET_BIT(0xE2); /* For U+2028, U+2029 */
959 }
960 else SET_BIT(0x85);
961 try_next = FALSE;
962 break;
963
964 /* Single character types set the bits and stop. Note that if PCRE_UCP
965 is set, we do not see these op codes because \d etc are converted to
966 properties. Therefore, these apply in the case when only characters less
967 than 256 are recognized to match the types. */
968
969 case OP_NOT_DIGIT:
970 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
971 try_next = FALSE;
972 break;
973
974 case OP_DIGIT:
975 set_type_bits(start_bits, cbit_digit, table_limit, cd);
976 try_next = FALSE;
977 break;
978
979 /* The cbit_space table has vertical tab as whitespace; we have to
980 ensure it is set as not whitespace. */
981
982 case OP_NOT_WHITESPACE:
983 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
984 start_bits[1] |= 0x08;
985 try_next = FALSE;
986 break;
987
988 /* The cbit_space table has vertical tab as whitespace; we have to
989 not set it from the table. */
990
991 case OP_WHITESPACE:
992 c = start_bits[1]; /* Save in case it was already set */
993 set_type_bits(start_bits, cbit_space, table_limit, cd);
994 start_bits[1] = (start_bits[1] & ~0x08) | c;
995 try_next = FALSE;
996 break;
997
998 case OP_NOT_WORDCHAR:
999 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1000 try_next = FALSE;
1001 break;
1002
1003 case OP_WORDCHAR:
1004 set_type_bits(start_bits, cbit_word, table_limit, cd);
1005 try_next = FALSE;
1006 break;
1007
1008 /* One or more character type fudges the pointer and restarts, knowing
1009 it will hit a single character type and stop there. */
1010
1011 case OP_TYPEPLUS:
1012 case OP_TYPEMINPLUS:
1013 case OP_TYPEPOSPLUS:
1014 tcode++;
1015 break;
1016
1017 case OP_TYPEEXACT:
1018 tcode += 3;
1019 break;
1020
1021 /* Zero or more repeats of character types set the bits and then
1022 try again. */
1023
1024 case OP_TYPEUPTO:
1025 case OP_TYPEMINUPTO:
1026 case OP_TYPEPOSUPTO:
1027 tcode += 2; /* Fall through */
1028
1029 case OP_TYPESTAR:
1030 case OP_TYPEMINSTAR:
1031 case OP_TYPEPOSSTAR:
1032 case OP_TYPEQUERY:
1033 case OP_TYPEMINQUERY:
1034 case OP_TYPEPOSQUERY:
1035 switch(tcode[1])
1036 {
1037 default:
1038 case OP_ANY:
1039 case OP_ALLANY:
1040 return SSB_FAIL;
1041
1042 case OP_HSPACE:
1043 SET_BIT(0x09);
1044 SET_BIT(0x20);
1045 if (utf8)
1046 {
1047 SET_BIT(0xC2); /* For U+00A0 */
1048 SET_BIT(0xE1); /* For U+1680, U+180E */
1049 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1050 SET_BIT(0xE3); /* For U+3000 */
1051 }
1052 else SET_BIT(0xA0);
1053 break;
1054
1055 case OP_ANYNL:
1056 case OP_VSPACE:
1057 SET_BIT(0x0A);
1058 SET_BIT(0x0B);
1059 SET_BIT(0x0C);
1060 SET_BIT(0x0D);
1061 if (utf8)
1062 {
1063 SET_BIT(0xC2); /* For U+0085 */
1064 SET_BIT(0xE2); /* For U+2028, U+2029 */
1065 }
1066 else SET_BIT(0x85);
1067 break;
1068
1069 case OP_NOT_DIGIT:
1070 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1071 break;
1072
1073 case OP_DIGIT:
1074 set_type_bits(start_bits, cbit_digit, table_limit, cd);
1075 break;
1076
1077 /* The cbit_space table has vertical tab as whitespace; we have to
1078 ensure it gets set as not whitespace. */
1079
1080 case OP_NOT_WHITESPACE:
1081 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1082 start_bits[1] |= 0x08;
1083 break;
1084
1085 /* The cbit_space table has vertical tab as whitespace; we have to
1086 avoid setting it. */
1087
1088 case OP_WHITESPACE:
1089 c = start_bits[1]; /* Save in case it was already set */
1090 set_type_bits(start_bits, cbit_space, table_limit, cd);
1091 start_bits[1] = (start_bits[1] & ~0x08) | c;
1092 break;
1093
1094 case OP_NOT_WORDCHAR:
1095 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1096 break;
1097
1098 case OP_WORDCHAR:
1099 set_type_bits(start_bits, cbit_word, table_limit, cd);
1100 break;
1101 }
1102
1103 tcode += 2;
1104 break;
1105
1106 /* Character class where all the information is in a bit map: set the
1107 bits and either carry on or not, according to the repeat count. If it was
1108 a negative class, and we are operating with UTF-8 characters, any byte
1109 with a value >= 0xc4 is a potentially valid starter because it starts a
1110 character with a value > 255. */
1111
1112 case OP_NCLASS:
1113 #ifdef SUPPORT_UTF8
1114 if (utf8)
1115 {
1116 start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
1117 memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
1118 }
1119 #endif
1120 /* Fall through */
1121
1122 case OP_CLASS:
1123 {
1124 tcode++;
1125
1126 /* In UTF-8 mode, the bits in a bit map correspond to character
1127 values, not to byte values. However, the bit map we are constructing is
1128 for byte values. So we have to do a conversion for characters whose
1129 value is > 127. In fact, there are only two possible starting bytes for
1130 characters in the range 128 - 255. */
1131
1132 #ifdef SUPPORT_UTF8
1133 if (utf8)
1134 {
1135 for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
1136 for (c = 128; c < 256; c++)
1137 {
1138 if ((tcode[c/8] && (1 << (c&7))) != 0)
1139 {
1140 int d = (c >> 6) | 0xc0; /* Set bit for this starter */
1141 start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
1142 c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
1143 }
1144 }
1145 }
1146
1147 /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1148
1149 else
1150 #endif
1151 {
1152 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
1153 }
1154
1155 /* Advance past the bit map, and act on what follows. For a zero
1156 minimum repeat, continue; otherwise stop processing. */
1157
1158 tcode += 32;
1159 switch (*tcode)
1160 {
1161 case OP_CRSTAR:
1162 case OP_CRMINSTAR:
1163 case OP_CRQUERY:
1164 case OP_CRMINQUERY:
1165 tcode++;
1166 break;
1167
1168 case OP_CRRANGE:
1169 case OP_CRMINRANGE:
1170 if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
1171 else try_next = FALSE;
1172 break;
1173
1174 default:
1175 try_next = FALSE;
1176 break;
1177 }
1178 }
1179 break; /* End of bitmap class handling */
1180
1181 } /* End of switch */
1182 } /* End of try_next loop */
1183
1184 code += GET(code, 1); /* Advance to next branch */
1185 }
1186 while (*code == OP_ALT);
1187 return yield;
1188 }
1189
1190
1191
1192
1193
1194 /*************************************************
1195 * Study a compiled expression *
1196 *************************************************/
1197
1198 /* This function is handed a compiled expression that it must study to produce
1199 information that will speed up the matching. It returns a pcre_extra block
1200 which then gets handed back to pcre_exec().
1201
1202 Arguments:
1203 re points to the compiled expression
1204 options contains option bits
1205 errorptr points to where to place error messages;
1206 set NULL unless error
1207
1208 Returns: pointer to a pcre_extra block, with study_data filled in and the
1209 appropriate flags set;
1210 NULL on error or if no optimization possible
1211 */
1212
1213 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1214 pcre_study(const pcre *external_re, int options, const char **errorptr)
1215 {
1216 int min;
1217 BOOL bits_set = FALSE;
1218 BOOL had_accept = FALSE;
1219 uschar start_bits[32];
1220 pcre_extra *extra;
1221 pcre_study_data *study;
1222 const uschar *tables;
1223 uschar *code;
1224 compile_data compile_block;
1225 const real_pcre *re = (const real_pcre *)external_re;
1226
1227 *errorptr = NULL;
1228
1229 if (re == NULL || re->magic_number != MAGIC_NUMBER)
1230 {
1231 *errorptr = "argument is not a compiled regular expression";
1232 return NULL;
1233 }
1234
1235 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1236 {
1237 *errorptr = "unknown or incorrect option bit(s) set";
1238 return NULL;
1239 }
1240
1241 code = (uschar *)re + re->name_table_offset +
1242 (re->name_count * re->name_entry_size);
1243
1244 /* For an anchored pattern, or an unanchored pattern that has a first char, or
1245 a multiline pattern that matches only at "line starts", there is no point in
1246 seeking a list of starting bytes. */
1247
1248 if ((re->options & PCRE_ANCHORED) == 0 &&
1249 (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1250 {
1251 int rc;
1252
1253 /* Set the character tables in the block that is passed around */
1254
1255 tables = re->tables;
1256 if (tables == NULL)
1257 (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1258 (void *)(&tables));
1259
1260 compile_block.lcc = tables + lcc_offset;
1261 compile_block.fcc = tables + fcc_offset;
1262 compile_block.cbits = tables + cbits_offset;
1263 compile_block.ctypes = tables + ctypes_offset;
1264
1265 /* See if we can find a fixed set of initial characters for the pattern. */
1266
1267 memset(start_bits, 0, 32 * sizeof(uschar));
1268 rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1269 &compile_block);
1270 bits_set = rc == SSB_DONE;
1271 if (rc == SSB_UNKNOWN) *errorptr = "internal error: opcode not recognized";
1272 }
1273
1274 /* Find the minimum length of subject string. */
1275
1276 switch(min = find_minlength(code, code, re->options, &had_accept))
1277 {
1278 case -2: *errorptr = "internal error: missing capturing bracket"; break;
1279 case -3: *errorptr = "internal error: opcode not recognized"; break;
1280 default: break;
1281 }
1282
1283 /* Return NULL if there's been an error or if no optimization is possible. */
1284
1285 if (*errorptr != NULL || (!bits_set && min < 0)) return NULL;
1286
1287 /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
1288 the latter, which is pointed to by the former, which may also get additional
1289 data set later by the calling program. At the moment, the size of
1290 pcre_study_data is fixed. We nevertheless save it in a field for returning via
1291 the pcre_fullinfo() function so that if it becomes variable in the future, we
1292 don't have to change that code. */
1293
1294 extra = (pcre_extra *)(pcre_malloc)
1295 (sizeof(pcre_extra) + sizeof(pcre_study_data));
1296
1297 if (extra == NULL)
1298 {
1299 *errorptr = "failed to get memory";
1300 return NULL;
1301 }
1302
1303 study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
1304 extra->flags = PCRE_EXTRA_STUDY_DATA;
1305 extra->study_data = study;
1306
1307 study->size = sizeof(pcre_study_data);
1308 study->flags = 0;
1309
1310 if (bits_set)
1311 {
1312 study->flags |= PCRE_STUDY_MAPPED;
1313 memcpy(study->start_bits, start_bits, sizeof(start_bits));
1314 }
1315
1316 if (min >= 0)
1317 {
1318 study->flags |= PCRE_STUDY_MINLEN;
1319 study->minlength = min;
1320 }
1321
1322 return extra;
1323 }
1324
1325 /* End of pcre_study.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5