/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Contents of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 603 - (show annotations)
Fri May 27 10:14:09 2011 UTC (4 years, 3 months ago) by ph10
File MIME type: text/plain
File size: 33954 byte(s)
Error occurred while calculating annotation data.
Fixed some omissions in pcre_study() for the new caseless opcodes.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2010 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_study(), along with local
42 supporting functions. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #include "pcre_internal.h"
50
51 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
52
53 /* Returns from set_start_bits() */
54
55 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
56
57
58
59 /*************************************************
60 * Find the minimum subject length for a group *
61 *************************************************/
62
63 /* Scan a parenthesized group and compute the minimum length of subject that
64 is needed to match it. This is a lower bound; it does not mean there is a
65 string of that length that matches. In UTF8 mode, the result is in characters
66 rather than bytes.
67
68 Arguments:
69 code pointer to start of group (the bracket)
70 startcode pointer to start of the whole pattern
71 options the compiling options
72
73 Returns: the minimum length
74 -1 if \C was encountered
75 -2 internal error (missing capturing bracket)
76 -3 internal error (opcode not listed)
77 */
78
79 static int
80 find_minlength(const uschar *code, const uschar *startcode, int options)
81 {
82 int length = -1;
83 BOOL utf8 = (options & PCRE_UTF8) != 0;
84 BOOL had_recurse = FALSE;
85 register int branchlength = 0;
86 register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
87
88 if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2;
89
90 /* Scan along the opcodes for this branch. If we get to the end of the
91 branch, check the length against that of the other branches. */
92
93 for (;;)
94 {
95 int d, min;
96 uschar *cs, *ce;
97 register int op = *cc;
98
99 switch (op)
100 {
101 case OP_COND:
102 case OP_SCOND:
103
104 /* If there is only one branch in a condition, the implied branch has zero
105 length, so we don't add anything. This covers the DEFINE "condition"
106 automatically. */
107
108 cs = cc + GET(cc, 1);
109 if (*cs != OP_ALT)
110 {
111 cc = cs + 1 + LINK_SIZE;
112 break;
113 }
114
115 /* Otherwise we can fall through and treat it the same as any other
116 subpattern. */
117
118 case OP_CBRA:
119 case OP_SCBRA:
120 case OP_BRA:
121 case OP_SBRA:
122 case OP_ONCE:
123 d = find_minlength(cc, startcode, options);
124 if (d < 0) return d;
125 branchlength += d;
126 do cc += GET(cc, 1); while (*cc == OP_ALT);
127 cc += 1 + LINK_SIZE;
128 break;
129
130 /* Reached end of a branch; if it's a ket it is the end of a nested
131 call. If it's ALT it is an alternation in a nested call. If it is
132 END it's the end of the outer call. All can be handled by the same code. */
133
134 case OP_ALT:
135 case OP_KET:
136 case OP_KETRMAX:
137 case OP_KETRMIN:
138 case OP_END:
139 if (length < 0 || (!had_recurse && branchlength < length))
140 length = branchlength;
141 if (*cc != OP_ALT) return length;
142 cc += 1 + LINK_SIZE;
143 branchlength = 0;
144 had_recurse = FALSE;
145 break;
146
147 /* Skip over assertive subpatterns */
148
149 case OP_ASSERT:
150 case OP_ASSERT_NOT:
151 case OP_ASSERTBACK:
152 case OP_ASSERTBACK_NOT:
153 do cc += GET(cc, 1); while (*cc == OP_ALT);
154 /* Fall through */
155
156 /* Skip over things that don't match chars */
157
158 case OP_REVERSE:
159 case OP_CREF:
160 case OP_NCREF:
161 case OP_RREF:
162 case OP_NRREF:
163 case OP_DEF:
164 case OP_CALLOUT:
165 case OP_SOD:
166 case OP_SOM:
167 case OP_EOD:
168 case OP_EODN:
169 case OP_CIRC:
170 case OP_CIRCM:
171 case OP_DOLL:
172 case OP_DOLLM:
173 case OP_NOT_WORD_BOUNDARY:
174 case OP_WORD_BOUNDARY:
175 cc += _pcre_OP_lengths[*cc];
176 break;
177
178 /* Skip over a subpattern that has a {0} or {0,x} quantifier */
179
180 case OP_BRAZERO:
181 case OP_BRAMINZERO:
182 case OP_SKIPZERO:
183 cc += _pcre_OP_lengths[*cc];
184 do cc += GET(cc, 1); while (*cc == OP_ALT);
185 cc += 1 + LINK_SIZE;
186 break;
187
188 /* Handle literal characters and + repetitions */
189
190 case OP_CHAR:
191 case OP_CHARI:
192 case OP_NOT:
193 case OP_NOTI:
194 case OP_PLUS:
195 case OP_PLUSI:
196 case OP_MINPLUS:
197 case OP_MINPLUSI:
198 case OP_POSPLUS:
199 case OP_POSPLUSI:
200 case OP_NOTPLUS:
201 case OP_NOTPLUSI:
202 case OP_NOTMINPLUS:
203 case OP_NOTMINPLUSI:
204 case OP_NOTPOSPLUS:
205 case OP_NOTPOSPLUSI:
206 branchlength++;
207 cc += 2;
208 #ifdef SUPPORT_UTF8
209 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
210 #endif
211 break;
212
213 case OP_TYPEPLUS:
214 case OP_TYPEMINPLUS:
215 case OP_TYPEPOSPLUS:
216 branchlength++;
217 cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
218 break;
219
220 /* Handle exact repetitions. The count is already in characters, but we
221 need to skip over a multibyte character in UTF8 mode. */
222
223 case OP_EXACT:
224 case OP_EXACTI:
225 case OP_NOTEXACT:
226 case OP_NOTEXACTI:
227 branchlength += GET2(cc,1);
228 cc += 4;
229 #ifdef SUPPORT_UTF8
230 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
231 #endif
232 break;
233
234 case OP_TYPEEXACT:
235 branchlength += GET2(cc,1);
236 cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
237 break;
238
239 /* Handle single-char non-literal matchers */
240
241 case OP_PROP:
242 case OP_NOTPROP:
243 cc += 2;
244 /* Fall through */
245
246 case OP_NOT_DIGIT:
247 case OP_DIGIT:
248 case OP_NOT_WHITESPACE:
249 case OP_WHITESPACE:
250 case OP_NOT_WORDCHAR:
251 case OP_WORDCHAR:
252 case OP_ANY:
253 case OP_ALLANY:
254 case OP_EXTUNI:
255 case OP_HSPACE:
256 case OP_NOT_HSPACE:
257 case OP_VSPACE:
258 case OP_NOT_VSPACE:
259 branchlength++;
260 cc++;
261 break;
262
263 /* "Any newline" might match two characters */
264
265 case OP_ANYNL:
266 branchlength += 2;
267 cc++;
268 break;
269
270 /* The single-byte matcher means we can't proceed in UTF-8 mode */
271
272 case OP_ANYBYTE:
273 #ifdef SUPPORT_UTF8
274 if (utf8) return -1;
275 #endif
276 branchlength++;
277 cc++;
278 break;
279
280 /* For repeated character types, we have to test for \p and \P, which have
281 an extra two bytes of parameters. */
282
283 case OP_TYPESTAR:
284 case OP_TYPEMINSTAR:
285 case OP_TYPEQUERY:
286 case OP_TYPEMINQUERY:
287 case OP_TYPEPOSSTAR:
288 case OP_TYPEPOSQUERY:
289 if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
290 cc += _pcre_OP_lengths[op];
291 break;
292
293 case OP_TYPEUPTO:
294 case OP_TYPEMINUPTO:
295 case OP_TYPEPOSUPTO:
296 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
297 cc += _pcre_OP_lengths[op];
298 break;
299
300 /* Check a class for variable quantification */
301
302 #ifdef SUPPORT_UTF8
303 case OP_XCLASS:
304 cc += GET(cc, 1) - 33;
305 /* Fall through */
306 #endif
307
308 case OP_CLASS:
309 case OP_NCLASS:
310 cc += 33;
311
312 switch (*cc)
313 {
314 case OP_CRPLUS:
315 case OP_CRMINPLUS:
316 branchlength++;
317 /* Fall through */
318
319 case OP_CRSTAR:
320 case OP_CRMINSTAR:
321 case OP_CRQUERY:
322 case OP_CRMINQUERY:
323 cc++;
324 break;
325
326 case OP_CRRANGE:
327 case OP_CRMINRANGE:
328 branchlength += GET2(cc,1);
329 cc += 5;
330 break;
331
332 default:
333 branchlength++;
334 break;
335 }
336 break;
337
338 /* Backreferences and subroutine calls are treated in the same way: we find
339 the minimum length for the subpattern. A recursion, however, causes an
340 a flag to be set that causes the length of this branch to be ignored. The
341 logic is that a recursion can only make sense if there is another
342 alternation that stops the recursing. That will provide the minimum length
343 (when no recursion happens). A backreference within the group that it is
344 referencing behaves in the same way.
345
346 If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
347 matches an empty string (by default it causes a matching failure), so in
348 that case we must set the minimum length to zero. */
349
350 case OP_REF:
351 case OP_REFI:
352 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
353 {
354 ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
355 if (cs == NULL) return -2;
356 do ce += GET(ce, 1); while (*ce == OP_ALT);
357 if (cc > cs && cc < ce)
358 {
359 d = 0;
360 had_recurse = TRUE;
361 }
362 else d = find_minlength(cs, startcode, options);
363 }
364 else d = 0;
365 cc += 3;
366
367 /* Handle repeated back references */
368
369 switch (*cc)
370 {
371 case OP_CRSTAR:
372 case OP_CRMINSTAR:
373 case OP_CRQUERY:
374 case OP_CRMINQUERY:
375 min = 0;
376 cc++;
377 break;
378
379 case OP_CRRANGE:
380 case OP_CRMINRANGE:
381 min = GET2(cc, 1);
382 cc += 5;
383 break;
384
385 default:
386 min = 1;
387 break;
388 }
389
390 branchlength += min * d;
391 break;
392
393 case OP_RECURSE:
394 cs = ce = (uschar *)startcode + GET(cc, 1);
395 if (cs == NULL) return -2;
396 do ce += GET(ce, 1); while (*ce == OP_ALT);
397 if (cc > cs && cc < ce)
398 had_recurse = TRUE;
399 else
400 branchlength += find_minlength(cs, startcode, options);
401 cc += 1 + LINK_SIZE;
402 break;
403
404 /* Anything else does not or need not match a character. We can get the
405 item's length from the table, but for those that can match zero occurrences
406 of a character, we must take special action for UTF-8 characters. As it
407 happens, the "NOT" versions of these opcodes are used at present only for
408 ASCII characters, so they could be omitted from this list. However, in
409 future that may change, so we include them here so as not to leave a
410 gotcha for a future maintainer. */
411
412 case OP_UPTO:
413 case OP_UPTOI:
414 case OP_NOTUPTO:
415 case OP_NOTUPTOI:
416 case OP_MINUPTO:
417 case OP_MINUPTOI:
418 case OP_NOTMINUPTO:
419 case OP_NOTMINUPTOI:
420 case OP_POSUPTO:
421 case OP_POSUPTOI:
422 case OP_NOTPOSUPTO:
423 case OP_NOTPOSUPTOI:
424
425 case OP_STAR:
426 case OP_STARI:
427 case OP_NOTSTAR:
428 case OP_NOTSTARI:
429 case OP_MINSTAR:
430 case OP_MINSTARI:
431 case OP_NOTMINSTAR:
432 case OP_NOTMINSTARI:
433 case OP_POSSTAR:
434 case OP_POSSTARI:
435 case OP_NOTPOSSTAR:
436 case OP_NOTPOSSTARI:
437
438 case OP_QUERY:
439 case OP_QUERYI:
440 case OP_NOTQUERY:
441 case OP_NOTQUERYI:
442 case OP_MINQUERY:
443 case OP_MINQUERYI:
444 case OP_NOTMINQUERY:
445 case OP_NOTMINQUERYI:
446 case OP_POSQUERY:
447 case OP_POSQUERYI:
448 case OP_NOTPOSQUERY:
449 case OP_NOTPOSQUERYI:
450
451 cc += _pcre_OP_lengths[op];
452 #ifdef SUPPORT_UTF8
453 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
454 #endif
455 break;
456
457 /* Skip these, but we need to add in the name length. */
458
459 case OP_MARK:
460 case OP_PRUNE_ARG:
461 case OP_SKIP_ARG:
462 cc += _pcre_OP_lengths[op] + cc[1];
463 break;
464
465 case OP_THEN_ARG:
466 cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
467 break;
468
469 /* The remaining opcodes are just skipped over. */
470
471 case OP_ACCEPT:
472 case OP_CLOSE:
473 case OP_COMMIT:
474 case OP_FAIL:
475 case OP_PRUNE:
476 case OP_SET_SOM:
477 case OP_SKIP:
478 case OP_THEN:
479 cc += _pcre_OP_lengths[op];
480 break;
481
482 /* This should not occur: we list all opcodes explicitly so that when
483 new ones get added they are properly considered. */
484
485 default:
486 return -3;
487 }
488 }
489 /* Control never gets here */
490 }
491
492
493
494 /*************************************************
495 * Set a bit and maybe its alternate case *
496 *************************************************/
497
498 /* Given a character, set its first byte's bit in the table, and also the
499 corresponding bit for the other version of a letter if we are caseless. In
500 UTF-8 mode, for characters greater than 127, we can only do the caseless thing
501 when Unicode property support is available.
502
503 Arguments:
504 start_bits points to the bit map
505 p points to the character
506 caseless the caseless flag
507 cd the block with char table pointers
508 utf8 TRUE for UTF-8 mode
509
510 Returns: pointer after the character
511 */
512
513 static const uschar *
514 set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
515 compile_data *cd, BOOL utf8)
516 {
517 unsigned int c = *p;
518
519 SET_BIT(c);
520
521 #ifdef SUPPORT_UTF8
522 if (utf8 && c > 127)
523 {
524 GETCHARINC(c, p);
525 #ifdef SUPPORT_UCP
526 if (caseless)
527 {
528 uschar buff[8];
529 c = UCD_OTHERCASE(c);
530 (void)_pcre_ord2utf8(c, buff);
531 SET_BIT(buff[0]);
532 }
533 #endif
534 return p;
535 }
536 #endif
537
538 /* Not UTF-8 mode, or character is less than 127. */
539
540 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
541 return p + 1;
542 }
543
544
545
546 /*************************************************
547 * Set bits for a positive character type *
548 *************************************************/
549
550 /* This function sets starting bits for a character type. In UTF-8 mode, we can
551 only do a direct setting for bytes less than 128, as otherwise there can be
552 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
553 environment, the tables will only recognize ASCII characters anyway, but in at
554 least one Windows environment, some higher bytes bits were set in the tables.
555 So we deal with that case by considering the UTF-8 encoding.
556
557 Arguments:
558 start_bits the starting bitmap
559 cbit type the type of character wanted
560 table_limit 32 for non-UTF-8; 16 for UTF-8
561 cd the block with char table pointers
562
563 Returns: nothing
564 */
565
566 static void
567 set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
568 compile_data *cd)
569 {
570 register int c;
571 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
572 if (table_limit == 32) return;
573 for (c = 128; c < 256; c++)
574 {
575 if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
576 {
577 uschar buff[8];
578 (void)_pcre_ord2utf8(c, buff);
579 SET_BIT(buff[0]);
580 }
581 }
582 }
583
584
585 /*************************************************
586 * Set bits for a negative character type *
587 *************************************************/
588
589 /* This function sets starting bits for a negative character type such as \D.
590 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
591 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
592 Unlike in the positive case, where we can set appropriate starting bits for
593 specific high-valued UTF-8 characters, in this case we have to set the bits for
594 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
595 0xc0 (192) for simplicity.
596
597 Arguments:
598 start_bits the starting bitmap
599 cbit type the type of character wanted
600 table_limit 32 for non-UTF-8; 16 for UTF-8
601 cd the block with char table pointers
602
603 Returns: nothing
604 */
605
606 static void
607 set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
608 compile_data *cd)
609 {
610 register int c;
611 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
612 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
613 }
614
615
616
617 /*************************************************
618 * Create bitmap of starting bytes *
619 *************************************************/
620
621 /* This function scans a compiled unanchored expression recursively and
622 attempts to build a bitmap of the set of possible starting bytes. As time goes
623 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
624 useful for parenthesized groups in patterns such as (a*)b where the group
625 provides some optional starting bytes but scanning must continue at the outer
626 level to find at least one mandatory byte. At the outermost level, this
627 function fails unless the result is SSB_DONE.
628
629 Arguments:
630 code points to an expression
631 start_bits points to a 32-byte table, initialized to 0
632 utf8 TRUE if in UTF-8 mode
633 cd the block with char table pointers
634
635 Returns: SSB_FAIL => Failed to find any starting bytes
636 SSB_DONE => Found mandatory starting bytes
637 SSB_CONTINUE => Found optional starting bytes
638 */
639
640 static int
641 set_start_bits(const uschar *code, uschar *start_bits, BOOL utf8,
642 compile_data *cd)
643 {
644 register int c;
645 int yield = SSB_DONE;
646 int table_limit = utf8? 16:32;
647
648 #if 0
649 /* ========================================================================= */
650 /* The following comment and code was inserted in January 1999. In May 2006,
651 when it was observed to cause compiler warnings about unused values, I took it
652 out again. If anybody is still using OS/2, they will have to put it back
653 manually. */
654
655 /* This next statement and the later reference to dummy are here in order to
656 trick the optimizer of the IBM C compiler for OS/2 into generating correct
657 code. Apparently IBM isn't going to fix the problem, and we would rather not
658 disable optimization (in this module it actually makes a big difference, and
659 the pcre module can use all the optimization it can get). */
660
661 volatile int dummy;
662 /* ========================================================================= */
663 #endif
664
665 do
666 {
667 const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
668 BOOL try_next = TRUE;
669
670 while (try_next) /* Loop for items in this branch */
671 {
672 int rc;
673 switch(*tcode)
674 {
675 /* Fail if we reach something we don't understand */
676
677 default:
678 return SSB_FAIL;
679
680 /* If we hit a bracket or a positive lookahead assertion, recurse to set
681 bits from within the subpattern. If it can't find anything, we have to
682 give up. If it finds some mandatory character(s), we are done for this
683 branch. Otherwise, carry on scanning after the subpattern. */
684
685 case OP_BRA:
686 case OP_SBRA:
687 case OP_CBRA:
688 case OP_SCBRA:
689 case OP_ONCE:
690 case OP_ASSERT:
691 rc = set_start_bits(tcode, start_bits, utf8, cd);
692 if (rc == SSB_FAIL) return SSB_FAIL;
693 if (rc == SSB_DONE) try_next = FALSE; else
694 {
695 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
696 tcode += 1 + LINK_SIZE;
697 }
698 break;
699
700 /* If we hit ALT or KET, it means we haven't found anything mandatory in
701 this branch, though we might have found something optional. For ALT, we
702 continue with the next alternative, but we have to arrange that the final
703 result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
704 return SSB_CONTINUE: if this is the top level, that indicates failure,
705 but after a nested subpattern, it causes scanning to continue. */
706
707 case OP_ALT:
708 yield = SSB_CONTINUE;
709 try_next = FALSE;
710 break;
711
712 case OP_KET:
713 case OP_KETRMAX:
714 case OP_KETRMIN:
715 return SSB_CONTINUE;
716
717 /* Skip over callout */
718
719 case OP_CALLOUT:
720 tcode += 2 + 2*LINK_SIZE;
721 break;
722
723 /* Skip over lookbehind and negative lookahead assertions */
724
725 case OP_ASSERT_NOT:
726 case OP_ASSERTBACK:
727 case OP_ASSERTBACK_NOT:
728 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
729 tcode += 1 + LINK_SIZE;
730 break;
731
732 /* BRAZERO does the bracket, but carries on. */
733
734 case OP_BRAZERO:
735 case OP_BRAMINZERO:
736 if (set_start_bits(++tcode, start_bits, utf8, cd) == SSB_FAIL)
737 return SSB_FAIL;
738 /* =========================================================================
739 See the comment at the head of this function concerning the next line,
740 which was an old fudge for the benefit of OS/2.
741 dummy = 1;
742 ========================================================================= */
743 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
744 tcode += 1 + LINK_SIZE;
745 break;
746
747 /* SKIPZERO skips the bracket. */
748
749 case OP_SKIPZERO:
750 tcode++;
751 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
752 tcode += 1 + LINK_SIZE;
753 break;
754
755 /* Single-char * or ? sets the bit and tries the next item */
756
757 case OP_STAR:
758 case OP_MINSTAR:
759 case OP_POSSTAR:
760 case OP_QUERY:
761 case OP_MINQUERY:
762 case OP_POSQUERY:
763 tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
764 break;
765
766 case OP_STARI:
767 case OP_MINSTARI:
768 case OP_POSSTARI:
769 case OP_QUERYI:
770 case OP_MINQUERYI:
771 case OP_POSQUERYI:
772 tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
773 break;
774
775 /* Single-char upto sets the bit and tries the next */
776
777 case OP_UPTO:
778 case OP_MINUPTO:
779 case OP_POSUPTO:
780 tcode = set_table_bit(start_bits, tcode + 3, FALSE, cd, utf8);
781 break;
782
783 case OP_UPTOI:
784 case OP_MINUPTOI:
785 case OP_POSUPTOI:
786 tcode = set_table_bit(start_bits, tcode + 3, TRUE, cd, utf8);
787 break;
788
789 /* At least one single char sets the bit and stops */
790
791 case OP_EXACT:
792 tcode += 2;
793 /* Fall through */
794 case OP_CHAR:
795 case OP_PLUS:
796 case OP_MINPLUS:
797 case OP_POSPLUS:
798 (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
799 try_next = FALSE;
800 break;
801
802 case OP_EXACTI:
803 tcode += 2;
804 /* Fall through */
805 case OP_CHARI:
806 case OP_PLUSI:
807 case OP_MINPLUSI:
808 case OP_POSPLUSI:
809 (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
810 try_next = FALSE;
811 break;
812
813 /* Special spacing and line-terminating items. These recognize specific
814 lists of characters. The difference between VSPACE and ANYNL is that the
815 latter can match the two-character CRLF sequence, but that is not
816 relevant for finding the first character, so their code here is
817 identical. */
818
819 case OP_HSPACE:
820 SET_BIT(0x09);
821 SET_BIT(0x20);
822 if (utf8)
823 {
824 SET_BIT(0xC2); /* For U+00A0 */
825 SET_BIT(0xE1); /* For U+1680, U+180E */
826 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
827 SET_BIT(0xE3); /* For U+3000 */
828 }
829 else SET_BIT(0xA0);
830 try_next = FALSE;
831 break;
832
833 case OP_ANYNL:
834 case OP_VSPACE:
835 SET_BIT(0x0A);
836 SET_BIT(0x0B);
837 SET_BIT(0x0C);
838 SET_BIT(0x0D);
839 if (utf8)
840 {
841 SET_BIT(0xC2); /* For U+0085 */
842 SET_BIT(0xE2); /* For U+2028, U+2029 */
843 }
844 else SET_BIT(0x85);
845 try_next = FALSE;
846 break;
847
848 /* Single character types set the bits and stop. Note that if PCRE_UCP
849 is set, we do not see these op codes because \d etc are converted to
850 properties. Therefore, these apply in the case when only characters less
851 than 256 are recognized to match the types. */
852
853 case OP_NOT_DIGIT:
854 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
855 try_next = FALSE;
856 break;
857
858 case OP_DIGIT:
859 set_type_bits(start_bits, cbit_digit, table_limit, cd);
860 try_next = FALSE;
861 break;
862
863 /* The cbit_space table has vertical tab as whitespace; we have to
864 ensure it is set as not whitespace. */
865
866 case OP_NOT_WHITESPACE:
867 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
868 start_bits[1] |= 0x08;
869 try_next = FALSE;
870 break;
871
872 /* The cbit_space table has vertical tab as whitespace; we have to
873 not set it from the table. */
874
875 case OP_WHITESPACE:
876 c = start_bits[1]; /* Save in case it was already set */
877 set_type_bits(start_bits, cbit_space, table_limit, cd);
878 start_bits[1] = (start_bits[1] & ~0x08) | c;
879 try_next = FALSE;
880 break;
881
882 case OP_NOT_WORDCHAR:
883 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
884 try_next = FALSE;
885 break;
886
887 case OP_WORDCHAR:
888 set_type_bits(start_bits, cbit_word, table_limit, cd);
889 try_next = FALSE;
890 break;
891
892 /* One or more character type fudges the pointer and restarts, knowing
893 it will hit a single character type and stop there. */
894
895 case OP_TYPEPLUS:
896 case OP_TYPEMINPLUS:
897 case OP_TYPEPOSPLUS:
898 tcode++;
899 break;
900
901 case OP_TYPEEXACT:
902 tcode += 3;
903 break;
904
905 /* Zero or more repeats of character types set the bits and then
906 try again. */
907
908 case OP_TYPEUPTO:
909 case OP_TYPEMINUPTO:
910 case OP_TYPEPOSUPTO:
911 tcode += 2; /* Fall through */
912
913 case OP_TYPESTAR:
914 case OP_TYPEMINSTAR:
915 case OP_TYPEPOSSTAR:
916 case OP_TYPEQUERY:
917 case OP_TYPEMINQUERY:
918 case OP_TYPEPOSQUERY:
919 switch(tcode[1])
920 {
921 default:
922 case OP_ANY:
923 case OP_ALLANY:
924 return SSB_FAIL;
925
926 case OP_HSPACE:
927 SET_BIT(0x09);
928 SET_BIT(0x20);
929 if (utf8)
930 {
931 SET_BIT(0xC2); /* For U+00A0 */
932 SET_BIT(0xE1); /* For U+1680, U+180E */
933 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
934 SET_BIT(0xE3); /* For U+3000 */
935 }
936 else SET_BIT(0xA0);
937 break;
938
939 case OP_ANYNL:
940 case OP_VSPACE:
941 SET_BIT(0x0A);
942 SET_BIT(0x0B);
943 SET_BIT(0x0C);
944 SET_BIT(0x0D);
945 if (utf8)
946 {
947 SET_BIT(0xC2); /* For U+0085 */
948 SET_BIT(0xE2); /* For U+2028, U+2029 */
949 }
950 else SET_BIT(0x85);
951 break;
952
953 case OP_NOT_DIGIT:
954 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
955 break;
956
957 case OP_DIGIT:
958 set_type_bits(start_bits, cbit_digit, table_limit, cd);
959 break;
960
961 /* The cbit_space table has vertical tab as whitespace; we have to
962 ensure it gets set as not whitespace. */
963
964 case OP_NOT_WHITESPACE:
965 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
966 start_bits[1] |= 0x08;
967 break;
968
969 /* The cbit_space table has vertical tab as whitespace; we have to
970 avoid setting it. */
971
972 case OP_WHITESPACE:
973 c = start_bits[1]; /* Save in case it was already set */
974 set_type_bits(start_bits, cbit_space, table_limit, cd);
975 start_bits[1] = (start_bits[1] & ~0x08) | c;
976 break;
977
978 case OP_NOT_WORDCHAR:
979 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
980 break;
981
982 case OP_WORDCHAR:
983 set_type_bits(start_bits, cbit_word, table_limit, cd);
984 break;
985 }
986
987 tcode += 2;
988 break;
989
990 /* Character class where all the information is in a bit map: set the
991 bits and either carry on or not, according to the repeat count. If it was
992 a negative class, and we are operating with UTF-8 characters, any byte
993 with a value >= 0xc4 is a potentially valid starter because it starts a
994 character with a value > 255. */
995
996 case OP_NCLASS:
997 #ifdef SUPPORT_UTF8
998 if (utf8)
999 {
1000 start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
1001 memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
1002 }
1003 #endif
1004 /* Fall through */
1005
1006 case OP_CLASS:
1007 {
1008 tcode++;
1009
1010 /* In UTF-8 mode, the bits in a bit map correspond to character
1011 values, not to byte values. However, the bit map we are constructing is
1012 for byte values. So we have to do a conversion for characters whose
1013 value is > 127. In fact, there are only two possible starting bytes for
1014 characters in the range 128 - 255. */
1015
1016 #ifdef SUPPORT_UTF8
1017 if (utf8)
1018 {
1019 for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
1020 for (c = 128; c < 256; c++)
1021 {
1022 if ((tcode[c/8] && (1 << (c&7))) != 0)
1023 {
1024 int d = (c >> 6) | 0xc0; /* Set bit for this starter */
1025 start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
1026 c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
1027 }
1028 }
1029 }
1030
1031 /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1032
1033 else
1034 #endif
1035 {
1036 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
1037 }
1038
1039 /* Advance past the bit map, and act on what follows */
1040
1041 tcode += 32;
1042 switch (*tcode)
1043 {
1044 case OP_CRSTAR:
1045 case OP_CRMINSTAR:
1046 case OP_CRQUERY:
1047 case OP_CRMINQUERY:
1048 tcode++;
1049 break;
1050
1051 case OP_CRRANGE:
1052 case OP_CRMINRANGE:
1053 if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
1054 else try_next = FALSE;
1055 break;
1056
1057 default:
1058 try_next = FALSE;
1059 break;
1060 }
1061 }
1062 break; /* End of bitmap class handling */
1063
1064 } /* End of switch */
1065 } /* End of try_next loop */
1066
1067 code += GET(code, 1); /* Advance to next branch */
1068 }
1069 while (*code == OP_ALT);
1070 return yield;
1071 }
1072
1073
1074
1075 /*************************************************
1076 * Study a compiled expression *
1077 *************************************************/
1078
1079 /* This function is handed a compiled expression that it must study to produce
1080 information that will speed up the matching. It returns a pcre_extra block
1081 which then gets handed back to pcre_exec().
1082
1083 Arguments:
1084 re points to the compiled expression
1085 options contains option bits
1086 errorptr points to where to place error messages;
1087 set NULL unless error
1088
1089 Returns: pointer to a pcre_extra block, with study_data filled in and the
1090 appropriate flags set;
1091 NULL on error or if no optimization possible
1092 */
1093
1094 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1095 pcre_study(const pcre *external_re, int options, const char **errorptr)
1096 {
1097 int min;
1098 BOOL bits_set = FALSE;
1099 uschar start_bits[32];
1100 pcre_extra *extra;
1101 pcre_study_data *study;
1102 const uschar *tables;
1103 uschar *code;
1104 compile_data compile_block;
1105 const real_pcre *re = (const real_pcre *)external_re;
1106
1107 *errorptr = NULL;
1108
1109 if (re == NULL || re->magic_number != MAGIC_NUMBER)
1110 {
1111 *errorptr = "argument is not a compiled regular expression";
1112 return NULL;
1113 }
1114
1115 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1116 {
1117 *errorptr = "unknown or incorrect option bit(s) set";
1118 return NULL;
1119 }
1120
1121 code = (uschar *)re + re->name_table_offset +
1122 (re->name_count * re->name_entry_size);
1123
1124 /* For an anchored pattern, or an unanchored pattern that has a first char, or
1125 a multiline pattern that matches only at "line starts", there is no point in
1126 seeking a list of starting bytes. */
1127
1128 if ((re->options & PCRE_ANCHORED) == 0 &&
1129 (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1130 {
1131 /* Set the character tables in the block that is passed around */
1132
1133 tables = re->tables;
1134 if (tables == NULL)
1135 (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1136 (void *)(&tables));
1137
1138 compile_block.lcc = tables + lcc_offset;
1139 compile_block.fcc = tables + fcc_offset;
1140 compile_block.cbits = tables + cbits_offset;
1141 compile_block.ctypes = tables + ctypes_offset;
1142
1143 /* See if we can find a fixed set of initial characters for the pattern. */
1144
1145 memset(start_bits, 0, 32 * sizeof(uschar));
1146 bits_set = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1147 &compile_block) == SSB_DONE;
1148 }
1149
1150 /* Find the minimum length of subject string. */
1151
1152 switch(min = find_minlength(code, code, re->options))
1153 {
1154 case -2: *errorptr = "internal error: missing capturing bracket"; break;
1155 case -3: *errorptr = "internal error: opcode not recognized"; break;
1156 default: break;
1157 }
1158
1159 /* Return NULL if no optimization is possible. */
1160
1161 if (!bits_set && min < 0) return NULL;
1162
1163 /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
1164 the latter, which is pointed to by the former, which may also get additional
1165 data set later by the calling program. At the moment, the size of
1166 pcre_study_data is fixed. We nevertheless save it in a field for returning via
1167 the pcre_fullinfo() function so that if it becomes variable in the future, we
1168 don't have to change that code. */
1169
1170 extra = (pcre_extra *)(pcre_malloc)
1171 (sizeof(pcre_extra) + sizeof(pcre_study_data));
1172
1173 if (extra == NULL)
1174 {
1175 *errorptr = "failed to get memory";
1176 return NULL;
1177 }
1178
1179 study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
1180 extra->flags = PCRE_EXTRA_STUDY_DATA;
1181 extra->study_data = study;
1182
1183 study->size = sizeof(pcre_study_data);
1184 study->flags = 0;
1185
1186 if (bits_set)
1187 {
1188 study->flags |= PCRE_STUDY_MAPPED;
1189 memcpy(study->start_bits, start_bits, sizeof(start_bits));
1190 }
1191
1192 if (min >= 0)
1193 {
1194 study->flags |= PCRE_STUDY_MINLEN;
1195 study->minlength = min;
1196 }
1197
1198 return extra;
1199 }
1200
1201 /* End of pcre_study.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5