/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Contents of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1146 - (show annotations)
Sat Oct 20 16:45:33 2012 UTC (7 years ago) by zherczeg
File MIME type: text/plain
File size: 44921 byte(s)
Fix a possible overflow in 64 bit.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2012 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_study(), along with local
42 supporting functions. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #include "pcre_internal.h"
50
51 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
52
53 /* Returns from set_start_bits() */
54
55 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
56
57
58
59 /*************************************************
60 * Find the minimum subject length for a group *
61 *************************************************/
62
63 /* Scan a parenthesized group and compute the minimum length of subject that
64 is needed to match it. This is a lower bound; it does not mean there is a
65 string of that length that matches. In UTF8 mode, the result is in characters
66 rather than bytes.
67
68 Arguments:
69 code pointer to start of group (the bracket)
70 startcode pointer to start of the whole pattern
71 options the compiling options
72 int RECURSE depth
73
74 Returns: the minimum length
75 -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
76 -2 internal error (missing capturing bracket)
77 -3 internal error (opcode not listed)
78 */
79
80 static int
81 find_minlength(const pcre_uchar *code, const pcre_uchar *startcode, int options,
82 int recurse_depth)
83 {
84 int length = -1;
85 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
86 BOOL utf = (options & PCRE_UTF8) != 0;
87 BOOL had_recurse = FALSE;
88 register int branchlength = 0;
89 register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
90
91 if (*code == OP_CBRA || *code == OP_SCBRA ||
92 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
93
94 /* Scan along the opcodes for this branch. If we get to the end of the
95 branch, check the length against that of the other branches. */
96
97 for (;;)
98 {
99 int d, min;
100 pcre_uchar *cs, *ce;
101 register pcre_uchar op = *cc;
102
103 switch (op)
104 {
105 case OP_COND:
106 case OP_SCOND:
107
108 /* If there is only one branch in a condition, the implied branch has zero
109 length, so we don't add anything. This covers the DEFINE "condition"
110 automatically. */
111
112 cs = cc + GET(cc, 1);
113 if (*cs != OP_ALT)
114 {
115 cc = cs + 1 + LINK_SIZE;
116 break;
117 }
118
119 /* Otherwise we can fall through and treat it the same as any other
120 subpattern. */
121
122 case OP_CBRA:
123 case OP_SCBRA:
124 case OP_BRA:
125 case OP_SBRA:
126 case OP_CBRAPOS:
127 case OP_SCBRAPOS:
128 case OP_BRAPOS:
129 case OP_SBRAPOS:
130 case OP_ONCE:
131 case OP_ONCE_NC:
132 d = find_minlength(cc, startcode, options, recurse_depth);
133 if (d < 0) return d;
134 branchlength += d;
135 do cc += GET(cc, 1); while (*cc == OP_ALT);
136 cc += 1 + LINK_SIZE;
137 break;
138
139 /* ACCEPT makes things far too complicated; we have to give up. */
140
141 case OP_ACCEPT:
142 case OP_ASSERT_ACCEPT:
143 return -1;
144
145 /* Reached end of a branch; if it's a ket it is the end of a nested
146 call. If it's ALT it is an alternation in a nested call. If it is END it's
147 the end of the outer call. All can be handled by the same code. If an
148 ACCEPT was previously encountered, use the length that was in force at that
149 time, and pass back the shortest ACCEPT length. */
150
151 case OP_ALT:
152 case OP_KET:
153 case OP_KETRMAX:
154 case OP_KETRMIN:
155 case OP_KETRPOS:
156 case OP_END:
157 if (length < 0 || (!had_recurse && branchlength < length))
158 length = branchlength;
159 if (op != OP_ALT) return length;
160 cc += 1 + LINK_SIZE;
161 branchlength = 0;
162 had_recurse = FALSE;
163 break;
164
165 /* Skip over assertive subpatterns */
166
167 case OP_ASSERT:
168 case OP_ASSERT_NOT:
169 case OP_ASSERTBACK:
170 case OP_ASSERTBACK_NOT:
171 do cc += GET(cc, 1); while (*cc == OP_ALT);
172 /* Fall through */
173
174 /* Skip over things that don't match chars */
175
176 case OP_REVERSE:
177 case OP_CREF:
178 case OP_NCREF:
179 case OP_RREF:
180 case OP_NRREF:
181 case OP_DEF:
182 case OP_CALLOUT:
183 case OP_SOD:
184 case OP_SOM:
185 case OP_EOD:
186 case OP_EODN:
187 case OP_CIRC:
188 case OP_CIRCM:
189 case OP_DOLL:
190 case OP_DOLLM:
191 case OP_NOT_WORD_BOUNDARY:
192 case OP_WORD_BOUNDARY:
193 cc += PRIV(OP_lengths)[*cc];
194 break;
195
196 /* Skip over a subpattern that has a {0} or {0,x} quantifier */
197
198 case OP_BRAZERO:
199 case OP_BRAMINZERO:
200 case OP_BRAPOSZERO:
201 case OP_SKIPZERO:
202 cc += PRIV(OP_lengths)[*cc];
203 do cc += GET(cc, 1); while (*cc == OP_ALT);
204 cc += 1 + LINK_SIZE;
205 break;
206
207 /* Handle literal characters and + repetitions */
208
209 case OP_CHAR:
210 case OP_CHARI:
211 case OP_NOT:
212 case OP_NOTI:
213 case OP_PLUS:
214 case OP_PLUSI:
215 case OP_MINPLUS:
216 case OP_MINPLUSI:
217 case OP_POSPLUS:
218 case OP_POSPLUSI:
219 case OP_NOTPLUS:
220 case OP_NOTPLUSI:
221 case OP_NOTMINPLUS:
222 case OP_NOTMINPLUSI:
223 case OP_NOTPOSPLUS:
224 case OP_NOTPOSPLUSI:
225 branchlength++;
226 cc += 2;
227 #ifdef SUPPORT_UTF
228 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
229 #endif
230 break;
231
232 case OP_TYPEPLUS:
233 case OP_TYPEMINPLUS:
234 case OP_TYPEPOSPLUS:
235 branchlength++;
236 cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
237 break;
238
239 /* Handle exact repetitions. The count is already in characters, but we
240 need to skip over a multibyte character in UTF8 mode. */
241
242 case OP_EXACT:
243 case OP_EXACTI:
244 case OP_NOTEXACT:
245 case OP_NOTEXACTI:
246 branchlength += GET2(cc,1);
247 cc += 2 + IMM2_SIZE;
248 #ifdef SUPPORT_UTF
249 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
250 #endif
251 break;
252
253 case OP_TYPEEXACT:
254 branchlength += GET2(cc,1);
255 cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
256 || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
257 break;
258
259 /* Handle single-char non-literal matchers */
260
261 case OP_PROP:
262 case OP_NOTPROP:
263 cc += 2;
264 /* Fall through */
265
266 case OP_NOT_DIGIT:
267 case OP_DIGIT:
268 case OP_NOT_WHITESPACE:
269 case OP_WHITESPACE:
270 case OP_NOT_WORDCHAR:
271 case OP_WORDCHAR:
272 case OP_ANY:
273 case OP_ALLANY:
274 case OP_EXTUNI:
275 case OP_HSPACE:
276 case OP_NOT_HSPACE:
277 case OP_VSPACE:
278 case OP_NOT_VSPACE:
279 branchlength++;
280 cc++;
281 break;
282
283 /* "Any newline" might match two characters, but it also might match just
284 one. */
285
286 case OP_ANYNL:
287 branchlength += 1;
288 cc++;
289 break;
290
291 /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
292 non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
293 appear, but leave the code, just in case.) */
294
295 case OP_ANYBYTE:
296 #ifdef SUPPORT_UTF
297 if (utf) return -1;
298 #endif
299 branchlength++;
300 cc++;
301 break;
302
303 /* For repeated character types, we have to test for \p and \P, which have
304 an extra two bytes of parameters. */
305
306 case OP_TYPESTAR:
307 case OP_TYPEMINSTAR:
308 case OP_TYPEQUERY:
309 case OP_TYPEMINQUERY:
310 case OP_TYPEPOSSTAR:
311 case OP_TYPEPOSQUERY:
312 if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
313 cc += PRIV(OP_lengths)[op];
314 break;
315
316 case OP_TYPEUPTO:
317 case OP_TYPEMINUPTO:
318 case OP_TYPEPOSUPTO:
319 if (cc[1 + IMM2_SIZE] == OP_PROP
320 || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
321 cc += PRIV(OP_lengths)[op];
322 break;
323
324 /* Check a class for variable quantification */
325
326 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
327 case OP_XCLASS:
328 cc += GET(cc, 1);
329 cc -= PRIV(OP_lengths)[OP_CLASS];
330 /* Fall through */
331 #endif
332
333 case OP_CLASS:
334 case OP_NCLASS:
335 cc += PRIV(OP_lengths)[OP_CLASS];
336
337 switch (*cc)
338 {
339 case OP_CRPLUS:
340 case OP_CRMINPLUS:
341 branchlength++;
342 /* Fall through */
343
344 case OP_CRSTAR:
345 case OP_CRMINSTAR:
346 case OP_CRQUERY:
347 case OP_CRMINQUERY:
348 cc++;
349 break;
350
351 case OP_CRRANGE:
352 case OP_CRMINRANGE:
353 branchlength += GET2(cc,1);
354 cc += 1 + 2 * IMM2_SIZE;
355 break;
356
357 default:
358 branchlength++;
359 break;
360 }
361 break;
362
363 /* Backreferences and subroutine calls are treated in the same way: we find
364 the minimum length for the subpattern. A recursion, however, causes an
365 a flag to be set that causes the length of this branch to be ignored. The
366 logic is that a recursion can only make sense if there is another
367 alternation that stops the recursing. That will provide the minimum length
368 (when no recursion happens). A backreference within the group that it is
369 referencing behaves in the same way.
370
371 If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
372 matches an empty string (by default it causes a matching failure), so in
373 that case we must set the minimum length to zero. */
374
375 case OP_REF:
376 case OP_REFI:
377 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
378 {
379 ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
380 if (cs == NULL) return -2;
381 do ce += GET(ce, 1); while (*ce == OP_ALT);
382 if (cc > cs && cc < ce)
383 {
384 d = 0;
385 had_recurse = TRUE;
386 }
387 else
388 {
389 d = find_minlength(cs, startcode, options, recurse_depth);
390 }
391 }
392 else d = 0;
393 cc += 1 + IMM2_SIZE;
394
395 /* Handle repeated back references */
396
397 switch (*cc)
398 {
399 case OP_CRSTAR:
400 case OP_CRMINSTAR:
401 case OP_CRQUERY:
402 case OP_CRMINQUERY:
403 min = 0;
404 cc++;
405 break;
406
407 case OP_CRPLUS:
408 case OP_CRMINPLUS:
409 min = 1;
410 cc++;
411 break;
412
413 case OP_CRRANGE:
414 case OP_CRMINRANGE:
415 min = GET2(cc, 1);
416 cc += 1 + 2 * IMM2_SIZE;
417 break;
418
419 default:
420 min = 1;
421 break;
422 }
423
424 branchlength += min * d;
425 break;
426
427 /* We can easily detect direct recursion, but not mutual recursion. This is
428 caught by a recursion depth count. */
429
430 case OP_RECURSE:
431 cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
432 do ce += GET(ce, 1); while (*ce == OP_ALT);
433 if ((cc > cs && cc < ce) || recurse_depth > 10)
434 had_recurse = TRUE;
435 else
436 {
437 branchlength += find_minlength(cs, startcode, options, recurse_depth + 1);
438 }
439 cc += 1 + LINK_SIZE;
440 break;
441
442 /* Anything else does not or need not match a character. We can get the
443 item's length from the table, but for those that can match zero occurrences
444 of a character, we must take special action for UTF-8 characters. As it
445 happens, the "NOT" versions of these opcodes are used at present only for
446 ASCII characters, so they could be omitted from this list. However, in
447 future that may change, so we include them here so as not to leave a
448 gotcha for a future maintainer. */
449
450 case OP_UPTO:
451 case OP_UPTOI:
452 case OP_NOTUPTO:
453 case OP_NOTUPTOI:
454 case OP_MINUPTO:
455 case OP_MINUPTOI:
456 case OP_NOTMINUPTO:
457 case OP_NOTMINUPTOI:
458 case OP_POSUPTO:
459 case OP_POSUPTOI:
460 case OP_NOTPOSUPTO:
461 case OP_NOTPOSUPTOI:
462
463 case OP_STAR:
464 case OP_STARI:
465 case OP_NOTSTAR:
466 case OP_NOTSTARI:
467 case OP_MINSTAR:
468 case OP_MINSTARI:
469 case OP_NOTMINSTAR:
470 case OP_NOTMINSTARI:
471 case OP_POSSTAR:
472 case OP_POSSTARI:
473 case OP_NOTPOSSTAR:
474 case OP_NOTPOSSTARI:
475
476 case OP_QUERY:
477 case OP_QUERYI:
478 case OP_NOTQUERY:
479 case OP_NOTQUERYI:
480 case OP_MINQUERY:
481 case OP_MINQUERYI:
482 case OP_NOTMINQUERY:
483 case OP_NOTMINQUERYI:
484 case OP_POSQUERY:
485 case OP_POSQUERYI:
486 case OP_NOTPOSQUERY:
487 case OP_NOTPOSQUERYI:
488
489 cc += PRIV(OP_lengths)[op];
490 #ifdef SUPPORT_UTF
491 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
492 #endif
493 break;
494
495 /* Skip these, but we need to add in the name length. */
496
497 case OP_MARK:
498 case OP_PRUNE_ARG:
499 case OP_SKIP_ARG:
500 case OP_THEN_ARG:
501 cc += PRIV(OP_lengths)[op] + cc[1];
502 break;
503
504 /* The remaining opcodes are just skipped over. */
505
506 case OP_CLOSE:
507 case OP_COMMIT:
508 case OP_FAIL:
509 case OP_PRUNE:
510 case OP_SET_SOM:
511 case OP_SKIP:
512 case OP_THEN:
513 cc += PRIV(OP_lengths)[op];
514 break;
515
516 /* This should not occur: we list all opcodes explicitly so that when
517 new ones get added they are properly considered. */
518
519 default:
520 return -3;
521 }
522 }
523 /* Control never gets here */
524 }
525
526
527
528 /*************************************************
529 * Set a bit and maybe its alternate case *
530 *************************************************/
531
532 /* Given a character, set its first byte's bit in the table, and also the
533 corresponding bit for the other version of a letter if we are caseless. In
534 UTF-8 mode, for characters greater than 127, we can only do the caseless thing
535 when Unicode property support is available.
536
537 Arguments:
538 start_bits points to the bit map
539 p points to the character
540 caseless the caseless flag
541 cd the block with char table pointers
542 utf TRUE for UTF-8 / UTF-16 / UTF-32 mode
543
544 Returns: pointer after the character
545 */
546
547 static const pcre_uchar *
548 set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
549 compile_data *cd, BOOL utf)
550 {
551 pcre_uint32 c = *p;
552
553 #ifdef COMPILE_PCRE8
554 SET_BIT(c);
555
556 #ifdef SUPPORT_UTF
557 if (utf && c > 127)
558 {
559 GETCHARINC(c, p);
560 #ifdef SUPPORT_UCP
561 if (caseless)
562 {
563 pcre_uchar buff[6];
564 c = UCD_OTHERCASE(c);
565 (void)PRIV(ord2utf)(c, buff);
566 SET_BIT(buff[0]);
567 }
568 #endif /* Not SUPPORT_UCP */
569 return p;
570 }
571 #else /* Not SUPPORT_UTF */
572 (void)(utf); /* Stops warning for unused parameter */
573 #endif /* SUPPORT_UTF */
574
575 /* Not UTF-8 mode, or character is less than 127. */
576
577 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
578 return p + 1;
579 #endif /* COMPILE_PCRE8 */
580
581 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
582 if (c > 0xff)
583 {
584 c = 0xff;
585 caseless = FALSE;
586 }
587 SET_BIT(c);
588
589 #ifdef SUPPORT_UTF
590 if (utf && c > 127)
591 {
592 GETCHARINC(c, p);
593 #ifdef SUPPORT_UCP
594 if (caseless)
595 {
596 c = UCD_OTHERCASE(c);
597 if (c > 0xff)
598 c = 0xff;
599 SET_BIT(c);
600 }
601 #endif /* SUPPORT_UCP */
602 return p;
603 }
604 #else /* Not SUPPORT_UTF */
605 (void)(utf); /* Stops warning for unused parameter */
606 #endif /* SUPPORT_UTF */
607
608 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
609 return p + 1;
610 #endif
611 }
612
613
614
615 /*************************************************
616 * Set bits for a positive character type *
617 *************************************************/
618
619 /* This function sets starting bits for a character type. In UTF-8 mode, we can
620 only do a direct setting for bytes less than 128, as otherwise there can be
621 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
622 environment, the tables will only recognize ASCII characters anyway, but in at
623 least one Windows environment, some higher bytes bits were set in the tables.
624 So we deal with that case by considering the UTF-8 encoding.
625
626 Arguments:
627 start_bits the starting bitmap
628 cbit type the type of character wanted
629 table_limit 32 for non-UTF-8; 16 for UTF-8
630 cd the block with char table pointers
631
632 Returns: nothing
633 */
634
635 static void
636 set_type_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
637 compile_data *cd)
638 {
639 register pcre_uint32 c;
640 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
641 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
642 if (table_limit == 32) return;
643 for (c = 128; c < 256; c++)
644 {
645 if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
646 {
647 pcre_uchar buff[6];
648 (void)PRIV(ord2utf)(c, buff);
649 SET_BIT(buff[0]);
650 }
651 }
652 #endif
653 }
654
655
656 /*************************************************
657 * Set bits for a negative character type *
658 *************************************************/
659
660 /* This function sets starting bits for a negative character type such as \D.
661 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
662 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
663 Unlike in the positive case, where we can set appropriate starting bits for
664 specific high-valued UTF-8 characters, in this case we have to set the bits for
665 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
666 0xc0 (192) for simplicity.
667
668 Arguments:
669 start_bits the starting bitmap
670 cbit type the type of character wanted
671 table_limit 32 for non-UTF-8; 16 for UTF-8
672 cd the block with char table pointers
673
674 Returns: nothing
675 */
676
677 static void
678 set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
679 compile_data *cd)
680 {
681 register pcre_uint32 c;
682 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
683 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
684 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
685 #endif
686 }
687
688
689
690 /*************************************************
691 * Create bitmap of starting bytes *
692 *************************************************/
693
694 /* This function scans a compiled unanchored expression recursively and
695 attempts to build a bitmap of the set of possible starting bytes. As time goes
696 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
697 useful for parenthesized groups in patterns such as (a*)b where the group
698 provides some optional starting bytes but scanning must continue at the outer
699 level to find at least one mandatory byte. At the outermost level, this
700 function fails unless the result is SSB_DONE.
701
702 Arguments:
703 code points to an expression
704 start_bits points to a 32-byte table, initialized to 0
705 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
706 cd the block with char table pointers
707
708 Returns: SSB_FAIL => Failed to find any starting bytes
709 SSB_DONE => Found mandatory starting bytes
710 SSB_CONTINUE => Found optional starting bytes
711 SSB_UNKNOWN => Hit an unrecognized opcode
712 */
713
714 static int
715 set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
716 compile_data *cd)
717 {
718 register pcre_uint32 c;
719 int yield = SSB_DONE;
720 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
721 int table_limit = utf? 16:32;
722 #else
723 int table_limit = 32;
724 #endif
725
726 #if 0
727 /* ========================================================================= */
728 /* The following comment and code was inserted in January 1999. In May 2006,
729 when it was observed to cause compiler warnings about unused values, I took it
730 out again. If anybody is still using OS/2, they will have to put it back
731 manually. */
732
733 /* This next statement and the later reference to dummy are here in order to
734 trick the optimizer of the IBM C compiler for OS/2 into generating correct
735 code. Apparently IBM isn't going to fix the problem, and we would rather not
736 disable optimization (in this module it actually makes a big difference, and
737 the pcre module can use all the optimization it can get). */
738
739 volatile int dummy;
740 /* ========================================================================= */
741 #endif
742
743 do
744 {
745 BOOL try_next = TRUE;
746 const pcre_uchar *tcode = code + 1 + LINK_SIZE;
747
748 if (*code == OP_CBRA || *code == OP_SCBRA ||
749 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
750
751 while (try_next) /* Loop for items in this branch */
752 {
753 int rc;
754
755 switch(*tcode)
756 {
757 /* If we reach something we don't understand, it means a new opcode has
758 been created that hasn't been added to this code. Hopefully this problem
759 will be discovered during testing. */
760
761 default:
762 return SSB_UNKNOWN;
763
764 /* Fail for a valid opcode that implies no starting bits. */
765
766 case OP_ACCEPT:
767 case OP_ASSERT_ACCEPT:
768 case OP_ALLANY:
769 case OP_ANY:
770 case OP_ANYBYTE:
771 case OP_CIRC:
772 case OP_CIRCM:
773 case OP_CLOSE:
774 case OP_COMMIT:
775 case OP_COND:
776 case OP_CREF:
777 case OP_DEF:
778 case OP_DOLL:
779 case OP_DOLLM:
780 case OP_END:
781 case OP_EOD:
782 case OP_EODN:
783 case OP_EXTUNI:
784 case OP_FAIL:
785 case OP_MARK:
786 case OP_NCREF:
787 case OP_NOT:
788 case OP_NOTEXACT:
789 case OP_NOTEXACTI:
790 case OP_NOTI:
791 case OP_NOTMINPLUS:
792 case OP_NOTMINPLUSI:
793 case OP_NOTMINQUERY:
794 case OP_NOTMINQUERYI:
795 case OP_NOTMINSTAR:
796 case OP_NOTMINSTARI:
797 case OP_NOTMINUPTO:
798 case OP_NOTMINUPTOI:
799 case OP_NOTPLUS:
800 case OP_NOTPLUSI:
801 case OP_NOTPOSPLUS:
802 case OP_NOTPOSPLUSI:
803 case OP_NOTPOSQUERY:
804 case OP_NOTPOSQUERYI:
805 case OP_NOTPOSSTAR:
806 case OP_NOTPOSSTARI:
807 case OP_NOTPOSUPTO:
808 case OP_NOTPOSUPTOI:
809 case OP_NOTPROP:
810 case OP_NOTQUERY:
811 case OP_NOTQUERYI:
812 case OP_NOTSTAR:
813 case OP_NOTSTARI:
814 case OP_NOTUPTO:
815 case OP_NOTUPTOI:
816 case OP_NOT_HSPACE:
817 case OP_NOT_VSPACE:
818 case OP_NRREF:
819 case OP_PROP:
820 case OP_PRUNE:
821 case OP_PRUNE_ARG:
822 case OP_RECURSE:
823 case OP_REF:
824 case OP_REFI:
825 case OP_REVERSE:
826 case OP_RREF:
827 case OP_SCOND:
828 case OP_SET_SOM:
829 case OP_SKIP:
830 case OP_SKIP_ARG:
831 case OP_SOD:
832 case OP_SOM:
833 case OP_THEN:
834 case OP_THEN_ARG:
835 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
836 case OP_XCLASS:
837 #endif
838 return SSB_FAIL;
839
840 /* We can ignore word boundary tests. */
841
842 case OP_WORD_BOUNDARY:
843 case OP_NOT_WORD_BOUNDARY:
844 tcode++;
845 break;
846
847 /* If we hit a bracket or a positive lookahead assertion, recurse to set
848 bits from within the subpattern. If it can't find anything, we have to
849 give up. If it finds some mandatory character(s), we are done for this
850 branch. Otherwise, carry on scanning after the subpattern. */
851
852 case OP_BRA:
853 case OP_SBRA:
854 case OP_CBRA:
855 case OP_SCBRA:
856 case OP_BRAPOS:
857 case OP_SBRAPOS:
858 case OP_CBRAPOS:
859 case OP_SCBRAPOS:
860 case OP_ONCE:
861 case OP_ONCE_NC:
862 case OP_ASSERT:
863 rc = set_start_bits(tcode, start_bits, utf, cd);
864 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
865 if (rc == SSB_DONE) try_next = FALSE; else
866 {
867 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
868 tcode += 1 + LINK_SIZE;
869 }
870 break;
871
872 /* If we hit ALT or KET, it means we haven't found anything mandatory in
873 this branch, though we might have found something optional. For ALT, we
874 continue with the next alternative, but we have to arrange that the final
875 result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
876 return SSB_CONTINUE: if this is the top level, that indicates failure,
877 but after a nested subpattern, it causes scanning to continue. */
878
879 case OP_ALT:
880 yield = SSB_CONTINUE;
881 try_next = FALSE;
882 break;
883
884 case OP_KET:
885 case OP_KETRMAX:
886 case OP_KETRMIN:
887 case OP_KETRPOS:
888 return SSB_CONTINUE;
889
890 /* Skip over callout */
891
892 case OP_CALLOUT:
893 tcode += 2 + 2*LINK_SIZE;
894 break;
895
896 /* Skip over lookbehind and negative lookahead assertions */
897
898 case OP_ASSERT_NOT:
899 case OP_ASSERTBACK:
900 case OP_ASSERTBACK_NOT:
901 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
902 tcode += 1 + LINK_SIZE;
903 break;
904
905 /* BRAZERO does the bracket, but carries on. */
906
907 case OP_BRAZERO:
908 case OP_BRAMINZERO:
909 case OP_BRAPOSZERO:
910 rc = set_start_bits(++tcode, start_bits, utf, cd);
911 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
912 /* =========================================================================
913 See the comment at the head of this function concerning the next line,
914 which was an old fudge for the benefit of OS/2.
915 dummy = 1;
916 ========================================================================= */
917 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
918 tcode += 1 + LINK_SIZE;
919 break;
920
921 /* SKIPZERO skips the bracket. */
922
923 case OP_SKIPZERO:
924 tcode++;
925 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
926 tcode += 1 + LINK_SIZE;
927 break;
928
929 /* Single-char * or ? sets the bit and tries the next item */
930
931 case OP_STAR:
932 case OP_MINSTAR:
933 case OP_POSSTAR:
934 case OP_QUERY:
935 case OP_MINQUERY:
936 case OP_POSQUERY:
937 tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
938 break;
939
940 case OP_STARI:
941 case OP_MINSTARI:
942 case OP_POSSTARI:
943 case OP_QUERYI:
944 case OP_MINQUERYI:
945 case OP_POSQUERYI:
946 tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
947 break;
948
949 /* Single-char upto sets the bit and tries the next */
950
951 case OP_UPTO:
952 case OP_MINUPTO:
953 case OP_POSUPTO:
954 tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
955 break;
956
957 case OP_UPTOI:
958 case OP_MINUPTOI:
959 case OP_POSUPTOI:
960 tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
961 break;
962
963 /* At least one single char sets the bit and stops */
964
965 case OP_EXACT:
966 tcode += IMM2_SIZE;
967 /* Fall through */
968 case OP_CHAR:
969 case OP_PLUS:
970 case OP_MINPLUS:
971 case OP_POSPLUS:
972 (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
973 try_next = FALSE;
974 break;
975
976 case OP_EXACTI:
977 tcode += IMM2_SIZE;
978 /* Fall through */
979 case OP_CHARI:
980 case OP_PLUSI:
981 case OP_MINPLUSI:
982 case OP_POSPLUSI:
983 (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
984 try_next = FALSE;
985 break;
986
987 /* Special spacing and line-terminating items. These recognize specific
988 lists of characters. The difference between VSPACE and ANYNL is that the
989 latter can match the two-character CRLF sequence, but that is not
990 relevant for finding the first character, so their code here is
991 identical. */
992
993 case OP_HSPACE:
994 SET_BIT(CHAR_HT);
995 SET_BIT(CHAR_SPACE);
996 #ifdef SUPPORT_UTF
997 if (utf)
998 {
999 #ifdef COMPILE_PCRE8
1000 SET_BIT(0xC2); /* For U+00A0 */
1001 SET_BIT(0xE1); /* For U+1680, U+180E */
1002 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1003 SET_BIT(0xE3); /* For U+3000 */
1004 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1005 SET_BIT(0xA0);
1006 SET_BIT(0xFF); /* For characters > 255 */
1007 #endif /* COMPILE_PCRE[8|16|32] */
1008 }
1009 else
1010 #endif /* SUPPORT_UTF */
1011 {
1012 #ifndef EBCDIC
1013 SET_BIT(0xA0);
1014 #endif /* Not EBCDIC */
1015 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1016 SET_BIT(0xFF); /* For characters > 255 */
1017 #endif /* COMPILE_PCRE[16|32] */
1018 }
1019 try_next = FALSE;
1020 break;
1021
1022 case OP_ANYNL:
1023 case OP_VSPACE:
1024 SET_BIT(CHAR_LF);
1025 SET_BIT(CHAR_VT);
1026 SET_BIT(CHAR_FF);
1027 SET_BIT(CHAR_CR);
1028 #ifdef SUPPORT_UTF
1029 if (utf)
1030 {
1031 #ifdef COMPILE_PCRE8
1032 SET_BIT(0xC2); /* For U+0085 */
1033 SET_BIT(0xE2); /* For U+2028, U+2029 */
1034 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1035 SET_BIT(CHAR_NEL);
1036 SET_BIT(0xFF); /* For characters > 255 */
1037 #endif /* COMPILE_PCRE[8|16|32] */
1038 }
1039 else
1040 #endif /* SUPPORT_UTF */
1041 {
1042 SET_BIT(CHAR_NEL);
1043 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1044 SET_BIT(0xFF); /* For characters > 255 */
1045 #endif
1046 }
1047 try_next = FALSE;
1048 break;
1049
1050 /* Single character types set the bits and stop. Note that if PCRE_UCP
1051 is set, we do not see these op codes because \d etc are converted to
1052 properties. Therefore, these apply in the case when only characters less
1053 than 256 are recognized to match the types. */
1054
1055 case OP_NOT_DIGIT:
1056 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1057 try_next = FALSE;
1058 break;
1059
1060 case OP_DIGIT:
1061 set_type_bits(start_bits, cbit_digit, table_limit, cd);
1062 try_next = FALSE;
1063 break;
1064
1065 /* The cbit_space table has vertical tab as whitespace; we have to
1066 ensure it is set as not whitespace. Luckily, the code value is the same
1067 (0x0b) in ASCII and EBCDIC, so we can just adjust the appropriate bit. */
1068
1069 case OP_NOT_WHITESPACE:
1070 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1071 start_bits[1] |= 0x08;
1072 try_next = FALSE;
1073 break;
1074
1075 /* The cbit_space table has vertical tab as whitespace; we have to not
1076 set it from the table. Luckily, the code value is the same (0x0b) in
1077 ASCII and EBCDIC, so we can just adjust the appropriate bit. */
1078
1079 case OP_WHITESPACE:
1080 c = start_bits[1]; /* Save in case it was already set */
1081 set_type_bits(start_bits, cbit_space, table_limit, cd);
1082 start_bits[1] = (start_bits[1] & ~0x08) | c;
1083 try_next = FALSE;
1084 break;
1085
1086 case OP_NOT_WORDCHAR:
1087 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1088 try_next = FALSE;
1089 break;
1090
1091 case OP_WORDCHAR:
1092 set_type_bits(start_bits, cbit_word, table_limit, cd);
1093 try_next = FALSE;
1094 break;
1095
1096 /* One or more character type fudges the pointer and restarts, knowing
1097 it will hit a single character type and stop there. */
1098
1099 case OP_TYPEPLUS:
1100 case OP_TYPEMINPLUS:
1101 case OP_TYPEPOSPLUS:
1102 tcode++;
1103 break;
1104
1105 case OP_TYPEEXACT:
1106 tcode += 1 + IMM2_SIZE;
1107 break;
1108
1109 /* Zero or more repeats of character types set the bits and then
1110 try again. */
1111
1112 case OP_TYPEUPTO:
1113 case OP_TYPEMINUPTO:
1114 case OP_TYPEPOSUPTO:
1115 tcode += IMM2_SIZE; /* Fall through */
1116
1117 case OP_TYPESTAR:
1118 case OP_TYPEMINSTAR:
1119 case OP_TYPEPOSSTAR:
1120 case OP_TYPEQUERY:
1121 case OP_TYPEMINQUERY:
1122 case OP_TYPEPOSQUERY:
1123 switch(tcode[1])
1124 {
1125 default:
1126 case OP_ANY:
1127 case OP_ALLANY:
1128 return SSB_FAIL;
1129
1130 case OP_HSPACE:
1131 SET_BIT(CHAR_HT);
1132 SET_BIT(CHAR_SPACE);
1133 #ifdef SUPPORT_UTF
1134 if (utf)
1135 {
1136 #ifdef COMPILE_PCRE8
1137 SET_BIT(0xC2); /* For U+00A0 */
1138 SET_BIT(0xE1); /* For U+1680, U+180E */
1139 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1140 SET_BIT(0xE3); /* For U+3000 */
1141 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1142 SET_BIT(0xA0);
1143 SET_BIT(0xFF); /* For characters > 255 */
1144 #endif /* COMPILE_PCRE[8|16|32] */
1145 }
1146 else
1147 #endif /* SUPPORT_UTF */
1148 #ifndef EBCDIC
1149 SET_BIT(0xA0);
1150 #endif /* Not EBCDIC */
1151 break;
1152
1153 case OP_ANYNL:
1154 case OP_VSPACE:
1155 SET_BIT(CHAR_LF);
1156 SET_BIT(CHAR_VT);
1157 SET_BIT(CHAR_FF);
1158 SET_BIT(CHAR_CR);
1159 #ifdef SUPPORT_UTF
1160 if (utf)
1161 {
1162 #ifdef COMPILE_PCRE8
1163 SET_BIT(0xC2); /* For U+0085 */
1164 SET_BIT(0xE2); /* For U+2028, U+2029 */
1165 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1166 SET_BIT(CHAR_NEL);
1167 SET_BIT(0xFF); /* For characters > 255 */
1168 #endif /* COMPILE_PCRE16 */
1169 }
1170 else
1171 #endif /* SUPPORT_UTF */
1172 SET_BIT(CHAR_NEL);
1173 break;
1174
1175 case OP_NOT_DIGIT:
1176 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1177 break;
1178
1179 case OP_DIGIT:
1180 set_type_bits(start_bits, cbit_digit, table_limit, cd);
1181 break;
1182
1183 /* The cbit_space table has vertical tab as whitespace; we have to
1184 ensure it gets set as not whitespace. Luckily, the code value is the
1185 same (0x0b) in ASCII and EBCDIC, so we can just adjust the appropriate
1186 bit. */
1187
1188 case OP_NOT_WHITESPACE:
1189 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1190 start_bits[1] |= 0x08;
1191 break;
1192
1193 /* The cbit_space table has vertical tab as whitespace; we have to
1194 avoid setting it. Luckily, the code value is the same (0x0b) in ASCII
1195 and EBCDIC, so we can just adjust the appropriate bit. */
1196
1197 case OP_WHITESPACE:
1198 c = start_bits[1]; /* Save in case it was already set */
1199 set_type_bits(start_bits, cbit_space, table_limit, cd);
1200 start_bits[1] = (start_bits[1] & ~0x08) | c;
1201 break;
1202
1203 case OP_NOT_WORDCHAR:
1204 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1205 break;
1206
1207 case OP_WORDCHAR:
1208 set_type_bits(start_bits, cbit_word, table_limit, cd);
1209 break;
1210 }
1211
1212 tcode += 2;
1213 break;
1214
1215 /* Character class where all the information is in a bit map: set the
1216 bits and either carry on or not, according to the repeat count. If it was
1217 a negative class, and we are operating with UTF-8 characters, any byte
1218 with a value >= 0xc4 is a potentially valid starter because it starts a
1219 character with a value > 255. */
1220
1221 case OP_NCLASS:
1222 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1223 if (utf)
1224 {
1225 start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
1226 memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
1227 }
1228 #endif
1229 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1230 SET_BIT(0xFF); /* For characters > 255 */
1231 #endif
1232 /* Fall through */
1233
1234 case OP_CLASS:
1235 {
1236 pcre_uint8 *map;
1237 tcode++;
1238 map = (pcre_uint8 *)tcode;
1239
1240 /* In UTF-8 mode, the bits in a bit map correspond to character
1241 values, not to byte values. However, the bit map we are constructing is
1242 for byte values. So we have to do a conversion for characters whose
1243 value is > 127. In fact, there are only two possible starting bytes for
1244 characters in the range 128 - 255. */
1245
1246 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1247 if (utf)
1248 {
1249 for (c = 0; c < 16; c++) start_bits[c] |= map[c];
1250 for (c = 128; c < 256; c++)
1251 {
1252 if ((map[c/8] && (1 << (c&7))) != 0)
1253 {
1254 int d = (c >> 6) | 0xc0; /* Set bit for this starter */
1255 start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
1256 c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
1257 }
1258 }
1259 }
1260 else
1261 #endif
1262 {
1263 /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1264 for (c = 0; c < 32; c++) start_bits[c] |= map[c];
1265 }
1266
1267 /* Advance past the bit map, and act on what follows. For a zero
1268 minimum repeat, continue; otherwise stop processing. */
1269
1270 tcode += 32 / sizeof(pcre_uchar);
1271 switch (*tcode)
1272 {
1273 case OP_CRSTAR:
1274 case OP_CRMINSTAR:
1275 case OP_CRQUERY:
1276 case OP_CRMINQUERY:
1277 tcode++;
1278 break;
1279
1280 case OP_CRRANGE:
1281 case OP_CRMINRANGE:
1282 if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1283 else try_next = FALSE;
1284 break;
1285
1286 default:
1287 try_next = FALSE;
1288 break;
1289 }
1290 }
1291 break; /* End of bitmap class handling */
1292
1293 } /* End of switch */
1294 } /* End of try_next loop */
1295
1296 code += GET(code, 1); /* Advance to next branch */
1297 }
1298 while (*code == OP_ALT);
1299 return yield;
1300 }
1301
1302
1303
1304
1305
1306 /*************************************************
1307 * Study a compiled expression *
1308 *************************************************/
1309
1310 /* This function is handed a compiled expression that it must study to produce
1311 information that will speed up the matching. It returns a pcre[16]_extra block
1312 which then gets handed back to pcre_exec().
1313
1314 Arguments:
1315 re points to the compiled expression
1316 options contains option bits
1317 errorptr points to where to place error messages;
1318 set NULL unless error
1319
1320 Returns: pointer to a pcre[16]_extra block, with study_data filled in and
1321 the appropriate flags set;
1322 NULL on error or if no optimization possible
1323 */
1324
1325 #if defined COMPILE_PCRE8
1326 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1327 pcre_study(const pcre *external_re, int options, const char **errorptr)
1328 #elif defined COMPILE_PCRE16
1329 PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
1330 pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
1331 #elif defined COMPILE_PCRE32
1332 PCRE_EXP_DEFN pcre32_extra * PCRE_CALL_CONVENTION
1333 pcre32_study(const pcre32 *external_re, int options, const char **errorptr)
1334 #endif
1335 {
1336 int min;
1337 BOOL bits_set = FALSE;
1338 pcre_uint8 start_bits[32];
1339 PUBL(extra) *extra = NULL;
1340 pcre_study_data *study;
1341 const pcre_uint8 *tables;
1342 pcre_uchar *code;
1343 compile_data compile_block;
1344 const REAL_PCRE *re = (const REAL_PCRE *)external_re;
1345
1346 *errorptr = NULL;
1347
1348 if (re == NULL || re->magic_number != MAGIC_NUMBER)
1349 {
1350 *errorptr = "argument is not a compiled regular expression";
1351 return NULL;
1352 }
1353
1354 if ((re->flags & PCRE_MODE) == 0)
1355 {
1356 #if defined COMPILE_PCRE8
1357 *errorptr = "argument not compiled in 8 bit mode";
1358 #elif defined COMPILE_PCRE16
1359 *errorptr = "argument not compiled in 16 bit mode";
1360 #elif defined COMPILE_PCRE32
1361 *errorptr = "argument not compiled in 32 bit mode";
1362 #endif
1363 return NULL;
1364 }
1365
1366 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1367 {
1368 *errorptr = "unknown or incorrect option bit(s) set";
1369 return NULL;
1370 }
1371
1372 code = (pcre_uchar *)re + re->name_table_offset +
1373 (re->name_count * re->name_entry_size);
1374
1375 /* For an anchored pattern, or an unanchored pattern that has a first char, or
1376 a multiline pattern that matches only at "line starts", there is no point in
1377 seeking a list of starting bytes. */
1378
1379 if ((re->options & PCRE_ANCHORED) == 0 &&
1380 (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1381 {
1382 int rc;
1383
1384 /* Set the character tables in the block that is passed around */
1385
1386 tables = re->tables;
1387
1388 #if defined COMPILE_PCRE8
1389 if (tables == NULL)
1390 (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1391 (void *)(&tables));
1392 #elif defined COMPILE_PCRE16
1393 if (tables == NULL)
1394 (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1395 (void *)(&tables));
1396 #elif defined COMPILE_PCRE32
1397 if (tables == NULL)
1398 (void)pcre32_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1399 (void *)(&tables));
1400 #endif
1401
1402 compile_block.lcc = tables + lcc_offset;
1403 compile_block.fcc = tables + fcc_offset;
1404 compile_block.cbits = tables + cbits_offset;
1405 compile_block.ctypes = tables + ctypes_offset;
1406
1407 /* See if we can find a fixed set of initial characters for the pattern. */
1408
1409 memset(start_bits, 0, 32 * sizeof(pcre_uint8));
1410 rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1411 &compile_block);
1412 bits_set = rc == SSB_DONE;
1413 if (rc == SSB_UNKNOWN)
1414 {
1415 *errorptr = "internal error: opcode not recognized";
1416 return NULL;
1417 }
1418 }
1419
1420 /* Find the minimum length of subject string. */
1421
1422 switch(min = find_minlength(code, code, re->options, 0))
1423 {
1424 case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
1425 case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
1426 default: break;
1427 }
1428
1429 /* If a set of starting bytes has been identified, or if the minimum length is
1430 greater than zero, or if JIT optimization has been requested, or if
1431 PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a
1432 pcre_study_data block. The study data is put in the latter, which is pointed to
1433 by the former, which may also get additional data set later by the calling
1434 program. At the moment, the size of pcre_study_data is fixed. We nevertheless
1435 save it in a field for returning via the pcre_fullinfo() function so that if it
1436 becomes variable in the future, we don't have to change that code. */
1437
1438 if (bits_set || min > 0 || (options & (
1439 #ifdef SUPPORT_JIT
1440 PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE |
1441 PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE |
1442 #endif
1443 PCRE_STUDY_EXTRA_NEEDED)) != 0)
1444 {
1445 extra = (PUBL(extra) *)(PUBL(malloc))
1446 (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
1447 if (extra == NULL)
1448 {
1449 *errorptr = "failed to get memory";
1450 return NULL;
1451 }
1452
1453 study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
1454 extra->flags = PCRE_EXTRA_STUDY_DATA;
1455 extra->study_data = study;
1456
1457 study->size = sizeof(pcre_study_data);
1458 study->flags = 0;
1459
1460 /* Set the start bits always, to avoid unset memory errors if the
1461 study data is written to a file, but set the flag only if any of the bits
1462 are set, to save time looking when none are. */
1463
1464 if (bits_set)
1465 {
1466 study->flags |= PCRE_STUDY_MAPPED;
1467 memcpy(study->start_bits, start_bits, sizeof(start_bits));
1468 }
1469 else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
1470
1471 #ifdef PCRE_DEBUG
1472 if (bits_set)
1473 {
1474 pcre_uint8 *ptr = start_bits;
1475 int i;
1476
1477 printf("Start bits:\n");
1478 for (i = 0; i < 32; i++)
1479 printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
1480 }
1481 #endif
1482
1483 /* Always set the minlength value in the block, because the JIT compiler
1484 makes use of it. However, don't set the bit unless the length is greater than
1485 zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
1486 checking the zero case. */
1487
1488 if (min > 0)
1489 {
1490 study->flags |= PCRE_STUDY_MINLEN;
1491 study->minlength = min;
1492 }
1493 else study->minlength = 0;
1494
1495 /* If JIT support was compiled and requested, attempt the JIT compilation.
1496 If no starting bytes were found, and the minimum length is zero, and JIT
1497 compilation fails, abandon the extra block and return NULL, unless
1498 PCRE_STUDY_EXTRA_NEEDED is set. */
1499
1500 #ifdef SUPPORT_JIT
1501 extra->executable_jit = NULL;
1502 if ((options & PCRE_STUDY_JIT_COMPILE) != 0)
1503 PRIV(jit_compile)(re, extra, JIT_COMPILE);
1504 if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0)
1505 PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE);
1506 if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0)
1507 PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE);
1508
1509 if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0 &&
1510 (options & PCRE_STUDY_EXTRA_NEEDED) == 0)
1511 {
1512 #if defined COMPILE_PCRE8
1513 pcre_free_study(extra);
1514 #elif defined COMPILE_PCRE16
1515 pcre16_free_study(extra);
1516 #elif defined COMPILE_PCRE32
1517 pcre32_free_study(extra);
1518 #endif
1519 extra = NULL;
1520 }
1521 #endif
1522 }
1523
1524 return extra;
1525 }
1526
1527
1528 /*************************************************
1529 * Free the study data *
1530 *************************************************/
1531
1532 /* This function frees the memory that was obtained by pcre_study().
1533
1534 Argument: a pointer to the pcre[16]_extra block
1535 Returns: nothing
1536 */
1537
1538 #if defined COMPILE_PCRE8
1539 PCRE_EXP_DEFN void
1540 pcre_free_study(pcre_extra *extra)
1541 #elif defined COMPILE_PCRE16
1542 PCRE_EXP_DEFN void
1543 pcre16_free_study(pcre16_extra *extra)
1544 #elif defined COMPILE_PCRE32
1545 PCRE_EXP_DEFN void
1546 pcre32_free_study(pcre32_extra *extra)
1547 #endif
1548 {
1549 if (extra == NULL)
1550 return;
1551 #ifdef SUPPORT_JIT
1552 if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
1553 extra->executable_jit != NULL)
1554 PRIV(jit_free)(extra->executable_jit);
1555 #endif
1556 PUBL(free)(extra);
1557 }
1558
1559 /* End of pcre_study.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5