/[pcre]/code/trunk/pcre.c
ViewVC logotype

Contents of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 31 - (show annotations)
Sat Feb 24 21:38:57 2007 UTC (12 years, 3 months ago) by nigel
File MIME type: text/plain
File size: 125923 byte(s)
Load pcre-2.04 into code/trunk.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1997-1999 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28
29 4. If PCRE is embedded in any software that is released under the GNU
30 General Purpose Licence (GPL), then the terms of that licence shall
31 supersede any condition above with which it is incompatible.
32 -----------------------------------------------------------------------------
33 */
34
35
36 /* Define DEBUG to get debugging output on stdout. */
37
38 /* #define DEBUG */
39
40 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
41 inline, and there are *still* stupid compilers about that don't like indented
42 pre-processor statements. I suppose it's only been 10 years... */
43
44 #ifdef DEBUG
45 #define DPRINTF(p) printf p
46 #else
47 #define DPRINTF(p) /*nothing*/
48 #endif
49
50 /* Include the internals header, which itself includes Standard C headers plus
51 the external pcre header. */
52
53 #include "internal.h"
54
55
56 /* Allow compilation as C++ source code, should anybody want to do that. */
57
58 #ifdef __cplusplus
59 #define class pcre_class
60 #endif
61
62
63 /* Number of items on the nested bracket stacks at compile time. This should
64 not be set greater than 200. */
65
66 #define BRASTACK_SIZE 200
67
68
69 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
70
71 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
72 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
73
74 /* Text forms of OP_ values and things, for debugging (not all used) */
75
76 #ifdef DEBUG
77 static const char *OP_names[] = {
78 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
79 "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
80 "Opt", "^", "$", "Any", "chars", "not",
81 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
82 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
83 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
84 "*", "*?", "+", "+?", "?", "??", "{", "{",
85 "class", "Ref",
86 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
87 "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
88 "Brazero", "Braminzero", "Bra"
89 };
90 #endif
91
92 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
93 are simple data values; negative values are for special things like \d and so
94 on. Zero means further processing is needed (for things like \x), or the escape
95 is invalid. */
96
97 static const short int escapes[] = {
98 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
99 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
100 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
101 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
102 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
103 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
104 '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
105 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
106 0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
107 0, 0, -ESC_z /* x - z */
108 };
109
110 /* Definition to allow mutual recursion */
111
112 static BOOL
113 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
114 BOOL, int, compile_data *);
115
116
117
118 /*************************************************
119 * Global variables *
120 *************************************************/
121
122 /* PCRE is thread-clean and doesn't use any global variables in the normal
123 sense. However, it calls memory allocation and free functions via the two
124 indirections below, which are can be changed by the caller, but are shared
125 between all threads. */
126
127 void *(*pcre_malloc)(size_t) = malloc;
128 void (*pcre_free)(void *) = free;
129
130
131
132
133 /*************************************************
134 * Default character tables *
135 *************************************************/
136
137 /* A default set of character tables is included in the PCRE binary. Its source
138 is built by the maketables auxiliary program, which uses the default C ctypes
139 functions, and put in the file chartables.c. These tables are used by PCRE
140 whenever the caller of pcre_compile() does not provide an alternate set of
141 tables. */
142
143 #include "chartables.c"
144
145
146
147 /*************************************************
148 * Return version string *
149 *************************************************/
150
151 const char *
152 pcre_version(void)
153 {
154 return PCRE_VERSION;
155 }
156
157
158
159
160 /*************************************************
161 * Return info about a compiled pattern *
162 *************************************************/
163
164 /* This function picks potentially useful data out of the private
165 structure.
166
167 Arguments:
168 external_re points to compiled code
169 optptr where to pass back the options
170 first_char where to pass back the first character,
171 or -1 if multiline and all branches start ^,
172 or -2 otherwise
173
174 Returns: number of identifying extraction brackets
175 or negative values on error
176 */
177
178 int
179 pcre_info(const pcre *external_re, int *optptr, int *first_char)
180 {
181 const real_pcre *re = (const real_pcre *)external_re;
182 if (re == NULL) return PCRE_ERROR_NULL;
183 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
184 if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
185 if (first_char != NULL)
186 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
187 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
188 return re->top_bracket;
189 }
190
191
192
193
194 #ifdef DEBUG
195 /*************************************************
196 * Debugging function to print chars *
197 *************************************************/
198
199 /* Print a sequence of chars in printable format, stopping at the end of the
200 subject if the requested.
201
202 Arguments:
203 p points to characters
204 length number to print
205 is_subject TRUE if printing from within md->start_subject
206 md pointer to matching data block, if is_subject is TRUE
207
208 Returns: nothing
209 */
210
211 static void
212 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
213 {
214 int c;
215 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
216 while (length-- > 0)
217 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
218 }
219 #endif
220
221
222
223
224 /*************************************************
225 * Handle escapes *
226 *************************************************/
227
228 /* This function is called when a \ has been encountered. It either returns a
229 positive value for a simple escape such as \n, or a negative value which
230 encodes one of the more complicated things such as \d. On entry, ptr is
231 pointing at the \. On exit, it is on the final character of the escape
232 sequence.
233
234 Arguments:
235 ptrptr points to the pattern position pointer
236 errorptr points to the pointer to the error message
237 bracount number of previous extracting brackets
238 options the options bits
239 isclass TRUE if inside a character class
240 cd pointer to char tables block
241
242 Returns: zero or positive => a data character
243 negative => a special escape sequence
244 on error, errorptr is set
245 */
246
247 static int
248 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
249 int options, BOOL isclass, compile_data *cd)
250 {
251 const uschar *ptr = *ptrptr;
252 int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
253 int i;
254
255 if (c == 0) *errorptr = ERR1;
256
257 /* Digits or letters may have special meaning; all others are literals. */
258
259 else if (c < '0' || c > 'z') {}
260
261 /* Do an initial lookup in a table. A non-zero result is something that can be
262 returned immediately. Otherwise further processing may be required. */
263
264 else if ((i = escapes[c - '0']) != 0) c = i;
265
266 /* Escapes that need further processing, or are illegal. */
267
268 else
269 {
270 const uschar *oldptr;
271 switch (c)
272 {
273 /* The handling of escape sequences consisting of a string of digits
274 starting with one that is not zero is not straightforward. By experiment,
275 the way Perl works seems to be as follows:
276
277 Outside a character class, the digits are read as a decimal number. If the
278 number is less than 10, or if there are that many previous extracting
279 left brackets, then it is a back reference. Otherwise, up to three octal
280 digits are read to form an escaped byte. Thus \123 is likely to be octal
281 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
282 value is greater than 377, the least significant 8 bits are taken. Inside a
283 character class, \ followed by a digit is always an octal number. */
284
285 case '1': case '2': case '3': case '4': case '5':
286 case '6': case '7': case '8': case '9':
287
288 if (!isclass)
289 {
290 oldptr = ptr;
291 c -= '0';
292 while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
293 c = c * 10 + *(++ptr) - '0';
294 if (c < 10 || c <= bracount)
295 {
296 c = -(ESC_REF + c);
297 break;
298 }
299 ptr = oldptr; /* Put the pointer back and fall through */
300 }
301
302 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
303 generates a binary zero byte and treats the digit as a following literal.
304 Thus we have to pull back the pointer by one. */
305
306 if ((c = *ptr) >= '8')
307 {
308 ptr--;
309 c = 0;
310 break;
311 }
312
313 /* \0 always starts an octal number, but we may drop through to here with a
314 larger first octal digit */
315
316 case '0':
317 c -= '0';
318 while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
319 ptr[1] != '8' && ptr[1] != '9')
320 c = c * 8 + *(++ptr) - '0';
321 break;
322
323 /* Special escapes not starting with a digit are straightforward */
324
325 case 'x':
326 c = 0;
327 while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
328 {
329 ptr++;
330 c = c * 16 + cd->lcc[*ptr] -
331 (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
332 }
333 break;
334
335 case 'c':
336 c = *(++ptr);
337 if (c == 0)
338 {
339 *errorptr = ERR2;
340 return 0;
341 }
342
343 /* A letter is upper-cased; then the 0x40 bit is flipped */
344
345 if (c >= 'a' && c <= 'z') c = cd->fcc[c];
346 c ^= 0x40;
347 break;
348
349 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
350 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
351 for Perl compatibility, it is a literal. This code looks a bit odd, but
352 there used to be some cases other than the default, and there may be again
353 in future, so I haven't "optimized" it. */
354
355 default:
356 if ((options & PCRE_EXTRA) != 0) switch(c)
357 {
358 default:
359 *errorptr = ERR3;
360 break;
361 }
362 break;
363 }
364 }
365
366 *ptrptr = ptr;
367 return c;
368 }
369
370
371
372 /*************************************************
373 * Check for counted repeat *
374 *************************************************/
375
376 /* This function is called when a '{' is encountered in a place where it might
377 start a quantifier. It looks ahead to see if it really is a quantifier or not.
378 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
379 where the ddds are digits.
380
381 Arguments:
382 p pointer to the first char after '{'
383 cd pointer to char tables block
384
385 Returns: TRUE or FALSE
386 */
387
388 static BOOL
389 is_counted_repeat(const uschar *p, compile_data *cd)
390 {
391 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
392 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
393 if (*p == '}') return TRUE;
394
395 if (*p++ != ',') return FALSE;
396 if (*p == '}') return TRUE;
397
398 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
399 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
400 return (*p == '}');
401 }
402
403
404
405 /*************************************************
406 * Read repeat counts *
407 *************************************************/
408
409 /* Read an item of the form {n,m} and return the values. This is called only
410 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
411 so the syntax is guaranteed to be correct, but we need to check the values.
412
413 Arguments:
414 p pointer to first char after '{'
415 minp pointer to int for min
416 maxp pointer to int for max
417 returned as -1 if no max
418 errorptr points to pointer to error message
419 cd pointer to character tables clock
420
421 Returns: pointer to '}' on success;
422 current ptr on error, with errorptr set
423 */
424
425 static const uschar *
426 read_repeat_counts(const uschar *p, int *minp, int *maxp,
427 const char **errorptr, compile_data *cd)
428 {
429 int min = 0;
430 int max = -1;
431
432 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
433
434 if (*p == '}') max = min; else
435 {
436 if (*(++p) != '}')
437 {
438 max = 0;
439 while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
440 if (max < min)
441 {
442 *errorptr = ERR4;
443 return p;
444 }
445 }
446 }
447
448 /* Do paranoid checks, then fill in the required variables, and pass back the
449 pointer to the terminating '}'. */
450
451 if (min > 65535 || max > 65535)
452 *errorptr = ERR5;
453 else
454 {
455 *minp = min;
456 *maxp = max;
457 }
458 return p;
459 }
460
461
462
463 /*************************************************
464 * Find the fixed length of a pattern *
465 *************************************************/
466
467 /* Scan a pattern and compute the fixed length of subject that will match it,
468 if the length is fixed. This is needed for dealing with backward assertions.
469
470 Arguments:
471 code points to the start of the pattern (the bracket)
472
473 Returns: the fixed length, or -1 if there is no fixed length
474 */
475
476 static int
477 find_fixedlength(uschar *code)
478 {
479 int length = -1;
480
481 register int branchlength = 0;
482 register uschar *cc = code + 3;
483
484 /* Scan along the opcodes for this branch. If we get to the end of the
485 branch, check the length against that of the other branches. */
486
487 for (;;)
488 {
489 int d;
490 register int op = *cc;
491 if (op >= OP_BRA) op = OP_BRA;
492
493 switch (op)
494 {
495 case OP_BRA:
496 case OP_ONCE:
497 case OP_COND:
498 d = find_fixedlength(cc);
499 if (d < 0) return -1;
500 branchlength += d;
501 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
502 cc += 3;
503 break;
504
505 /* Reached end of a branch; if it's a ket it is the end of a nested
506 call. If it's ALT it is an alternation in a nested call. If it is
507 END it's the end of the outer call. All can be handled by the same code. */
508
509 case OP_ALT:
510 case OP_KET:
511 case OP_KETRMAX:
512 case OP_KETRMIN:
513 case OP_END:
514 if (length < 0) length = branchlength;
515 else if (length != branchlength) return -1;
516 if (*cc != OP_ALT) return length;
517 cc += 3;
518 branchlength = 0;
519 break;
520
521 /* Skip over assertive subpatterns */
522
523 case OP_ASSERT:
524 case OP_ASSERT_NOT:
525 case OP_ASSERTBACK:
526 case OP_ASSERTBACK_NOT:
527 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
528 cc += 3;
529 break;
530
531 /* Skip over things that don't match chars */
532
533 case OP_REVERSE:
534 cc++;
535
536 case OP_CREF:
537 case OP_OPT:
538 cc++;
539 /* Fall through */
540
541 case OP_SOD:
542 case OP_EOD:
543 case OP_EODN:
544 case OP_CIRC:
545 case OP_DOLL:
546 case OP_NOT_WORD_BOUNDARY:
547 case OP_WORD_BOUNDARY:
548 cc++;
549 break;
550
551 /* Handle char strings */
552
553 case OP_CHARS:
554 branchlength += *(++cc);
555 cc += *cc + 1;
556 break;
557
558 /* Handle exact repetitions */
559
560 case OP_EXACT:
561 case OP_TYPEEXACT:
562 branchlength += (cc[1] << 8) + cc[2];
563 cc += 4;
564 break;
565
566 /* Handle single-char matchers */
567
568 case OP_NOT_DIGIT:
569 case OP_DIGIT:
570 case OP_NOT_WHITESPACE:
571 case OP_WHITESPACE:
572 case OP_NOT_WORDCHAR:
573 case OP_WORDCHAR:
574 case OP_ANY:
575 branchlength++;
576 cc++;
577 break;
578
579
580 /* Check a class for variable quantification */
581
582 case OP_CLASS:
583 cc += (*cc == OP_REF)? 2 : 33;
584
585 switch (*cc)
586 {
587 case OP_CRSTAR:
588 case OP_CRMINSTAR:
589 case OP_CRQUERY:
590 case OP_CRMINQUERY:
591 return -1;
592
593 case OP_CRRANGE:
594 case OP_CRMINRANGE:
595 if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
596 branchlength += (cc[1] << 8) + cc[2];
597 cc += 5;
598 break;
599
600 default:
601 branchlength++;
602 }
603 break;
604
605 /* Anything else is variable length */
606
607 default:
608 return -1;
609 }
610 }
611 /* Control never gets here */
612 }
613
614
615
616
617 /*************************************************
618 * Compile one branch *
619 *************************************************/
620
621 /* Scan the pattern, compiling it into the code vector.
622
623 Arguments:
624 options the option bits
625 brackets points to number of brackets used
626 code points to the pointer to the current code point
627 ptrptr points to the current pattern pointer
628 errorptr points to pointer to error message
629 optchanged set to the value of the last OP_OPT item compiled
630 cd contains pointers to tables
631
632 Returns: TRUE on success
633 FALSE, with *errorptr set on error
634 */
635
636 static BOOL
637 compile_branch(int options, int *brackets, uschar **codeptr,
638 const uschar **ptrptr, const char **errorptr, int *optchanged,
639 compile_data *cd)
640 {
641 int repeat_type, op_type;
642 int repeat_min, repeat_max;
643 int bravalue, length;
644 int greedy_default, greedy_non_default;
645 register int c;
646 register uschar *code = *codeptr;
647 uschar *tempcode;
648 const uschar *ptr = *ptrptr;
649 const uschar *tempptr;
650 uschar *previous = NULL;
651 uschar class[32];
652
653 /* Set up the default and non-default settings for greediness */
654
655 greedy_default = ((options & PCRE_UNGREEDY) != 0);
656 greedy_non_default = greedy_default ^ 1;
657
658 /* Switch on next character until the end of the branch */
659
660 for (;; ptr++)
661 {
662 BOOL negate_class;
663 int class_charcount;
664 int class_lastchar;
665 int newoptions;
666 int condref;
667
668 c = *ptr;
669 if ((options & PCRE_EXTENDED) != 0)
670 {
671 if ((cd->ctypes[c] & ctype_space) != 0) continue;
672 if (c == '#')
673 {
674 while ((c = *(++ptr)) != 0 && c != '\n');
675 continue;
676 }
677 }
678
679 switch(c)
680 {
681 /* The branch terminates at end of string, |, or ). */
682
683 case 0:
684 case '|':
685 case ')':
686 *codeptr = code;
687 *ptrptr = ptr;
688 return TRUE;
689
690 /* Handle single-character metacharacters */
691
692 case '^':
693 previous = NULL;
694 *code++ = OP_CIRC;
695 break;
696
697 case '$':
698 previous = NULL;
699 *code++ = OP_DOLL;
700 break;
701
702 case '.':
703 previous = code;
704 *code++ = OP_ANY;
705 break;
706
707 /* Character classes. These always build a 32-byte bitmap of the permitted
708 characters, except in the special case where there is only one character.
709 For negated classes, we build the map as usual, then invert it at the end.
710 */
711
712 case '[':
713 previous = code;
714 *code++ = OP_CLASS;
715
716 /* If the first character is '^', set the negation flag and skip it. */
717
718 if ((c = *(++ptr)) == '^')
719 {
720 negate_class = TRUE;
721 c = *(++ptr);
722 }
723 else negate_class = FALSE;
724
725 /* Keep a count of chars so that we can optimize the case of just a single
726 character. */
727
728 class_charcount = 0;
729 class_lastchar = -1;
730
731 /* Initialize the 32-char bit map to all zeros. We have to build the
732 map in a temporary bit of store, in case the class contains only 1
733 character, because in that case the compiled code doesn't use the
734 bit map. */
735
736 memset(class, 0, 32 * sizeof(uschar));
737
738 /* Process characters until ] is reached. By writing this as a "do" it
739 means that an initial ] is taken as a data character. */
740
741 do
742 {
743 if (c == 0)
744 {
745 *errorptr = ERR6;
746 goto FAILED;
747 }
748
749 /* Backslash may introduce a single character, or it may introduce one
750 of the specials, which just set a flag. Escaped items are checked for
751 validity in the pre-compiling pass. The sequence \b is a special case.
752 Inside a class (and only there) it is treated as backspace. Elsewhere
753 it marks a word boundary. Other escapes have preset maps ready to
754 or into the one we are building. We assume they have more than one
755 character in them, so set class_count bigger than one. */
756
757 if (c == '\\')
758 {
759 c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
760 if (-c == ESC_b) c = '\b';
761 else if (c < 0)
762 {
763 register const uschar *cbits = cd->cbits;
764 class_charcount = 10;
765 switch (-c)
766 {
767 case ESC_d:
768 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
769 continue;
770
771 case ESC_D:
772 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
773 continue;
774
775 case ESC_w:
776 for (c = 0; c < 32; c++)
777 class[c] |= (cbits[c+cbit_digit] | cbits[c+cbit_word]);
778 continue;
779
780 case ESC_W:
781 for (c = 0; c < 32; c++)
782 class[c] |= ~(cbits[c+cbit_digit] | cbits[c+cbit_word]);
783 continue;
784
785 case ESC_s:
786 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
787 continue;
788
789 case ESC_S:
790 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
791 continue;
792
793 default:
794 *errorptr = ERR7;
795 goto FAILED;
796 }
797 }
798 /* Fall through if single character */
799 }
800
801 /* A single character may be followed by '-' to form a range. However,
802 Perl does not permit ']' to be the end of the range. A '-' character
803 here is treated as a literal. */
804
805 if (ptr[1] == '-' && ptr[2] != ']')
806 {
807 int d;
808 ptr += 2;
809 d = *ptr;
810
811 if (d == 0)
812 {
813 *errorptr = ERR6;
814 goto FAILED;
815 }
816
817 /* The second part of a range can be a single-character escape, but
818 not any of the other escapes. */
819
820 if (d == '\\')
821 {
822 d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
823 if (d < 0)
824 {
825 if (d == -ESC_b) d = '\b'; else
826 {
827 *errorptr = ERR7;
828 goto FAILED;
829 }
830 }
831 }
832
833 if (d < c)
834 {
835 *errorptr = ERR8;
836 goto FAILED;
837 }
838
839 for (; c <= d; c++)
840 {
841 class[c/8] |= (1 << (c&7));
842 if ((options & PCRE_CASELESS) != 0)
843 {
844 int uc = cd->fcc[c]; /* flip case */
845 class[uc/8] |= (1 << (uc&7));
846 }
847 class_charcount++; /* in case a one-char range */
848 class_lastchar = c;
849 }
850 continue; /* Go get the next char in the class */
851 }
852
853 /* Handle a lone single character - we can get here for a normal
854 non-escape char, or after \ that introduces a single character. */
855
856 class [c/8] |= (1 << (c&7));
857 if ((options & PCRE_CASELESS) != 0)
858 {
859 c = cd->fcc[c]; /* flip case */
860 class[c/8] |= (1 << (c&7));
861 }
862 class_charcount++;
863 class_lastchar = c;
864 }
865
866 /* Loop until ']' reached; the check for end of string happens inside the
867 loop. This "while" is the end of the "do" above. */
868
869 while ((c = *(++ptr)) != ']');
870
871 /* If class_charcount is 1 and class_lastchar is not negative, we saw
872 precisely one character. This doesn't need the whole 32-byte bit map.
873 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
874 it's negative. */
875
876 if (class_charcount == 1 && class_lastchar >= 0)
877 {
878 if (negate_class)
879 {
880 code[-1] = OP_NOT;
881 }
882 else
883 {
884 code[-1] = OP_CHARS;
885 *code++ = 1;
886 }
887 *code++ = class_lastchar;
888 }
889
890 /* Otherwise, negate the 32-byte map if necessary, and copy it into
891 the code vector. */
892
893 else
894 {
895 if (negate_class)
896 for (c = 0; c < 32; c++) code[c] = ~class[c];
897 else
898 memcpy(code, class, 32);
899 code += 32;
900 }
901 break;
902
903 /* Various kinds of repeat */
904
905 case '{':
906 if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
907 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
908 if (*errorptr != NULL) goto FAILED;
909 goto REPEAT;
910
911 case '*':
912 repeat_min = 0;
913 repeat_max = -1;
914 goto REPEAT;
915
916 case '+':
917 repeat_min = 1;
918 repeat_max = -1;
919 goto REPEAT;
920
921 case '?':
922 repeat_min = 0;
923 repeat_max = 1;
924
925 REPEAT:
926 if (previous == NULL)
927 {
928 *errorptr = ERR9;
929 goto FAILED;
930 }
931
932 /* If the next character is '?' this is a minimizing repeat, by default,
933 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
934 next character. */
935
936 if (ptr[1] == '?')
937 { repeat_type = greedy_non_default; ptr++; }
938 else repeat_type = greedy_default;
939
940 /* If the maximum is zero then the minimum must also be zero; Perl allows
941 this case, so we do too - by simply omitting the item altogether. */
942
943 if (repeat_max == 0) code = previous;
944
945 /* If previous was a string of characters, chop off the last one and use it
946 as the subject of the repeat. If there was only one character, we can
947 abolish the previous item altogether. */
948
949 else if (*previous == OP_CHARS)
950 {
951 int len = previous[1];
952 if (len == 1)
953 {
954 c = previous[2];
955 code = previous;
956 }
957 else
958 {
959 c = previous[len+1];
960 previous[1]--;
961 code--;
962 }
963 op_type = 0; /* Use single-char op codes */
964 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
965 }
966
967 /* If previous was a single negated character ([^a] or similar), we use
968 one of the special opcodes, replacing it. The code is shared with single-
969 character repeats by adding a suitable offset into repeat_type. */
970
971 else if ((int)*previous == OP_NOT)
972 {
973 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
974 c = previous[1];
975 code = previous;
976 goto OUTPUT_SINGLE_REPEAT;
977 }
978
979 /* If previous was a character type match (\d or similar), abolish it and
980 create a suitable repeat item. The code is shared with single-character
981 repeats by adding a suitable offset into repeat_type. */
982
983 else if ((int)*previous < OP_EODN || *previous == OP_ANY)
984 {
985 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
986 c = *previous;
987 code = previous;
988
989 OUTPUT_SINGLE_REPEAT:
990 repeat_type += op_type; /* Combine both values for many cases */
991
992 /* A minimum of zero is handled either as the special case * or ?, or as
993 an UPTO, with the maximum given. */
994
995 if (repeat_min == 0)
996 {
997 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
998 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
999 else
1000 {
1001 *code++ = OP_UPTO + repeat_type;
1002 *code++ = repeat_max >> 8;
1003 *code++ = (repeat_max & 255);
1004 }
1005 }
1006
1007 /* The case {1,} is handled as the special case + */
1008
1009 else if (repeat_min == 1 && repeat_max == -1)
1010 *code++ = OP_PLUS + repeat_type;
1011
1012 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1013 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1014
1015 else
1016 {
1017 if (repeat_min != 1)
1018 {
1019 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1020 *code++ = repeat_min >> 8;
1021 *code++ = (repeat_min & 255);
1022 }
1023
1024 /* If the mininum is 1 and the previous item was a character string,
1025 we either have to put back the item that got cancelled if the string
1026 length was 1, or add the character back onto the end of a longer
1027 string. For a character type nothing need be done; it will just get
1028 put back naturally. Note that the final character is always going to
1029 get added below. */
1030
1031 else if (*previous == OP_CHARS)
1032 {
1033 if (code == previous) code += 2; else previous[1]++;
1034 }
1035
1036 /* For a single negated character we also have to put back the
1037 item that got cancelled. */
1038
1039 else if (*previous == OP_NOT) code++;
1040
1041 /* If the maximum is unlimited, insert an OP_STAR. */
1042
1043 if (repeat_max < 0)
1044 {
1045 *code++ = c;
1046 *code++ = OP_STAR + repeat_type;
1047 }
1048
1049 /* Else insert an UPTO if the max is greater than the min. */
1050
1051 else if (repeat_max != repeat_min)
1052 {
1053 *code++ = c;
1054 repeat_max -= repeat_min;
1055 *code++ = OP_UPTO + repeat_type;
1056 *code++ = repeat_max >> 8;
1057 *code++ = (repeat_max & 255);
1058 }
1059 }
1060
1061 /* The character or character type itself comes last in all cases. */
1062
1063 *code++ = c;
1064 }
1065
1066 /* If previous was a character class or a back reference, we put the repeat
1067 stuff after it. */
1068
1069 else if (*previous == OP_CLASS || *previous == OP_REF)
1070 {
1071 if (repeat_min == 0 && repeat_max == -1)
1072 *code++ = OP_CRSTAR + repeat_type;
1073 else if (repeat_min == 1 && repeat_max == -1)
1074 *code++ = OP_CRPLUS + repeat_type;
1075 else if (repeat_min == 0 && repeat_max == 1)
1076 *code++ = OP_CRQUERY + repeat_type;
1077 else
1078 {
1079 *code++ = OP_CRRANGE + repeat_type;
1080 *code++ = repeat_min >> 8;
1081 *code++ = repeat_min & 255;
1082 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1083 *code++ = repeat_max >> 8;
1084 *code++ = repeat_max & 255;
1085 }
1086 }
1087
1088 /* If previous was a bracket group, we may have to replicate it in certain
1089 cases. */
1090
1091 else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1092 (int)*previous == OP_COND)
1093 {
1094 register int i;
1095 int ketoffset = 0;
1096 int len = code - previous;
1097 uschar *bralink = NULL;
1098
1099 /* If the maximum repeat count is unlimited, find the end of the bracket
1100 by scanning through from the start, and compute the offset back to it
1101 from the current code pointer. There may be an OP_OPT setting following
1102 the final KET, so we can't find the end just by going back from the code
1103 pointer. */
1104
1105 if (repeat_max == -1)
1106 {
1107 register uschar *ket = previous;
1108 do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1109 ketoffset = code - ket;
1110 }
1111
1112 /* The case of a zero minimum is special because of the need to stick
1113 OP_BRAZERO in front of it, and because the group appears once in the
1114 data, whereas in other cases it appears the minimum number of times. For
1115 this reason, it is simplest to treat this case separately, as otherwise
1116 the code gets far too mess. There are several special subcases when the
1117 minimum is zero. */
1118
1119 if (repeat_min == 0)
1120 {
1121 /* If the maximum is also zero, we just omit the group from the output
1122 altogether. */
1123
1124 if (repeat_max == 0)
1125 {
1126 code = previous;
1127 previous = NULL;
1128 break;
1129 }
1130
1131 /* If the maximum is 1 or unlimited, we just have to stick in the
1132 BRAZERO and do no more at this point. */
1133
1134 if (repeat_max <= 1)
1135 {
1136 memmove(previous+1, previous, len);
1137 code++;
1138 *previous++ = OP_BRAZERO + repeat_type;
1139 }
1140
1141 /* If the maximum is greater than 1 and limited, we have to replicate
1142 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1143 The first one has to be handled carefully because it's the original
1144 copy, which has to be moved up. The remainder can be handled by code
1145 that is common with the non-zero minimum case below. We just have to
1146 adjust the value or repeat_max, since one less copy is required. */
1147
1148 else
1149 {
1150 int offset;
1151 memmove(previous+4, previous, len);
1152 code += 4;
1153 *previous++ = OP_BRAZERO + repeat_type;
1154 *previous++ = OP_BRA;
1155
1156 /* We chain together the bracket offset fields that have to be
1157 filled in later when the ends of the brackets are reached. */
1158
1159 offset = (bralink == NULL)? 0 : previous - bralink;
1160 bralink = previous;
1161 *previous++ = offset >> 8;
1162 *previous++ = offset & 255;
1163 }
1164
1165 repeat_max--;
1166 }
1167
1168 /* If the minimum is greater than zero, replicate the group as many
1169 times as necessary, and adjust the maximum to the number of subsequent
1170 copies that we need. */
1171
1172 else
1173 {
1174 for (i = 1; i < repeat_min; i++)
1175 {
1176 memcpy(code, previous, len);
1177 code += len;
1178 }
1179 if (repeat_max > 0) repeat_max -= repeat_min;
1180 }
1181
1182 /* This code is common to both the zero and non-zero minimum cases. If
1183 the maximum is limited, it replicates the group in a nested fashion,
1184 remembering the bracket starts on a stack. In the case of a zero minimum,
1185 the first one was set up above. In all cases the repeat_max now specifies
1186 the number of additional copies needed. */
1187
1188 if (repeat_max >= 0)
1189 {
1190 for (i = repeat_max - 1; i >= 0; i--)
1191 {
1192 *code++ = OP_BRAZERO + repeat_type;
1193
1194 /* All but the final copy start a new nesting, maintaining the
1195 chain of brackets outstanding. */
1196
1197 if (i != 0)
1198 {
1199 int offset;
1200 *code++ = OP_BRA;
1201 offset = (bralink == NULL)? 0 : code - bralink;
1202 bralink = code;
1203 *code++ = offset >> 8;
1204 *code++ = offset & 255;
1205 }
1206
1207 memcpy(code, previous, len);
1208 code += len;
1209 }
1210
1211 /* Now chain through the pending brackets, and fill in their length
1212 fields (which are holding the chain links pro tem). */
1213
1214 while (bralink != NULL)
1215 {
1216 int oldlinkoffset;
1217 int offset = code - bralink + 1;
1218 uschar *bra = code - offset;
1219 oldlinkoffset = (bra[1] << 8) + bra[2];
1220 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1221 *code++ = OP_KET;
1222 *code++ = bra[1] = offset >> 8;
1223 *code++ = bra[2] = (offset & 255);
1224 }
1225 }
1226
1227 /* If the maximum is unlimited, set a repeater in the final copy. We
1228 can't just offset backwards from the current code point, because we
1229 don't know if there's been an options resetting after the ket. The
1230 correct offset was computed above. */
1231
1232 else code[-ketoffset] = OP_KETRMAX + repeat_type;
1233
1234
1235 #ifdef NEVER
1236 /* If the minimum is greater than zero, and the maximum is unlimited or
1237 equal to the minimum, the first copy remains where it is, and is
1238 replicated up to the minimum number of times. This case includes the +
1239 repeat, but of course no replication is needed in that case. */
1240
1241 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1242 {
1243 for (i = 1; i < repeat_min; i++)
1244 {
1245 memcpy(code, previous, len);
1246 code += len;
1247 }
1248 }
1249
1250 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1251 Then, if there is a fixed upper limit, replicated up to that many times,
1252 sticking BRAZERO in front of all the optional ones. */
1253
1254 else
1255 {
1256 if (repeat_min == 0)
1257 {
1258 memmove(previous+1, previous, len);
1259 code++;
1260 *previous++ = OP_BRAZERO + repeat_type;
1261 }
1262
1263 for (i = 1; i < repeat_min; i++)
1264 {
1265 memcpy(code, previous, len);
1266 code += len;
1267 }
1268
1269 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1270 {
1271 *code++ = OP_BRAZERO + repeat_type;
1272 memcpy(code, previous, len);
1273 code += len;
1274 }
1275 }
1276
1277 /* If the maximum is unlimited, set a repeater in the final copy. We
1278 can't just offset backwards from the current code point, because we
1279 don't know if there's been an options resetting after the ket. The
1280 correct offset was computed above. */
1281
1282 if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type;
1283 #endif
1284
1285
1286 }
1287
1288 /* Else there's some kind of shambles */
1289
1290 else
1291 {
1292 *errorptr = ERR11;
1293 goto FAILED;
1294 }
1295
1296 /* In all case we no longer have a previous item. */
1297
1298 previous = NULL;
1299 break;
1300
1301
1302 /* Start of nested bracket sub-expression, or comment or lookahead or
1303 lookbehind or option setting or condition. First deal with special things
1304 that can come after a bracket; all are introduced by ?, and the appearance
1305 of any of them means that this is not a referencing group. They were
1306 checked for validity in the first pass over the string, so we don't have to
1307 check for syntax errors here. */
1308
1309 case '(':
1310 newoptions = options;
1311 condref = -1;
1312
1313 if (*(++ptr) == '?')
1314 {
1315 int set, unset;
1316 int *optset;
1317
1318 switch (*(++ptr))
1319 {
1320 case '#': /* Comment; skip to ket */
1321 ptr++;
1322 while (*ptr != ')') ptr++;
1323 continue;
1324
1325 case ':': /* Non-extracting bracket */
1326 bravalue = OP_BRA;
1327 ptr++;
1328 break;
1329
1330 case '(':
1331 bravalue = OP_COND; /* Conditional group */
1332 if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1333 {
1334 condref = *ptr - '0';
1335 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1336 ptr++;
1337 }
1338 else ptr--;
1339 break;
1340
1341 case '=': /* Positive lookahead */
1342 bravalue = OP_ASSERT;
1343 ptr++;
1344 break;
1345
1346 case '!': /* Negative lookahead */
1347 bravalue = OP_ASSERT_NOT;
1348 ptr++;
1349 break;
1350
1351 case '<': /* Lookbehinds */
1352 switch (*(++ptr))
1353 {
1354 case '=': /* Positive lookbehind */
1355 bravalue = OP_ASSERTBACK;
1356 ptr++;
1357 break;
1358
1359 case '!': /* Negative lookbehind */
1360 bravalue = OP_ASSERTBACK_NOT;
1361 ptr++;
1362 break;
1363
1364 default: /* Syntax error */
1365 *errorptr = ERR24;
1366 goto FAILED;
1367 }
1368 break;
1369
1370 case '>': /* One-time brackets */
1371 bravalue = OP_ONCE;
1372 ptr++;
1373 break;
1374
1375 default: /* Option setting */
1376 set = unset = 0;
1377 optset = &set;
1378
1379 while (*ptr != ')' && *ptr != ':')
1380 {
1381 switch (*ptr++)
1382 {
1383 case '-': optset = &unset; break;
1384
1385 case 'i': *optset |= PCRE_CASELESS; break;
1386 case 'm': *optset |= PCRE_MULTILINE; break;
1387 case 's': *optset |= PCRE_DOTALL; break;
1388 case 'x': *optset |= PCRE_EXTENDED; break;
1389 case 'U': *optset |= PCRE_UNGREEDY; break;
1390 case 'X': *optset |= PCRE_EXTRA; break;
1391
1392 default:
1393 *errorptr = ERR12;
1394 goto FAILED;
1395 }
1396 }
1397
1398 /* Set up the changed option bits, but don't change anything yet. */
1399
1400 newoptions = (options | set) & (~unset);
1401
1402 /* If the options ended with ')' this is not the start of a nested
1403 group with option changes, so the options change at this level. At top
1404 level there is nothing else to be done (the options will in fact have
1405 been set from the start of compiling as a result of the first pass) but
1406 at an inner level we must compile code to change the ims options if
1407 necessary, and pass the new setting back so that it can be put at the
1408 start of any following branches, and when this group ends, a resetting
1409 item can be compiled. */
1410
1411 if (*ptr == ')')
1412 {
1413 if ((options & PCRE_INGROUP) != 0 &&
1414 (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1415 {
1416 *code++ = OP_OPT;
1417 *code++ = *optchanged = newoptions & PCRE_IMS;
1418 }
1419 options = newoptions; /* Change options at this level */
1420 previous = NULL; /* This item can't be repeated */
1421 continue; /* It is complete */
1422 }
1423
1424 /* If the options ended with ':' we are heading into a nested group
1425 with possible change of options. Such groups are non-capturing and are
1426 not assertions of any kind. All we need to do is skip over the ':';
1427 the newoptions value is handled below. */
1428
1429 bravalue = OP_BRA;
1430 ptr++;
1431 }
1432 }
1433
1434 /* Else we have a referencing group; adjust the opcode. */
1435
1436 else
1437 {
1438 if (++(*brackets) > EXTRACT_MAX)
1439 {
1440 *errorptr = ERR13;
1441 goto FAILED;
1442 }
1443 bravalue = OP_BRA + *brackets;
1444 }
1445
1446 /* Process nested bracketed re. Assertions may not be repeated, but other
1447 kinds can be. We copy code into a non-register variable in order to be able
1448 to pass its address because some compilers complain otherwise. Pass in a
1449 new setting for the ims options if they have changed. */
1450
1451 previous = (bravalue >= OP_ONCE)? code : NULL;
1452 *code = bravalue;
1453 tempcode = code;
1454
1455 if (!compile_regex(
1456 options | PCRE_INGROUP, /* Set for all nested groups */
1457 ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1458 newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1459 brackets, /* Bracket level */
1460 &tempcode, /* Where to put code (updated) */
1461 &ptr, /* Input pointer (updated) */
1462 errorptr, /* Where to put an error message */
1463 (bravalue == OP_ASSERTBACK ||
1464 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1465 condref, /* Condition reference number */
1466 cd)) /* Tables block */
1467 goto FAILED;
1468
1469 /* At the end of compiling, code is still pointing to the start of the
1470 group, while tempcode has been updated to point past the end of the group
1471 and any option resetting that may follow it. The pattern pointer (ptr)
1472 is on the bracket. */
1473
1474 /* If this is a conditional bracket, check that there are no more than
1475 two branches in the group. */
1476
1477 if (bravalue == OP_COND)
1478 {
1479 int branchcount = 0;
1480 uschar *tc = code;
1481
1482 do {
1483 branchcount++;
1484 tc += (tc[1] << 8) | tc[2];
1485 }
1486 while (*tc != OP_KET);
1487
1488 if (branchcount > 2)
1489 {
1490 *errorptr = ERR27;
1491 goto FAILED;
1492 }
1493 }
1494
1495 /* Now update the main code pointer to the end of the group. */
1496
1497 code = tempcode;
1498
1499 /* Error if hit end of pattern */
1500
1501 if (*ptr != ')')
1502 {
1503 *errorptr = ERR14;
1504 goto FAILED;
1505 }
1506 break;
1507
1508 /* Check \ for being a real metacharacter; if not, fall through and handle
1509 it as a data character at the start of a string. Escape items are checked
1510 for validity in the pre-compiling pass. */
1511
1512 case '\\':
1513 tempptr = ptr;
1514 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1515
1516 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1517 are arranged to be the negation of the corresponding OP_values. For the
1518 back references, the values are ESC_REF plus the reference number. Only
1519 back references and those types that consume a character may be repeated.
1520 We can test for values between ESC_b and ESC_Z for the latter; this may
1521 have to change if any new ones are ever created. */
1522
1523 if (c < 0)
1524 {
1525 if (-c >= ESC_REF)
1526 {
1527 previous = code;
1528 *code++ = OP_REF;
1529 *code++ = -c - ESC_REF;
1530 }
1531 else
1532 {
1533 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1534 *code++ = -c;
1535 }
1536 continue;
1537 }
1538
1539 /* Data character: reset and fall through */
1540
1541 ptr = tempptr;
1542 c = '\\';
1543
1544 /* Handle a run of data characters until a metacharacter is encountered.
1545 The first character is guaranteed not to be whitespace or # when the
1546 extended flag is set. */
1547
1548 NORMAL_CHAR:
1549 default:
1550 previous = code;
1551 *code = OP_CHARS;
1552 code += 2;
1553 length = 0;
1554
1555 do
1556 {
1557 if ((options & PCRE_EXTENDED) != 0)
1558 {
1559 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1560 if (c == '#')
1561 {
1562 while ((c = *(++ptr)) != 0 && c != '\n');
1563 if (c == 0) break;
1564 continue;
1565 }
1566 }
1567
1568 /* Backslash may introduce a data char or a metacharacter. Escaped items
1569 are checked for validity in the pre-compiling pass. Stop the string
1570 before a metaitem. */
1571
1572 if (c == '\\')
1573 {
1574 tempptr = ptr;
1575 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1576 if (c < 0) { ptr = tempptr; break; }
1577 }
1578
1579 /* Ordinary character or single-char escape */
1580
1581 *code++ = c;
1582 length++;
1583 }
1584
1585 /* This "while" is the end of the "do" above. */
1586
1587 while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
1588
1589 /* Compute the length and set it in the data vector, and advance to
1590 the next state. */
1591
1592 previous[1] = length;
1593 if (length < 255) ptr--;
1594 break;
1595 }
1596 } /* end of big loop */
1597
1598 /* Control never reaches here by falling through, only by a goto for all the
1599 error states. Pass back the position in the pattern so that it can be displayed
1600 to the user for diagnosing the error. */
1601
1602 FAILED:
1603 *ptrptr = ptr;
1604 return FALSE;
1605 }
1606
1607
1608
1609
1610 /*************************************************
1611 * Compile sequence of alternatives *
1612 *************************************************/
1613
1614 /* On entry, ptr is pointing past the bracket character, but on return
1615 it points to the closing bracket, or vertical bar, or end of string.
1616 The code variable is pointing at the byte into which the BRA operator has been
1617 stored. If the ims options are changed at the start (for a (?ims: group) or
1618 during any branch, we need to insert an OP_OPT item at the start of every
1619 following branch to ensure they get set correctly at run time, and also pass
1620 the new options into every subsequent branch compile.
1621
1622 Argument:
1623 options the option bits
1624 optchanged new ims options to set as if (?ims) were at the start, or -1
1625 for no change
1626 brackets -> int containing the number of extracting brackets used
1627 codeptr -> the address of the current code pointer
1628 ptrptr -> the address of the current pattern pointer
1629 errorptr -> pointer to error message
1630 lookbehind TRUE if this is a lookbehind assertion
1631 condref > 0 for OPT_CREF setting at start of conditional group
1632 cd points to the data block with tables pointers
1633
1634 Returns: TRUE on success
1635 */
1636
1637 static BOOL
1638 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
1639 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
1640 compile_data *cd)
1641 {
1642 const uschar *ptr = *ptrptr;
1643 uschar *code = *codeptr;
1644 uschar *last_branch = code;
1645 uschar *start_bracket = code;
1646 uschar *reverse_count = NULL;
1647 int oldoptions = options & PCRE_IMS;
1648
1649 code += 3;
1650
1651 /* At the start of a reference-based conditional group, insert the reference
1652 number as an OP_CREF item. */
1653
1654 if (condref > 0)
1655 {
1656 *code++ = OP_CREF;
1657 *code++ = condref;
1658 }
1659
1660 /* Loop for each alternative branch */
1661
1662 for (;;)
1663 {
1664 int length;
1665
1666 /* Handle change of options */
1667
1668 if (optchanged >= 0)
1669 {
1670 *code++ = OP_OPT;
1671 *code++ = optchanged;
1672 options = (options & ~PCRE_IMS) | optchanged;
1673 }
1674
1675 /* Set up dummy OP_REVERSE if lookbehind assertion */
1676
1677 if (lookbehind)
1678 {
1679 *code++ = OP_REVERSE;
1680 reverse_count = code;
1681 *code++ = 0;
1682 *code++ = 0;
1683 }
1684
1685 /* Now compile the branch */
1686
1687 if (!compile_branch(options,brackets,&code,&ptr,errorptr,&optchanged,cd))
1688 {
1689 *ptrptr = ptr;
1690 return FALSE;
1691 }
1692
1693 /* Fill in the length of the last branch */
1694
1695 length = code - last_branch;
1696 last_branch[1] = length >> 8;
1697 last_branch[2] = length & 255;
1698
1699 /* If lookbehind, check that this branch matches a fixed-length string,
1700 and put the length into the OP_REVERSE item. Temporarily mark the end of
1701 the branch with OP_END. */
1702
1703 if (lookbehind)
1704 {
1705 *code = OP_END;
1706 length = find_fixedlength(last_branch);
1707 DPRINTF(("fixed length = %d\n", length));
1708 if (length < 0)
1709 {
1710 *errorptr = ERR25;
1711 *ptrptr = ptr;
1712 return FALSE;
1713 }
1714 reverse_count[0] = (length >> 8);
1715 reverse_count[1] = length & 255;
1716 }
1717
1718 /* Reached end of expression, either ')' or end of pattern. Insert a
1719 terminating ket and the length of the whole bracketed item, and return,
1720 leaving the pointer at the terminating char. If any of the ims options
1721 were changed inside the group, compile a resetting op-code following. */
1722
1723 if (*ptr != '|')
1724 {
1725 length = code - start_bracket;
1726 *code++ = OP_KET;
1727 *code++ = length >> 8;
1728 *code++ = length & 255;
1729 if (optchanged >= 0)
1730 {
1731 *code++ = OP_OPT;
1732 *code++ = oldoptions;
1733 }
1734 *codeptr = code;
1735 *ptrptr = ptr;
1736 return TRUE;
1737 }
1738
1739 /* Another branch follows; insert an "or" node and advance the pointer. */
1740
1741 *code = OP_ALT;
1742 last_branch = code;
1743 code += 3;
1744 ptr++;
1745 }
1746 /* Control never reaches here */
1747 }
1748
1749
1750
1751
1752 /*************************************************
1753 * Find first significant op code *
1754 *************************************************/
1755
1756 /* This is called by several functions that scan a compiled expression looking
1757 for a fixed first character, or an anchoring op code etc. It skips over things
1758 that do not influence this. For one application, a change of caseless option is
1759 important.
1760
1761 Arguments:
1762 code pointer to the start of the group
1763 options pointer to external options
1764 optbit the option bit whose changing is significant, or
1765 zero if none are
1766 optstop TRUE to return on option change, otherwise change the options
1767 value and continue
1768
1769 Returns: pointer to the first significant opcode
1770 */
1771
1772 static const uschar*
1773 first_significant_code(const uschar *code, int *options, int optbit,
1774 BOOL optstop)
1775 {
1776 for (;;)
1777 {
1778 switch ((int)*code)
1779 {
1780 case OP_OPT:
1781 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1782 {
1783 if (optstop) return code;
1784 *options = (int)code[1];
1785 }
1786 code += 2;
1787 break;
1788
1789 case OP_CREF:
1790 code += 2;
1791 break;
1792
1793 case OP_ASSERT_NOT:
1794 case OP_ASSERTBACK:
1795 case OP_ASSERTBACK_NOT:
1796 do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
1797 code += 3;
1798 break;
1799
1800 default:
1801 return code;
1802 }
1803 }
1804 /* Control never reaches here */
1805 }
1806
1807
1808
1809
1810 /*************************************************
1811 * Check for anchored expression *
1812 *************************************************/
1813
1814 /* Try to find out if this is an anchored regular expression. Consider each
1815 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
1816 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
1817 it's anchored. However, if this is a multiline pattern, then only OP_SOD
1818 counts, since OP_CIRC can match in the middle.
1819
1820 A branch is also implicitly anchored if it starts with .* because that will try
1821 the rest of the pattern at all possible matching points, so there is no point
1822 trying them again.
1823
1824 Arguments:
1825 code points to start of expression (the bracket)
1826 options points to the options setting
1827
1828 Returns: TRUE or FALSE
1829 */
1830
1831 static BOOL
1832 is_anchored(register const uschar *code, int *options)
1833 {
1834 do {
1835 const uschar *scode = first_significant_code(code + 3, options,
1836 PCRE_MULTILINE, FALSE);
1837 register int op = *scode;
1838 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1839 { if (!is_anchored(scode, options)) return FALSE; }
1840 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1841 { if (scode[1] != OP_ANY) return FALSE; }
1842 else if (op != OP_SOD &&
1843 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
1844 return FALSE;
1845 code += (code[1] << 8) + code[2];
1846 }
1847 while (*code == OP_ALT);
1848 return TRUE;
1849 }
1850
1851
1852
1853 /*************************************************
1854 * Check for start with \n line expression *
1855 *************************************************/
1856
1857 /* This is called for multiline expressions to try to find out if every branch
1858 starts with ^ so that "first char" processing can be done to speed things up.
1859
1860 Argument: points to start of expression (the bracket)
1861 Returns: TRUE or FALSE
1862 */
1863
1864 static BOOL
1865 is_startline(const uschar *code)
1866 {
1867 do {
1868 const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
1869 register int op = *scode;
1870 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1871 { if (!is_startline(scode)) return FALSE; }
1872 else if (op != OP_CIRC) return FALSE;
1873 code += (code[1] << 8) + code[2];
1874 }
1875 while (*code == OP_ALT);
1876 return TRUE;
1877 }
1878
1879
1880
1881 /*************************************************
1882 * Check for fixed first char *
1883 *************************************************/
1884
1885 /* Try to find out if there is a fixed first character. This is called for
1886 unanchored expressions, as it speeds up their processing quite considerably.
1887 Consider each alternative branch. If they all start with the same char, or with
1888 a bracket all of whose alternatives start with the same char (recurse ad lib),
1889 then we return that char, otherwise -1.
1890
1891 Arguments:
1892 code points to start of expression (the bracket)
1893 options pointer to the options (used to check casing changes)
1894
1895 Returns: -1 or the fixed first char
1896 */
1897
1898 static int
1899 find_firstchar(const uschar *code, int *options)
1900 {
1901 register int c = -1;
1902 do {
1903 int d;
1904 const uschar *scode = first_significant_code(code + 3, options,
1905 PCRE_CASELESS, TRUE);
1906 register int op = *scode;
1907
1908 if (op >= OP_BRA) op = OP_BRA;
1909
1910 switch(op)
1911 {
1912 default:
1913 return -1;
1914
1915 case OP_BRA:
1916 case OP_ASSERT:
1917 case OP_ONCE:
1918 case OP_COND:
1919 if ((d = find_firstchar(scode, options)) < 0) return -1;
1920 if (c < 0) c = d; else if (c != d) return -1;
1921 break;
1922
1923 case OP_EXACT: /* Fall through */
1924 scode++;
1925
1926 case OP_CHARS: /* Fall through */
1927 scode++;
1928
1929 case OP_PLUS:
1930 case OP_MINPLUS:
1931 if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
1932 break;
1933 }
1934
1935 code += (code[1] << 8) + code[2];
1936 }
1937 while (*code == OP_ALT);
1938 return c;
1939 }
1940
1941
1942
1943
1944
1945 /*************************************************
1946 * Compile a Regular Expression *
1947 *************************************************/
1948
1949 /* This function takes a string and returns a pointer to a block of store
1950 holding a compiled version of the expression.
1951
1952 Arguments:
1953 pattern the regular expression
1954 options various option bits
1955 errorptr pointer to pointer to error text
1956 erroroffset ptr offset in pattern where error was detected
1957 tables pointer to character tables or NULL
1958
1959 Returns: pointer to compiled data block, or NULL on error,
1960 with errorptr and erroroffset set
1961 */
1962
1963 pcre *
1964 pcre_compile(const char *pattern, int options, const char **errorptr,
1965 int *erroroffset, const unsigned char *tables)
1966 {
1967 real_pcre *re;
1968 int length = 3; /* For initial BRA plus length */
1969 int runlength;
1970 int c, size;
1971 int bracount = 0;
1972 int top_backref = 0;
1973 int branch_extra = 0;
1974 int branch_newextra;
1975 unsigned int brastackptr = 0;
1976 uschar *code;
1977 const uschar *ptr;
1978 compile_data compile_block;
1979 int brastack[BRASTACK_SIZE];
1980 uschar bralenstack[BRASTACK_SIZE];
1981
1982 #ifdef DEBUG
1983 uschar *code_base, *code_end;
1984 #endif
1985
1986 /* We can't pass back an error message if errorptr is NULL; I guess the best we
1987 can do is just return NULL. */
1988
1989 if (errorptr == NULL) return NULL;
1990 *errorptr = NULL;
1991
1992 /* However, we can give a message for this error */
1993
1994 if (erroroffset == NULL)
1995 {
1996 *errorptr = ERR16;
1997 return NULL;
1998 }
1999 *erroroffset = 0;
2000
2001 if ((options & ~PUBLIC_OPTIONS) != 0)
2002 {
2003 *errorptr = ERR17;
2004 return NULL;
2005 }
2006
2007 /* Set up pointers to the individual character tables */
2008
2009 if (tables == NULL) tables = pcre_default_tables;
2010 compile_block.lcc = tables + lcc_offset;
2011 compile_block.fcc = tables + fcc_offset;
2012 compile_block.cbits = tables + cbits_offset;
2013 compile_block.ctypes = tables + ctypes_offset;
2014
2015 /* Reflect pattern for debugging output */
2016
2017 DPRINTF(("------------------------------------------------------------------\n"));
2018 DPRINTF(("%s\n", pattern));
2019
2020 /* The first thing to do is to make a pass over the pattern to compute the
2021 amount of store required to hold the compiled code. This does not have to be
2022 perfect as long as errors are overestimates. At the same time we can detect any
2023 internal flag settings. Make an attempt to correct for any counted white space
2024 if an "extended" flag setting appears late in the pattern. We can't be so
2025 clever for #-comments. */
2026
2027 ptr = (const uschar *)(pattern - 1);
2028 while ((c = *(++ptr)) != 0)
2029 {
2030 int min, max;
2031 int class_charcount;
2032
2033 if ((options & PCRE_EXTENDED) != 0)
2034 {
2035 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2036 if (c == '#')
2037 {
2038 while ((c = *(++ptr)) != 0 && c != '\n');
2039 continue;
2040 }
2041 }
2042
2043 switch(c)
2044 {
2045 /* A backslashed item may be an escaped "normal" character or a
2046 character type. For a "normal" character, put the pointers and
2047 character back so that tests for whitespace etc. in the input
2048 are done correctly. */
2049
2050 case '\\':
2051 {
2052 const uschar *save_ptr = ptr;
2053 c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2054 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2055 if (c >= 0)
2056 {
2057 ptr = save_ptr;
2058 c = '\\';
2059 goto NORMAL_CHAR;
2060 }
2061 }
2062 length++;
2063
2064 /* A back reference needs an additional char, plus either one or 5
2065 bytes for a repeat. We also need to keep the value of the highest
2066 back reference. */
2067
2068 if (c <= -ESC_REF)
2069 {
2070 int refnum = -c - ESC_REF;
2071 if (refnum > top_backref) top_backref = refnum;
2072 length++; /* For single back reference */
2073 if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2074 {
2075 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2076 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2077 if ((min == 0 && (max == 1 || max == -1)) ||
2078 (min == 1 && max == -1))
2079 length++;
2080 else length += 5;
2081 if (ptr[1] == '?') ptr++;
2082 }
2083 }
2084 continue;
2085
2086 case '^':
2087 case '.':
2088 case '$':
2089 case '*': /* These repeats won't be after brackets; */
2090 case '+': /* those are handled separately */
2091 case '?':
2092 length++;
2093 continue;
2094
2095 /* This covers the cases of repeats after a single char, metachar, class,
2096 or back reference. */
2097
2098 case '{':
2099 if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2100 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2101 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2102 if ((min == 0 && (max == 1 || max == -1)) ||
2103 (min == 1 && max == -1))
2104 length++;
2105 else
2106 {
2107 length--; /* Uncount the original char or metachar */
2108 if (min == 1) length++; else if (min > 0) length += 4;
2109 if (max > 0) length += 4; else length += 2;
2110 }
2111 if (ptr[1] == '?') ptr++;
2112 continue;
2113
2114 /* An alternation contains an offset to the next branch or ket. If any ims
2115 options changed in the previous branch(es), and/or if we are in a
2116 lookbehind assertion, extra space will be needed at the start of the
2117 branch. This is handled by branch_extra. */
2118
2119 case '|':
2120 length += 3 + branch_extra;
2121 continue;
2122
2123 /* A character class uses 33 characters. Don't worry about character types
2124 that aren't allowed in classes - they'll get picked up during the compile.
2125 A character class that contains only one character uses 2 or 3 bytes,
2126 depending on whether it is negated or not. Notice this where we can. */
2127
2128 case '[':
2129 class_charcount = 0;
2130 if (*(++ptr) == '^') ptr++;
2131 do
2132 {
2133 if (*ptr == '\\')
2134 {
2135 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2136 &compile_block);
2137 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2138 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2139 }
2140 else class_charcount++;
2141 ptr++;
2142 }
2143 while (*ptr != 0 && *ptr != ']');
2144
2145 /* Repeats for negated single chars are handled by the general code */
2146
2147 if (class_charcount == 1) length += 3; else
2148 {
2149 length += 33;
2150
2151 /* A repeat needs either 1 or 5 bytes. */
2152
2153 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2154 {
2155 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2156 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2157 if ((min == 0 && (max == 1 || max == -1)) ||
2158 (min == 1 && max == -1))
2159 length++;
2160 else length += 5;
2161 if (ptr[1] == '?') ptr++;
2162 }
2163 }
2164 continue;
2165
2166 /* Brackets may be genuine groups or special things */
2167
2168 case '(':
2169 branch_newextra = 0;
2170
2171 /* Handle special forms of bracket, which all start (? */
2172
2173 if (ptr[1] == '?')
2174 {
2175 int set, unset;
2176 int *optset;
2177
2178 switch (c = ptr[2])
2179 {
2180 /* Skip over comments entirely */
2181 case '#':
2182 ptr += 3;
2183 while (*ptr != 0 && *ptr != ')') ptr++;
2184 if (*ptr == 0)
2185 {
2186 *errorptr = ERR18;
2187 goto PCRE_ERROR_RETURN;
2188 }
2189 continue;
2190
2191 /* Non-referencing groups and lookaheads just move the pointer on, and
2192 then behave like a non-special bracket, except that they don't increment
2193 the count of extracting brackets. Ditto for the "once only" bracket,
2194 which is in Perl from version 5.005. */
2195
2196 case ':':
2197 case '=':
2198 case '!':
2199 case '>':
2200 ptr += 2;
2201 break;
2202
2203 /* Lookbehinds are in Perl from version 5.005 */
2204
2205 case '<':
2206 if (ptr[3] == '=' || ptr[3] == '!')
2207 {
2208 ptr += 3;
2209 branch_newextra = 3;
2210 length += 3; /* For the first branch */
2211 break;
2212 }
2213 *errorptr = ERR24;
2214 goto PCRE_ERROR_RETURN;
2215
2216 /* Conditionals are in Perl from version 5.005. The bracket must either
2217 be followed by a number (for bracket reference) or by an assertion
2218 group. */
2219
2220 case '(':
2221 if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2222 {
2223 ptr += 4;
2224 length += 2;
2225 while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2226 if (*ptr != ')')
2227 {
2228 *errorptr = ERR26;
2229 goto PCRE_ERROR_RETURN;
2230 }
2231 }
2232 else /* An assertion must follow */
2233 {
2234 ptr++; /* Can treat like ':' as far as spacing is concerned */
2235
2236 if (ptr[2] != '?' || strchr("=!<", ptr[3]) == NULL)
2237 {
2238 ptr += 2; /* To get right offset in message */
2239 *errorptr = ERR28;
2240 goto PCRE_ERROR_RETURN;
2241 }
2242 }
2243 break;
2244
2245 /* Else loop checking valid options until ) is met. Anything else is an
2246 error. If we are without any brackets, i.e. at top level, the settings
2247 act as if specified in the options, so massage the options immediately.
2248 This is for backward compatibility with Perl 5.004. */
2249
2250 default:
2251 set = unset = 0;
2252 optset = &set;
2253 ptr += 2;
2254
2255 for (;; ptr++)
2256 {
2257 c = *ptr;
2258 switch (c)
2259 {
2260 case 'i':
2261 *optset |= PCRE_CASELESS;
2262 continue;
2263
2264 case 'm':
2265 *optset |= PCRE_MULTILINE;
2266 continue;
2267
2268 case 's':
2269 *optset |= PCRE_DOTALL;
2270 continue;
2271
2272 case 'x':
2273 *optset |= PCRE_EXTENDED;
2274 continue;
2275
2276 case 'X':
2277 *optset |= PCRE_EXTRA;
2278 continue;
2279
2280 case 'U':
2281 *optset |= PCRE_UNGREEDY;
2282 continue;
2283
2284 case '-':
2285 optset = &unset;
2286 continue;
2287
2288 /* A termination by ')' indicates an options-setting-only item;
2289 this is global at top level; otherwise nothing is done here and
2290 it is handled during the compiling process on a per-bracket-group
2291 basis. */
2292
2293 case ')':
2294 if (brastackptr == 0)
2295 {
2296 options = (options | set) & (~unset);
2297 set = unset = 0; /* To save length */
2298 }
2299 /* Fall through */
2300
2301 /* A termination by ':' indicates the start of a nested group with
2302 the given options set. This is again handled at compile time, but
2303 we must allow for compiled space if any of the ims options are
2304 set. We also have to allow for resetting space at the end of
2305 the group, which is why 4 is added to the length and not just 2.
2306 If there are several changes of options within the same group, this
2307 will lead to an over-estimate on the length, but this shouldn't
2308 matter very much. We also have to allow for resetting options at
2309 the start of any alternations, which we do by setting
2310 branch_newextra to 2. */
2311
2312 case ':':
2313 if (((set|unset) & PCRE_IMS) != 0)
2314 {
2315 length += 4;
2316 branch_newextra = 2;
2317 }
2318 goto END_OPTIONS;
2319
2320 /* Unrecognized option character */
2321
2322 default:
2323 *errorptr = ERR12;
2324 goto PCRE_ERROR_RETURN;
2325 }
2326 }
2327
2328 /* If we hit a closing bracket, that's it - this is a freestanding
2329 option-setting. We need to ensure that branch_extra is updated if
2330 necessary. The only values branch_newextra can have here are 0 or 2.
2331 If the value is 2, then branch_extra must either be 2 or 5, depending
2332 on whether this is a lookbehind group or not. */
2333
2334 END_OPTIONS:
2335 if (c == ')')
2336 {
2337 if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2338 branch_extra += branch_newextra;
2339 continue;
2340 }
2341
2342 /* If options were terminated by ':' control comes here. Fall through
2343 to handle the group below. */
2344 }
2345 }
2346
2347 /* Extracting brackets must be counted so we can process escapes in a
2348 Perlish way. */
2349
2350 else bracount++;
2351
2352 /* Non-special forms of bracket. Save length for computing whole length
2353 at end if there's a repeat that requires duplication of the group. Also
2354 save the current value of branch_extra, and start the new group with
2355 the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
2356 for a lookbehind assertion. */
2357
2358 if (brastackptr >= sizeof(brastack)/sizeof(int))
2359 {
2360 *errorptr = ERR19;
2361 goto PCRE_ERROR_RETURN;
2362 }
2363
2364 bralenstack[brastackptr] = branch_extra;
2365 branch_extra = branch_newextra;
2366
2367 brastack[brastackptr++] = length;
2368 length += 3;
2369 continue;
2370
2371 /* Handle ket. Look for subsequent max/min; for certain sets of values we
2372 have to replicate this bracket up to that many times. If brastackptr is
2373 0 this is an unmatched bracket which will generate an error, but take care
2374 not to try to access brastack[-1] when computing the length and restoring
2375 the branch_extra value. */
2376
2377 case ')':
2378 length += 3;
2379 {
2380 int minval = 1;
2381 int maxval = 1;
2382 int duplength;
2383
2384 if (brastackptr > 0)
2385 {
2386 duplength = length - brastack[--brastackptr];
2387 branch_extra = bralenstack[brastackptr];
2388 }
2389 else duplength = 0;
2390
2391 /* Leave ptr at the final char; for read_repeat_counts this happens
2392 automatically; for the others we need an increment. */
2393
2394 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2395 {
2396 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2397 &compile_block);
2398 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2399 }
2400 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2401 else if (c == '+') { maxval = -1; ptr++; }
2402 else if (c == '?') { minval = 0; ptr++; }
2403
2404 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2405 group, and if the maximum is greater than zero, we have to replicate
2406 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2407 bracket set - hence the 7. */
2408
2409 if (minval == 0)
2410 {
2411 length++;
2412 if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2413 }
2414
2415 /* When the minimum is greater than zero, 1 we have to replicate up to
2416 minval-1 times, with no additions required in the copies. Then, if
2417 there is a limited maximum we have to replicate up to maxval-1 times
2418 allowing for a BRAZERO item before each optional copy and nesting
2419 brackets for all but one of the optional copies. */
2420
2421 else
2422 {
2423 length += (minval - 1) * duplength;
2424 if (maxval > minval) /* Need this test as maxval=-1 means no limit */
2425 length += (maxval - minval) * (duplength + 7) - 6;
2426 }
2427 }
2428 continue;
2429
2430 /* Non-special character. For a run of such characters the length required
2431 is the number of characters + 2, except that the maximum run length is 255.
2432 We won't get a skipped space or a non-data escape or the start of a #
2433 comment as the first character, so the length can't be zero. */
2434
2435 NORMAL_CHAR:
2436 default:
2437 length += 2;
2438 runlength = 0;
2439 do
2440 {
2441 if ((options & PCRE_EXTENDED) != 0)
2442 {
2443 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2444 if (c == '#')
2445 {
2446 while ((c = *(++ptr)) != 0 && c != '\n');
2447 continue;
2448 }
2449 }
2450
2451 /* Backslash may introduce a data char or a metacharacter; stop the
2452 string before the latter. */
2453
2454 if (c == '\\')
2455 {
2456 const uschar *saveptr = ptr;
2457 c = check_escape(&ptr, errorptr, bracount, options, FALSE,
2458 &compile_block);
2459 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2460 if (c < 0) { ptr = saveptr; break; }
2461 }
2462
2463 /* Ordinary character or single-char escape */
2464
2465 runlength++;
2466 }
2467
2468 /* This "while" is the end of the "do" above. */
2469
2470 while (runlength < 255 &&
2471 (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
2472
2473 ptr--;
2474 length += runlength;
2475 continue;
2476 }
2477 }
2478
2479 length += 4; /* For final KET and END */
2480
2481 if (length > 65539)
2482 {
2483 *errorptr = ERR20;
2484 return NULL;
2485 }
2486
2487 /* Compute the size of data block needed and get it, either from malloc or
2488 externally provided function. We specify "code[0]" in the offsetof() expression
2489 rather than just "code", because it has been reported that one broken compiler
2490 fails on "code" because it is also an independent variable. It should make no
2491 difference to the value of the offsetof(). */
2492
2493 size = length + offsetof(real_pcre, code[0]);
2494 re = (real_pcre *)(pcre_malloc)(size);
2495
2496 if (re == NULL)
2497 {
2498 *errorptr = ERR21;
2499 return NULL;
2500 }
2501
2502 /* Put in the magic number and the options. */
2503
2504 re->magic_number = MAGIC_NUMBER;
2505 re->options = options;
2506 re->tables = tables;
2507
2508 /* Set up a starting, non-extracting bracket, then compile the expression. On
2509 error, *errorptr will be set non-NULL, so we don't need to look at the result
2510 of the function here. */
2511
2512 ptr = (const uschar *)pattern;
2513 code = re->code;
2514 *code = OP_BRA;
2515 bracount = 0;
2516 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
2517 &compile_block);
2518 re->top_bracket = bracount;
2519 re->top_backref = top_backref;
2520
2521 /* If not reached end of pattern on success, there's an excess bracket. */
2522
2523 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2524
2525 /* Fill in the terminating state and check for disastrous overflow, but
2526 if debugging, leave the test till after things are printed out. */
2527
2528 *code++ = OP_END;
2529
2530 #ifndef DEBUG
2531 if (code - re->code > length) *errorptr = ERR23;
2532 #endif
2533
2534 /* Give an error if there's back reference to a non-existent capturing
2535 subpattern. */
2536
2537 if (top_backref > re->top_bracket) *errorptr = ERR15;
2538
2539 /* Failed to compile */
2540
2541 if (*errorptr != NULL)
2542 {
2543 (pcre_free)(re);
2544 PCRE_ERROR_RETURN:
2545 *erroroffset = ptr - (const uschar *)pattern;
2546 return NULL;
2547 }
2548
2549 /* If the anchored option was not passed, set flag if we can determine that it
2550 is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2551 we can determine what the first character has to be, because that speeds up
2552 unanchored matches no end. In the case of multiline matches, an alternative is
2553 to set the PCRE_STARTLINE flag if all branches start with ^. */
2554
2555 if ((options & PCRE_ANCHORED) == 0)
2556 {
2557 int temp_options = options;
2558 if (is_anchored(re->code, &temp_options))
2559 re->options |= PCRE_ANCHORED;
2560 else
2561 {
2562 int ch = find_firstchar(re->code, &temp_options);
2563 if (ch >= 0)
2564 {
2565 re->first_char = ch;
2566 re->options |= PCRE_FIRSTSET;
2567 }
2568 else if (is_startline(re->code))
2569 re->options |= PCRE_STARTLINE;
2570 }
2571 }
2572
2573 /* Print out the compiled data for debugging */
2574
2575 #ifdef DEBUG
2576
2577 printf("Length = %d top_bracket = %d top_backref = %d\n",
2578 length, re->top_bracket, re->top_backref);
2579
2580 if (re->options != 0)
2581 {
2582 printf("%s%s%s%s%s%s%s%s\n",
2583 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2584 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2585 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2586 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2587 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2588 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2589 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2590 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
2591 }
2592
2593 if ((re->options & PCRE_FIRSTSET) != 0)
2594 {
2595 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2596 else printf("First char = \\x%02x\n", re->first_char);
2597 }
2598
2599 code_end = code;
2600 code_base = code = re->code;
2601
2602 while (code < code_end)
2603 {
2604 int charlength;
2605
2606 printf("%3d ", code - code_base);
2607
2608 if (*code >= OP_BRA)
2609 {
2610 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2611 code += 2;
2612 }
2613
2614 else switch(*code)
2615 {
2616 case OP_OPT:
2617 printf(" %.2x %s", code[1], OP_names[*code]);
2618 code++;
2619 break;
2620
2621 case OP_COND:
2622 printf("%3d Cond", (code[1] << 8) + code[2]);
2623 code += 2;
2624 break;
2625
2626 case OP_CREF:
2627 printf(" %.2d %s", code[1], OP_names[*code]);
2628 code++;
2629 break;
2630
2631 case OP_CHARS:
2632 charlength = *(++code);
2633 printf("%3d ", charlength);
2634 while (charlength-- > 0)
2635 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2636 break;
2637
2638 case OP_KETRMAX:
2639 case OP_KETRMIN:
2640 case OP_ALT:
2641 case OP_KET:
2642 case OP_ASSERT:
2643 case OP_ASSERT_NOT:
2644 case OP_ASSERTBACK:
2645 case OP_ASSERTBACK_NOT:
2646 case OP_ONCE:
2647 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2648 code += 2;
2649 break;
2650
2651 case OP_REVERSE:
2652 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2653 code += 2;
2654 break;
2655
2656 case OP_STAR:
2657 case OP_MINSTAR:
2658 case OP_PLUS:
2659 case OP_MINPLUS:
2660 case OP_QUERY:
2661 case OP_MINQUERY:
2662 case OP_TYPESTAR:
2663 case OP_TYPEMINSTAR:
2664 case OP_TYPEPLUS:
2665 case OP_TYPEMINPLUS:
2666 case OP_TYPEQUERY:
2667 case OP_TYPEMINQUERY:
2668 if (*code >= OP_TYPESTAR)
2669 printf(" %s", OP_names[code[1]]);
2670 else if (isprint(c = code[1])) printf(" %c", c);
2671 else printf(" \\x%02x", c);
2672 printf("%s", OP_names[*code++]);
2673 break;
2674
2675 case OP_EXACT:
2676 case OP_UPTO:
2677 case OP_MINUPTO:
2678 if (isprint(c = code[3])) printf(" %c{", c);
2679 else printf(" \\x%02x{", c);
2680 if (*code != OP_EXACT) printf("0,");
2681 printf("%d}", (code[1] << 8) + code[2]);
2682 if (*code == OP_MINUPTO) printf("?");
2683 code += 3;
2684 break;
2685
2686 case OP_TYPEEXACT:
2687 case OP_TYPEUPTO:
2688 case OP_TYPEMINUPTO:
2689 printf(" %s{", OP_names[code[3]]);
2690 if (*code != OP_TYPEEXACT) printf(",");
2691 printf("%d}", (code[1] << 8) + code[2]);
2692 if (*code == OP_TYPEMINUPTO) printf("?");
2693 code += 3;
2694 break;
2695
2696 case OP_NOT:
2697 if (isprint(c = *(++code))) printf(" [^%c]", c);
2698 else printf(" [^\\x%02x]", c);
2699 break;
2700
2701 case OP_NOTSTAR:
2702 case OP_NOTMINSTAR:
2703 case OP_NOTPLUS:
2704 case OP_NOTMINPLUS:
2705 case OP_NOTQUERY:
2706 case OP_NOTMINQUERY:
2707 if (isprint(c = code[1])) printf(" [^%c]", c);
2708 else printf(" [^\\x%02x]", c);
2709 printf("%s", OP_names[*code++]);
2710 break;
2711
2712 case OP_NOTEXACT:
2713 case OP_NOTUPTO:
2714 case OP_NOTMINUPTO:
2715 if (isprint(c = code[3])) printf(" [^%c]{", c);
2716 else printf(" [^\\x%02x]{", c);
2717 if (*code != OP_NOTEXACT) printf(",");
2718 printf("%d}", (code[1] << 8) + code[2]);
2719 if (*code == OP_NOTMINUPTO) printf("?");
2720 code += 3;
2721 break;
2722
2723 case OP_REF:
2724 printf(" \\%d", *(++code));
2725 code ++;
2726 goto CLASS_REF_REPEAT;
2727
2728 case OP_CLASS:
2729 {
2730 int i, min, max;
2731 code++;
2732 printf(" [");
2733
2734 for (i = 0; i < 256; i++)
2735 {
2736 if ((code[i/8] & (1 << (i&7))) != 0)
2737 {
2738 int j;
2739 for (j = i+1; j < 256; j++)
2740 if ((code[j/8] & (1 << (j&7))) == 0) break;
2741 if (i == '-' || i == ']') printf("\\");
2742 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2743 if (--j > i)
2744 {
2745 printf("-");
2746 if (j == '-' || j == ']') printf("\\");
2747 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2748 }
2749 i = j;
2750 }
2751 }
2752 printf("]");
2753 code += 32;
2754
2755 CLASS_REF_REPEAT:
2756
2757 switch(*code)
2758 {
2759 case OP_CRSTAR:
2760 case OP_CRMINSTAR:
2761 case OP_CRPLUS:
2762 case OP_CRMINPLUS:
2763 case OP_CRQUERY:
2764 case OP_CRMINQUERY:
2765 printf("%s", OP_names[*code]);
2766 break;
2767
2768 case OP_CRRANGE:
2769 case OP_CRMINRANGE:
2770 min = (code[1] << 8) + code[2];
2771 max = (code[3] << 8) + code[4];
2772 if (max == 0) printf("{%d,}", min);
2773 else printf("{%d,%d}", min, max);
2774 if (*code == OP_CRMINRANGE) printf("?");
2775 code += 4;
2776 break;
2777
2778 default:
2779 code--;
2780 }
2781 }
2782 break;
2783
2784 /* Anything else is just a one-node item */
2785
2786 default:
2787 printf(" %s", OP_names[*code]);
2788 break;
2789 }
2790
2791 code++;
2792 printf("\n");
2793 }
2794 printf("------------------------------------------------------------------\n");
2795
2796 /* This check is done here in the debugging case so that the code that
2797 was compiled can be seen. */
2798
2799 if (code - re->code > length)
2800 {
2801 *errorptr = ERR23;
2802 (pcre_free)(re);
2803 *erroroffset = ptr - (uschar *)pattern;
2804 return NULL;
2805 }
2806 #endif
2807
2808 return (pcre *)re;
2809 }
2810
2811
2812
2813 /*************************************************
2814 * Match a back-reference *
2815 *************************************************/
2816
2817 /* If a back reference hasn't been set, the length that is passed is greater
2818 than the number of characters left in the string, so the match fails.
2819
2820 Arguments:
2821 offset index into the offset vector
2822 eptr points into the subject
2823 length length to be matched
2824 md points to match data block
2825 ims the ims flags
2826
2827 Returns: TRUE if matched
2828 */
2829
2830 static BOOL
2831 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
2832 int ims)
2833 {
2834 const uschar *p = md->start_subject + md->offset_vector[offset];
2835
2836 #ifdef DEBUG
2837 if (eptr >= md->end_subject)
2838 printf("matching subject <null>");
2839 else
2840 {
2841 printf("matching subject ");
2842 pchars(eptr, length, TRUE, md);
2843 }
2844 printf(" against backref ");
2845 pchars(p, length, FALSE, md);
2846 printf("\n");
2847 #endif
2848
2849 /* Always fail if not enough characters left */
2850
2851 if (length > md->end_subject - eptr) return FALSE;
2852
2853 /* Separate the caselesss case for speed */
2854
2855 if ((ims & PCRE_CASELESS) != 0)
2856 {
2857 while (length-- > 0)
2858 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
2859 }
2860 else
2861 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2862
2863 return TRUE;
2864 }
2865
2866
2867
2868 /*************************************************
2869 * Match from current position *
2870 *************************************************/
2871
2872 /* On entry ecode points to the first opcode, and eptr to the first character
2873 in the subject string, while eptrb holds the value of eptr at the start of the
2874 last bracketed group - used for breaking infinite loops matching zero-length
2875 strings.
2876
2877 Arguments:
2878 eptr pointer in subject
2879 ecode position in code
2880 offset_top current top pointer
2881 md pointer to "static" info for the match
2882 ims current /i, /m, and /s options
2883 condassert TRUE if called to check a condition assertion
2884 eptrb eptr at start of last bracket
2885
2886 Returns: TRUE if matched
2887 */
2888
2889 static BOOL
2890 match(register const uschar *eptr, register const uschar *ecode,
2891 int offset_top, match_data *md, int ims, BOOL condassert, const uschar *eptrb)
2892 {
2893 int original_ims = ims; /* Save for resetting on ')' */
2894
2895 for (;;)
2896 {
2897 int op = (int)*ecode;
2898 int min, max, ctype;
2899 register int i;
2900 register int c;
2901 BOOL minimize = FALSE;
2902
2903 /* Opening capturing bracket. If there is space in the offset vector, save
2904 the current subject position in the working slot at the top of the vector. We
2905 mustn't change the current values of the data slot, because they may be set
2906 from a previous iteration of this group, and be referred to by a reference
2907 inside the group.
2908
2909 If the bracket fails to match, we need to restore this value and also the
2910 values of the final offsets, in case they were set by a previous iteration of
2911 the same bracket.
2912
2913 If there isn't enough space in the offset vector, treat this as if it were a
2914 non-capturing bracket. Don't worry about setting the flag for the error case
2915 here; that is handled in the code for KET. */
2916
2917 if (op > OP_BRA)
2918 {
2919 int number = op - OP_BRA;
2920 int offset = number << 1;
2921
2922 #ifdef DEBUG
2923 printf("start bracket %d subject=", number);
2924 pchars(eptr, 16, TRUE, md);
2925 printf("\n");
2926 #endif
2927
2928 if (offset < md->offset_max)
2929 {
2930 int save_offset1 = md->offset_vector[offset];
2931 int save_offset2 = md->offset_vector[offset+1];
2932 int save_offset3 = md->offset_vector[md->offset_end - number];
2933
2934 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
2935 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
2936
2937 do
2938 {
2939 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
2940 ecode += (ecode[1] << 8) + ecode[2];
2941 }
2942 while (*ecode == OP_ALT);
2943
2944 DPRINTF(("bracket %d failed\n", number));
2945
2946 md->offset_vector[offset] = save_offset1;
2947 md->offset_vector[offset+1] = save_offset2;
2948 md->offset_vector[md->offset_end - number] = save_offset3;
2949 return FALSE;
2950 }
2951
2952 /* Insufficient room for saving captured contents */
2953
2954 else op = OP_BRA;
2955 }
2956
2957 /* Other types of node can be handled by a switch */
2958
2959 switch(op)
2960 {
2961 case OP_BRA: /* Non-capturing bracket: optimized */
2962 DPRINTF(("start bracket 0\n"));
2963 do
2964 {
2965 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
2966 ecode += (ecode[1] << 8) + ecode[2];
2967 }
2968 while (*ecode == OP_ALT);
2969 DPRINTF(("bracket 0 failed\n"));
2970 return FALSE;
2971
2972 /* Conditional group: compilation checked that there are no more than
2973 two branches. If the condition is false, skipping the first branch takes us
2974 past the end if there is only one branch, but that's OK because that is
2975 exactly what going to the ket would do. */
2976
2977 case OP_COND:
2978 if (ecode[3] == OP_CREF) /* Condition is extraction test */
2979 {
2980 int offset = ecode[4] << 1; /* Doubled reference number */
2981 return match(eptr,
2982 ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
2983 5 : 3 + (ecode[1] << 8) + ecode[2]),
2984 offset_top, md, ims, FALSE, eptr);
2985 }
2986
2987 /* The condition is an assertion. Call match() to evaluate it - setting
2988 the final argument TRUE causes it to stop at the end of an assertion. */
2989
2990 else
2991 {
2992 if (match(eptr, ecode+3, offset_top, md, ims, TRUE, NULL))
2993 {
2994 ecode += 3 + (ecode[4] << 8) + ecode[5];
2995 while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
2996 }
2997 else ecode += (ecode[1] << 8) + ecode[2];
2998 return match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr);
2999 }
3000 /* Control never reaches here */
3001
3002 /* Skip over conditional reference data if encountered (should not be) */
3003
3004 case OP_CREF:
3005 ecode += 2;
3006 break;
3007
3008 /* End of the pattern */
3009
3010 case OP_END:
3011 md->end_match_ptr = eptr; /* Record where we ended */
3012 md->end_offset_top = offset_top; /* and how many extracts were taken */
3013 return TRUE;
3014
3015 /* Change option settings */
3016
3017 case OP_OPT:
3018 ims = ecode[1];
3019 ecode += 2;
3020 DPRINTF(("ims set to %02x\n", ims));
3021 break;
3022
3023 /* Assertion brackets. Check the alternative branches in turn - the
3024 matching won't pass the KET for an assertion. If any one branch matches,
3025 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
3026 start of each branch to move the current point backwards, so the code at
3027 this level is identical to the lookahead case. */
3028
3029 case OP_ASSERT:
3030 case OP_ASSERTBACK:
3031 do
3032 {
3033 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) break;
3034 ecode += (ecode[1] << 8) + ecode[2];
3035 }
3036 while (*ecode == OP_ALT);
3037 if (*ecode == OP_KET) return FALSE;
3038
3039 /* If checking an assertion for a condition, return TRUE. */
3040
3041 if (condassert) return TRUE;
3042
3043 /* Continue from after the assertion, updating the offsets high water
3044 mark, since extracts may have been taken during the assertion. */
3045
3046 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3047 ecode += 3;
3048 offset_top = md->end_offset_top;
3049 continue;
3050
3051 /* Negative assertion: all branches must fail to match */
3052
3053 case OP_ASSERT_NOT:
3054 case OP_ASSERTBACK_NOT:
3055 do
3056 {
3057 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) return FALSE;
3058 ecode += (ecode[1] << 8) + ecode[2];
3059 }
3060 while (*ecode == OP_ALT);
3061
3062 if (condassert) return TRUE;
3063 ecode += 3;
3064 continue;
3065
3066 /* Move the subject pointer back. This occurs only at the start of
3067 each branch of a lookbehind assertion. If we are too close to the start to
3068 move back, this match function fails. */
3069
3070 case OP_REVERSE:
3071 eptr -= (ecode[1] << 8) + ecode[2];
3072 if (eptr < md->start_subject) return FALSE;
3073 ecode += 3;
3074 break;
3075
3076
3077 /* "Once" brackets are like assertion brackets except that after a match,
3078 the point in the subject string is not moved back. Thus there can never be
3079 a move back into the brackets. Check the alternative branches in turn - the
3080 matching won't pass the KET for this kind of subpattern. If any one branch
3081 matches, we carry on as at the end of a normal bracket, leaving the subject
3082 pointer. */
3083
3084 case OP_ONCE:
3085 {
3086 const uschar *prev = ecode;
3087
3088 do
3089 {
3090 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) break;
3091 ecode += (ecode[1] << 8) + ecode[2];
3092 }
3093 while (*ecode == OP_ALT);
3094
3095 /* If hit the end of the group (which could be repeated), fail */
3096
3097 if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
3098
3099 /* Continue as from after the assertion, updating the offsets high water
3100 mark, since extracts may have been taken. */
3101
3102 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3103
3104 offset_top = md->end_offset_top;
3105 eptr = md->end_match_ptr;
3106
3107 /* For a non-repeating ket, just continue at this level. This also
3108 happens for a repeating ket if no characters were matched in the group.
3109 This is the forcible breaking of infinite loops as implemented in Perl
3110 5.005. If there is an options reset, it will get obeyed in the normal
3111 course of events. */
3112
3113 if (*ecode == OP_KET || eptr == eptrb)
3114 {
3115 ecode += 3;
3116 break;
3117 }
3118
3119 /* The repeating kets try the rest of the pattern or restart from the
3120 preceding bracket, in the appropriate order. We need to reset any options
3121 that changed within the bracket before re-running it, so check the next
3122 opcode. */
3123
3124 if (ecode[3] == OP_OPT)
3125 {
3126 ims = (ims & ~PCRE_IMS) | ecode[4];
3127 DPRINTF(("ims set to %02x at group repeat\n", ims));
3128 }
3129
3130 if (*ecode == OP_KETRMIN)
3131 {
3132 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
3133 match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
3134 }
3135 else /* OP_KETRMAX */
3136 {
3137 if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
3138 match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3139 }
3140 }
3141 return FALSE;
3142
3143 /* An alternation is the end of a branch; scan along to find the end of the
3144 bracketed group and go to there. */
3145
3146 case OP_ALT:
3147 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3148 break;
3149
3150 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3151 that it may occur zero times. It may repeat infinitely, or not at all -
3152 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3153 repeat limits are compiled as a number of copies, with the optional ones
3154 preceded by BRAZERO or BRAMINZERO. */
3155
3156 case OP_BRAZERO:
3157 {
3158 const uschar *next = ecode+1;
3159 if (match(eptr, next, offset_top, md, ims, FALSE, eptr)) return TRUE;
3160 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3161 ecode = next + 3;
3162 }
3163 break;
3164
3165 case OP_BRAMINZERO:
3166 {
3167 const uschar *next = ecode+1;
3168 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3169 if (match(eptr, next+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3170 ecode++;
3171 }
3172 break;
3173
3174 /* End of a group, repeated or non-repeating. If we are at the end of
3175 an assertion "group", stop matching and return TRUE, but record the
3176 current high water mark for use by positive assertions. Do this also
3177 for the "once" (not-backup up) groups. */
3178
3179 case OP_KET:
3180 case OP_KETRMIN:
3181 case OP_KETRMAX:
3182 {
3183 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3184
3185 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3186 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3187 *prev == OP_ONCE)
3188 {
3189 md->end_match_ptr = eptr; /* For ONCE */
3190 md->end_offset_top = offset_top;
3191 return TRUE;
3192 }
3193
3194 /* In all other cases except a conditional group we have to check the
3195 group number back at the start and if necessary complete handling an
3196 extraction by setting the offsets and bumping the high water mark. */
3197
3198 if (*prev != OP_COND)
3199 {
3200 int number = *prev - OP_BRA;
3201 int offset = number << 1;
3202
3203 DPRINTF(("end bracket %d\n", number));
3204
3205 if (number > 0)
3206 {
3207 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3208 {
3209 md->offset_vector[offset] =
3210 md->offset_vector[md->offset_end - number];
3211 md->offset_vector[offset+1] = eptr - md->start_subject;
3212 if (offset_top <= offset) offset_top = offset + 2;
3213 }
3214 }
3215 }
3216
3217 /* Reset the value of the ims flags, in case they got changed during
3218 the group. */
3219
3220 ims = original_ims;
3221 DPRINTF(("ims reset to %02x\n", ims));
3222
3223 /* For a non-repeating ket, just continue at this level. This also
3224 happens for a repeating ket if no characters were matched in the group.
3225 This is the forcible breaking of infinite loops as implemented in Perl
3226 5.005. If there is an options reset, it will get obeyed in the normal
3227 course of events. */
3228
3229 if (*ecode == OP_KET || eptr == eptrb)
3230 {
3231 ecode += 3;
3232 break;
3233 }
3234
3235 /* The repeating kets try the rest of the pattern or restart from the
3236 preceding bracket, in the appropriate order. */
3237
3238 if (*ecode == OP_KETRMIN)
3239 {
3240 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
3241 match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
3242 }
3243 else /* OP_KETRMAX */
3244 {
3245 if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
3246 match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3247 }
3248 }
3249 return FALSE;
3250
3251 /* Start of subject unless notbol, or after internal newline if multiline */
3252
3253 case OP_CIRC:
3254 if (md->notbol && eptr == md->start_subject) return FALSE;
3255 if ((ims & PCRE_MULTILINE) != 0)
3256 {
3257 if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
3258 ecode++;
3259 break;
3260 }
3261 /* ... else fall through */
3262
3263 /* Start of subject assertion */
3264
3265 case OP_SOD:
3266 if (eptr != md->start_subject) return FALSE;
3267 ecode++;
3268 break;
3269
3270 /* Assert before internal newline if multiline, or before a terminating
3271 newline unless endonly is set, else end of subject unless noteol is set. */
3272
3273 case OP_DOLL:
3274 if ((ims & PCRE_MULTILINE) != 0)
3275 {
3276 if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }
3277 else { if (md->noteol) return FALSE; }
3278 ecode++;
3279 break;
3280 }
3281 else
3282 {
3283 if (md->noteol) return FALSE;
3284 if (!md->endonly)
3285 {
3286 if (eptr < md->end_subject - 1 ||
3287 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3288
3289 ecode++;
3290 break;
3291 }
3292 }
3293 /* ... else fall through */
3294
3295 /* End of subject assertion (\z) */
3296
3297 case OP_EOD:
3298 if (eptr < md->end_subject) return FALSE;
3299 ecode++;
3300 break;
3301
3302 /* End of subject or ending \n assertion (\Z) */
3303
3304 case OP_EODN:
3305 if (eptr < md->end_subject - 1 ||
3306 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3307 ecode++;
3308 break;
3309
3310 /* Word boundary assertions */
3311
3312 case OP_NOT_WORD_BOUNDARY:
3313 case OP_WORD_BOUNDARY:
3314 {
3315 BOOL prev_is_word = (eptr != md->start_subject) &&
3316 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
3317 BOOL cur_is_word = (eptr < md->end_subject) &&
3318 ((md->ctypes[*eptr] & ctype_word) != 0);
3319 if ((*ecode++ == OP_WORD_BOUNDARY)?
3320 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3321 return FALSE;
3322 }
3323 break;
3324
3325 /* Match a single character type; inline for speed */
3326
3327 case OP_ANY:
3328 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
3329 return FALSE;
3330 if (eptr++ >= md->end_subject) return FALSE;
3331 ecode++;
3332 break;
3333
3334 case OP_NOT_DIGIT:
3335 if (eptr >= md->end_subject ||
3336 (md->ctypes[*eptr++] & ctype_digit) != 0)
3337 return FALSE;
3338 ecode++;
3339 break;
3340
3341 case OP_DIGIT:
3342 if (eptr >= md->end_subject ||
3343 (md->ctypes[*eptr++] & ctype_digit) == 0)
3344 return FALSE;
3345 ecode++;
3346 break;
3347
3348 case OP_NOT_WHITESPACE:
3349 if (eptr >= md->end_subject ||
3350 (md->ctypes[*eptr++] & ctype_space) != 0)
3351 return FALSE;
3352 ecode++;
3353 break;
3354
3355 case OP_WHITESPACE:
3356 if (eptr >= md->end_subject ||
3357 (md->ctypes[*eptr++] & ctype_space) == 0)
3358 return FALSE;
3359 ecode++;
3360 break;
3361
3362 case OP_NOT_WORDCHAR:
3363 if (eptr >= md->end_subject ||
3364 (md->ctypes[*eptr++] & ctype_word) != 0)
3365 return FALSE;
3366 ecode++;
3367 break;
3368
3369 case OP_WORDCHAR:
3370 if (eptr >= md->end_subject ||
3371 (md->ctypes[*eptr++] & ctype_word) == 0)
3372 return FALSE;
3373 ecode++;
3374 break;
3375
3376 /* Match a back reference, possibly repeatedly. Look past the end of the
3377 item to see if there is repeat information following. The code is similar
3378 to that for character classes, but repeated for efficiency. Then obey
3379 similar code to character type repeats - written out again for speed.
3380 However, if the referenced string is the empty string, always treat
3381 it as matched, any number of times (otherwise there could be infinite
3382 loops). */
3383
3384 case OP_REF:
3385 {
3386 int length;
3387 int offset = ecode[1] << 1; /* Doubled reference number */
3388 ecode += 2; /* Advance past the item */
3389
3390 /* If the reference is unset, set the length to be longer than the amount
3391 of subject left; this ensures that every attempt at a match fails. We
3392 can't just fail here, because of the possibility of quantifiers with zero
3393 minima. */
3394
3395 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
3396 md->end_subject - eptr + 1 :
3397 md->offset_vector[offset+1] - md->offset_vector[offset];
3398
3399 /* Set up for repetition, or handle the non-repeated case */
3400
3401 switch (*ecode)
3402 {
3403 case OP_CRSTAR:
3404 case OP_CRMINSTAR:
3405 case OP_CRPLUS:
3406 case OP_CRMINPLUS:
3407 case OP_CRQUERY:
3408 case OP_CRMINQUERY:
3409 c = *ecode++ - OP_CRSTAR;
3410 minimize = (c & 1) != 0;
3411 min = rep_min[c]; /* Pick up values from tables; */
3412 max = rep_max[c]; /* zero for max => infinity */
3413 if (max == 0) max = INT_MAX;
3414 break;
3415
3416 case OP_CRRANGE:
3417 case OP_CRMINRANGE:
3418 minimize = (*ecode == OP_CRMINRANGE);
3419 min = (ecode[1] << 8) + ecode[2];
3420 max = (ecode[3] << 8) + ecode[4];
3421 if (max == 0) max = INT_MAX;
3422 ecode += 5;
3423 break;
3424
3425 default: /* No repeat follows */
3426 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3427 eptr += length;
3428 continue; /* With the main loop */
3429 }
3430
3431 /* If the length of the reference is zero, just continue with the
3432 main loop. */
3433
3434 if (length == 0) continue;
3435
3436 /* First, ensure the minimum number of matches are present. We get back
3437 the length of the reference string explicitly rather than passing the
3438 address of eptr, so that eptr can be a register variable. */
3439
3440 for (i = 1; i <= min; i++)
3441 {
3442 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3443 eptr += length;
3444 }
3445
3446 /* If min = max, continue at the same level without recursion.
3447 They are not both allowed to be zero. */
3448
3449 if (min == max) continue;
3450
3451 /* If minimizing, keep trying and advancing the pointer */
3452
3453 if (minimize)
3454 {
3455 for (i = min;; i++)
3456 {
3457 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3458 return TRUE;
3459 if (i >= max || !match_ref(offset, eptr, length, md, ims))
3460 return FALSE;
3461 eptr += length;
3462 }
3463 /* Control never gets here */
3464 }
3465
3466 /* If maximizing, find the longest string and work backwards */
3467
3468 else
3469 {
3470 const uschar *pp = eptr;
3471 for (i = min; i < max; i++)
3472 {
3473 if (!match_ref(offset, eptr, length, md, ims)) break;
3474 eptr += length;
3475 }
3476 while (eptr >= pp)
3477 {
3478 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3479 return TRUE;
3480 eptr -= length;
3481 }
3482 return FALSE;
3483 }
3484 }
3485 /* Control never gets here */
3486
3487
3488
3489 /* Match a character class, possibly repeatedly. Look past the end of the
3490 item to see if there is repeat information following. Then obey similar
3491 code to character type repeats - written out again for speed. */
3492
3493 case OP_CLASS:
3494 {
3495 const uschar *data = ecode + 1; /* Save for matching */
3496 ecode += 33; /* Advance past the item */
3497
3498 switch (*ecode)
3499 {
3500 case OP_CRSTAR:
3501 case OP_CRMINSTAR:
3502 case OP_CRPLUS:
3503 case OP_CRMINPLUS:
3504 case OP_CRQUERY:
3505 case OP_CRMINQUERY:
3506 c = *ecode++ - OP_CRSTAR;
3507 minimize = (c & 1) != 0;
3508 min = rep_min[c]; /* Pick up values from tables; */
3509 max = rep_max[c]; /* zero for max => infinity */
3510 if (max == 0) max = INT_MAX;
3511 break;
3512
3513 case OP_CRRANGE:
3514 case OP_CRMINRANGE:
3515 minimize = (*ecode == OP_CRMINRANGE);
3516 min = (ecode[1] << 8) + ecode[2];
3517 max = (ecode[3] << 8) + ecode[4];
3518 if (max == 0) max = INT_MAX;
3519 ecode += 5;
3520 break;
3521
3522 default: /* No repeat follows */
3523 min = max = 1;
3524 break;
3525 }
3526
3527 /* First, ensure the minimum number of matches are present. */
3528
3529 for (i = 1; i <= min; i++)
3530 {
3531 if (eptr >= md->end_subject) return FALSE;
3532 c = *eptr++;
3533 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3534 return FALSE;
3535 }
3536
3537 /* If max == min we can continue with the main loop without the
3538 need to recurse. */
3539
3540 if (min == max) continue;
3541
3542 /* If minimizing, keep testing the rest of the expression and advancing
3543 the pointer while it matches the class. */
3544
3545 if (minimize)
3546 {
3547 for (i = min;; i++)
3548 {
3549 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3550 return TRUE;
3551 if (i >= max || eptr >= md->end_subject) return FALSE;
3552 c = *eptr++;
3553 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3554 return FALSE;
3555 }
3556 /* Control never gets here */
3557 }
3558
3559 /* If maximizing, find the longest possible run, then work backwards. */
3560
3561 else
3562 {
3563 const uschar *pp = eptr;
3564 for (i = min; i < max; eptr++, i++)
3565 {
3566 if (eptr >= md->end_subject) break;
3567 c = *eptr;
3568 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3569 break;
3570 }
3571
3572 while (eptr >= pp)
3573 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3574 return TRUE;
3575 return FALSE;
3576 }
3577 }
3578 /* Control never gets here */
3579
3580 /* Match a run of characters */
3581
3582 case OP_CHARS:
3583 {
3584 register int length = ecode[1];
3585 ecode += 2;
3586
3587 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3588 if (eptr >= md->end_subject)
3589 printf("matching subject <null> against pattern ");
3590 else
3591 {
3592 printf("matching subject ");
3593 pchars(eptr, length, TRUE, md);
3594 printf(" against pattern ");
3595 }
3596 pchars(ecode, length, FALSE, md);
3597 printf("\n");
3598 #endif
3599
3600 if (length > md->end_subject - eptr) return FALSE;
3601 if ((ims & PCRE_CASELESS) != 0)
3602 {
3603 while (length-- > 0)
3604 if (md->lcc[*ecode++] != md->lcc[*eptr++])
3605 return FALSE;
3606 }
3607 else
3608 {
3609 while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
3610 }
3611 }
3612 break;
3613
3614 /* Match a single character repeatedly; different opcodes share code. */
3615
3616 case OP_EXACT:
3617 min = max = (ecode[1] << 8) + ecode[2];
3618 ecode += 3;
3619 goto REPEATCHAR;
3620
3621 case OP_UPTO:
3622 case OP_MINUPTO:
3623 min = 0;
3624 max = (ecode[1] << 8) + ecode[2];
3625 minimize = *ecode == OP_MINUPTO;
3626 ecode += 3;
3627 goto REPEATCHAR;
3628
3629 case OP_STAR:
3630 case OP_MINSTAR:
3631 case OP_PLUS:
3632 case OP_MINPLUS:
3633 case OP_QUERY:
3634 case OP_MINQUERY:
3635 c = *ecode++ - OP_STAR;
3636 minimize = (c & 1) != 0;
3637 min = rep_min[c]; /* Pick up values from tables; */
3638 max = rep_max[c]; /* zero for max => infinity */
3639 if (max == 0) max = INT_MAX;
3640
3641 /* Common code for all repeated single-character matches. We can give
3642 up quickly if there are fewer than the minimum number of characters left in
3643 the subject. */
3644
3645 REPEATCHAR:
3646 if (min > md->end_subject - eptr) return FALSE;
3647 c = *ecode++;
3648
3649 /* The code is duplicated for the caseless and caseful cases, for speed,
3650 since matching characters is likely to be quite common. First, ensure the
3651 minimum number of matches are present. If min = max, continue at the same
3652 level without recursing. Otherwise, if minimizing, keep trying the rest of
3653 the expression and advancing one matching character if failing, up to the
3654 maximum. Alternatively, if maximizing, find the maximum number of
3655 characters and work backwards. */
3656
3657 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
3658 max, eptr));
3659
3660 if ((ims & PCRE_CASELESS) != 0)
3661 {
3662 c = md->lcc[c];
3663 for (i = 1; i <= min; i++)
3664 if (c != md->lcc[*eptr++]) return FALSE;
3665 if (min == max) continue;
3666 if (minimize)
3667 {
3668 for (i = min;; i++)
3669 {
3670 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3671 return TRUE;
3672 if (i >= max || eptr >= md->end_subject ||
3673 c != md->lcc[*eptr++])
3674 return FALSE;
3675 }
3676 /* Control never gets here */
3677 }
3678 else
3679 {
3680 const uschar *pp = eptr;
3681 for (i = min; i < max; i++)
3682 {
3683 if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
3684 eptr++;
3685 }
3686 while (eptr >= pp)
3687 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3688 return TRUE;
3689 return FALSE;
3690 }
3691 /* Control never gets here */
3692 }
3693
3694 /* Caseful comparisons */
3695
3696 else
3697 {
3698 for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
3699 if (min == max) continue;
3700 if (minimize)
3701 {
3702 for (i = min;; i++)
3703 {
3704 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3705 return TRUE;
3706 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
3707 }
3708 /* Control never gets here */
3709 }
3710 else
3711 {
3712 const uschar *pp = eptr;
3713 for (i = min; i < max; i++)
3714 {
3715 if (eptr >= md->end_subject || c != *eptr) break;
3716 eptr++;
3717 }
3718 while (eptr >= pp)
3719 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3720 return TRUE;
3721 return FALSE;
3722 }
3723 }
3724 /* Control never gets here */
3725
3726 /* Match a negated single character */
3727
3728 case OP_NOT:
3729 if (eptr >= md->end_subject) return FALSE;
3730 ecode++;
3731 if ((ims & PCRE_CASELESS) != 0)
3732 {
3733 if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
3734 }
3735 else
3736 {
3737 if (*ecode++ == *eptr++) return FALSE;
3738 }
3739 break;
3740
3741 /* Match a negated single character repeatedly. This is almost a repeat of
3742 the code for a repeated single character, but I haven't found a nice way of
3743 commoning these up that doesn't require a test of the positive/negative
3744 option for each character match. Maybe that wouldn't add very much to the
3745 time taken, but character matching *is* what this is all about... */
3746
3747 case OP_NOTEXACT:
3748 min = max = (ecode[1] << 8) + ecode[2];
3749 ecode += 3;
3750 goto REPEATNOTCHAR;
3751
3752 case OP_NOTUPTO:
3753 case OP_NOTMINUPTO:
3754 min = 0;
3755 max = (ecode[1] << 8) + ecode[2];
3756 minimize = *ecode == OP_NOTMINUPTO;
3757 ecode += 3;
3758 goto REPEATNOTCHAR;
3759
3760 case OP_NOTSTAR:
3761 case OP_NOTMINSTAR:
3762 case OP_NOTPLUS:
3763 case OP_NOTMINPLUS:
3764 case OP_NOTQUERY:
3765 case OP_NOTMINQUERY:
3766 c = *ecode++ - OP_NOTSTAR;
3767 minimize = (c & 1) != 0;
3768 min = rep_min[c]; /* Pick up values from tables; */
3769 max = rep_max[c]; /* zero for max => infinity */
3770 if (max == 0) max = INT_MAX;
3771
3772 /* Common code for all repeated single-character matches. We can give
3773 up quickly if there are fewer than the minimum number of characters left in
3774 the subject. */
3775
3776 REPEATNOTCHAR:
3777 if (min > md->end_subject - eptr) return FALSE;
3778 c = *ecode++;
3779
3780 /* The code is duplicated for the caseless and caseful cases, for speed,
3781 since matching characters is likely to be quite common. First, ensure the
3782 minimum number of matches are present. If min = max, continue at the same
3783 level without recursing. Otherwise, if minimizing, keep trying the rest of
3784 the expression and advancing one matching character if failing, up to the
3785 maximum. Alternatively, if maximizing, find the maximum number of
3786 characters and work backwards. */
3787
3788 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
3789 max, eptr));
3790
3791 if ((ims & PCRE_CASELESS) != 0)
3792 {
3793 c = md->lcc[c];
3794 for (i = 1; i <= min; i++)
3795 if (c == md->lcc[*eptr++]) return FALSE;
3796 if (min == max) continue;
3797 if (minimize)
3798 {
3799 for (i = min;; i++)
3800 {
3801 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3802 return TRUE;
3803 if (i >= max || eptr >= md->end_subject ||
3804 c == md->lcc[*eptr++])
3805 return FALSE;
3806 }
3807 /* Control never gets here */
3808 }
3809 else
3810 {
3811 const uschar *pp = eptr;
3812 for (i = min; i < max; i++)
3813 {
3814 if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
3815 eptr++;
3816 }
3817 while (eptr >= pp)
3818 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3819 return TRUE;
3820 return FALSE;
3821 }
3822 /* Control never gets here */
3823 }
3824
3825 /* Caseful comparisons */
3826
3827 else
3828 {
3829 for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
3830 if (min == max) continue;
3831 if (minimize)
3832 {
3833 for (i = min;; i++)
3834 {
3835 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3836 return TRUE;
3837 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
3838 }
3839 /* Control never gets here */
3840 }
3841 else
3842 {
3843 const uschar *pp = eptr;
3844 for (i = min; i < max; i++)
3845 {
3846 if (eptr >= md->end_subject || c == *eptr) break;
3847 eptr++;
3848 }
3849 while (eptr >= pp)
3850 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3851 return TRUE;
3852 return FALSE;
3853 }
3854 }
3855 /* Control never gets here */
3856
3857 /* Match a single character type repeatedly; several different opcodes
3858 share code. This is very similar to the code for single characters, but we
3859 repeat it in the interests of efficiency. */
3860
3861 case OP_TYPEEXACT:
3862 min = max = (ecode[1] << 8) + ecode[2];
3863 minimize = TRUE;
3864 ecode += 3;
3865 goto REPEATTYPE;
3866
3867 case OP_TYPEUPTO:
3868 case OP_TYPEMINUPTO:
3869 min = 0;
3870 max = (ecode[1] << 8) + ecode[2];
3871 minimize = *ecode == OP_TYPEMINUPTO;
3872 ecode += 3;
3873 goto REPEATTYPE;
3874
3875 case OP_TYPESTAR:
3876 case OP_TYPEMINSTAR:
3877 case OP_TYPEPLUS:
3878 case OP_TYPEMINPLUS:
3879 case OP_TYPEQUERY:
3880 case OP_TYPEMINQUERY:
3881 c = *ecode++ - OP_TYPESTAR;
3882 minimize = (c & 1) != 0;
3883 min = rep_min[c]; /* Pick up values from tables; */
3884 max = rep_max[c]; /* zero for max => infinity */
3885 if (max == 0) max = INT_MAX;
3886
3887 /* Common code for all repeated single character type matches */
3888
3889 REPEATTYPE:
3890 ctype = *ecode++; /* Code for the character type */
3891
3892 /* First, ensure the minimum number of matches are present. Use inline
3893 code for maximizing the speed, and do the type test once at the start
3894 (i.e. keep it out of the loop). Also test that there are at least the
3895 minimum number of characters before we start. */
3896
3897 if (min > md->end_subject - eptr) return FALSE;
3898 if (min > 0) switch(ctype)
3899 {
3900 case OP_ANY:
3901 if ((ims & PCRE_DOTALL) == 0)
3902 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
3903 else eptr += min;
3904 break;
3905
3906 case OP_NOT_DIGIT:
3907 for (i = 1; i <= min; i++)
3908 if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
3909 break;
3910
3911 case OP_DIGIT:
3912 for (i = 1; i <= min; i++)
3913 if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
3914 break;
3915
3916 case OP_NOT_WHITESPACE:
3917 for (i = 1; i <= min; i++)
3918 if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
3919 break;
3920
3921 case OP_WHITESPACE:
3922 for (i = 1; i <= min; i++)
3923 if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
3924 break;
3925
3926 case OP_NOT_WORDCHAR:
3927 for (i = 1; i <= min; i++)
3928 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3929 return FALSE;
3930 break;
3931
3932 case OP_WORDCHAR:
3933 for (i = 1; i <= min; i++)
3934 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3935 return FALSE;
3936 break;
3937 }
3938
3939 /* If min = max, continue at the same level without recursing */
3940
3941 if (min == max) continue;
3942
3943 /* If minimizing, we have to test the rest of the pattern before each
3944 subsequent match. */
3945
3946 if (minimize)
3947 {
3948 for (i = min;; i++)
3949 {
3950 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE;
3951 if (i >= max || eptr >= md->end_subject) return FALSE;
3952
3953 c = *eptr++;
3954 switch(ctype)
3955 {
3956 case OP_ANY:
3957 if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;
3958 break;
3959
3960 case OP_NOT_DIGIT:
3961 if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
3962 break;
3963
3964 case OP_DIGIT:
3965 if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
3966 break;
3967
3968 case OP_NOT_WHITESPACE:
3969 if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
3970 break;
3971
3972 case OP_WHITESPACE:
3973 if ((md->ctypes[c] & ctype_space) == 0) return FALSE;
3974 break;
3975
3976 case OP_NOT_WORDCHAR:
3977 if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
3978 break;
3979
3980 case OP_WORDCHAR:
3981 if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
3982 break;
3983 }
3984 }
3985 /* Control never gets here */
3986 }
3987
3988 /* If maximizing it is worth using inline code for speed, doing the type
3989 test once at the start (i.e. keep it out of the loop). */
3990
3991 else
3992 {
3993 const uschar *pp = eptr;
3994 switch(ctype)
3995 {
3996 case OP_ANY:
3997 if ((ims & PCRE_DOTALL) == 0)
3998 {
3999 for (i = min; i < max; i++)
4000 {
4001 if (eptr >= md->end_subject || *eptr == '\n') break;
4002 eptr++;
4003 }
4004 }
4005 else
4006 {
4007 c = max - min;
4008 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4009 eptr += c;
4010 }
4011 break;
4012
4013 case OP_NOT_DIGIT:
4014 for (i = min; i < max; i++)
4015 {
4016 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4017 break;
4018 eptr++;
4019 }
4020 break;
4021
4022 case OP_DIGIT:
4023 for (i = min; i < max; i++)
4024 {
4025 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4026 break;
4027 eptr++;
4028 }
4029 break;
4030
4031 case OP_NOT_WHITESPACE:
4032 for (i = min; i < max; i++)
4033 {
4034 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4035 break;
4036 eptr++;
4037 }
4038 break;
4039
4040 case OP_WHITESPACE:
4041 for (i = min; i < max; i++)
4042 {
4043 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4044 break;
4045 eptr++;
4046 }
4047 break;
4048
4049 case OP_NOT_WORDCHAR:
4050 for (i = min; i < max; i++)
4051 {
4052 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4053 break;
4054 eptr++;
4055 }
4056 break;
4057
4058 case OP_WORDCHAR:
4059 for (i = min; i < max; i++)
4060 {
4061 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4062 break;
4063 eptr++;
4064 }
4065 break;
4066 }
4067
4068 while (eptr >= pp)
4069 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
4070 return TRUE;
4071 return FALSE;
4072 }
4073 /* Control never gets here */
4074
4075 /* There's been some horrible disaster. */
4076
4077 default:
4078 DPRINTF(("Unknown opcode %d\n", *ecode));
4079 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4080 return FALSE;
4081 }
4082
4083 /* Do not stick any code in here without much thought; it is assumed
4084 that "continue" in the code above comes out to here to repeat the main
4085 loop. */
4086
4087 } /* End of main loop */
4088 /* Control never reaches here */
4089 }
4090
4091
4092
4093
4094 /*************************************************
4095 * Execute a Regular Expression *
4096 *************************************************/
4097
4098 /* This function applies a compiled re to a subject string and picks out
4099 portions of the string if it matches. Two elements in the vector are set for
4100 each substring: the offsets to the start and end of the substring.
4101
4102 Arguments:
4103 external_re points to the compiled expression
4104 external_extra points to "hints" from pcre_study() or is NULL
4105 subject points to the subject string
4106 length length of subject string (may contain binary zeros)
4107 options option bits
4108 offsets points to a vector of ints to be filled in with offsets
4109 offsetcount the number of elements in the vector
4110
4111 Returns: > 0 => success; value is the number of elements filled in
4112 = 0 => success, but offsets is not big enough
4113 -1 => failed to match
4114 < -1 => some kind of unexpected problem
4115 */
4116
4117 int
4118 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4119 const char *subject, int length, int options, int *offsets, int offsetcount)
4120 {
4121 int resetcount, ocount;
4122 int first_char = -1;
4123 int ims = 0;
4124 match_data match_block;
4125 const uschar *start_bits = NULL;
4126 const uschar *start_match = (const uschar *)subject;
4127 const uschar *end_subject;
4128 const real_pcre *re = (const real_pcre *)external_re;
4129 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4130 BOOL using_temporary_offsets = FALSE;
4131 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4132 BOOL startline = (re->options & PCRE_STARTLINE) != 0;
4133
4134 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4135
4136 if (re == NULL || subject == NULL ||
4137 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4138 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4139
4140 match_block.start_subject = (const uschar *)subject;
4141 match_block.end_subject = match_block.start_subject + length;
4142 end_subject = match_block.end_subject;
4143
4144 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4145
4146 match_block.notbol = (options & PCRE_NOTBOL) != 0;
4147 match_block.noteol = (options & PCRE_NOTEOL) != 0;
4148
4149 match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4150
4151 match_block.lcc = re->tables + lcc_offset;
4152 match_block.ctypes = re->tables + ctypes_offset;
4153
4154 /* The ims options can vary during the matching as a result of the presence
4155 of (?ims) items in the pattern. They are kept in a local variable so that
4156 restoring at the exit of a group is easy. */
4157
4158 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4159
4160 /* If the expression has got more back references than the offsets supplied can
4161 hold, we get a temporary bit of working store to use during the matching.
4162 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4163 of 3. */
4164
4165 ocount = offsetcount - (offsetcount % 3);
4166
4167 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4168 {
4169 ocount = re->top_backref * 3 + 3;
4170 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4171 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4172 using_temporary_offsets = TRUE;
4173 DPRINTF(("Got memory to hold back references\n"));
4174 }
4175 else match_block.offset_vector = offsets;
4176
4177 match_block.offset_end = ocount;
4178 match_block.offset_max = (2*ocount)/3;
4179 match_block.offset_overflow = FALSE;
4180
4181 /* Compute the minimum number of offsets that we need to reset each time. Doing
4182 this makes a huge difference to execution time when there aren't many brackets
4183 in the pattern. */
4184
4185 resetcount = 2 + re->top_bracket * 2;
4186 if (resetcount > offsetcount) resetcount = ocount;
4187
4188 /* Reset the working variable associated with each extraction. These should
4189 never be used unless previously set, but they get saved and restored, and so we
4190 initialize them to avoid reading uninitialized locations. */
4191
4192 if (match_block.offset_vector != NULL)
4193 {
4194 register int *iptr = match_block.offset_vector + ocount;
4195 register int *iend = iptr - resetcount/2 + 1;
4196 while (--iptr >= iend) *iptr = -1;
4197 }
4198
4199 /* Set up the first character to match, if available. The first_char value is
4200 never set for an anchored regular expression, but the anchoring may be forced
4201 at run time, so we have to test for anchoring. The first char may be unset for
4202 an unanchored pattern, of course. If there's no first char and the pattern was
4203 studied, there may be a bitmap of possible first characters. */
4204
4205 if (!anchored)
4206 {
4207 if ((re->options & PCRE_FIRSTSET) != 0)
4208 {
4209 first_char = re->first_char;
4210 if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
4211 }
4212 else
4213 if (!startline && extra != NULL &&
4214 (extra->options & PCRE_STUDY_MAPPED) != 0)
4215 start_bits = extra->start_bits;
4216 }
4217
4218 /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
4219
4220 do
4221 {
4222 int rc;
4223 register int *iptr = match_block.offset_vector;
4224 register int *iend = iptr + resetcount;
4225
4226 /* Reset the maximum number of extractions we might see. */
4227
4228 while (iptr < iend) *iptr++ = -1;
4229
4230 /* Advance to a unique first char if possible */
4231
4232 if (first_char >= 0)
4233 {
4234 if ((ims & PCRE_CASELESS) != 0)
4235 while (start_match < end_subject &&
4236 match_block.lcc[*start_match] != first_char)
4237 start_match++;
4238 else
4239 while (start_match < end_subject && *start_match != first_char)
4240 start_match++;
4241 }
4242
4243 /* Or to just after \n for a multiline match if possible */
4244
4245 else if (startline)
4246 {
4247 if (start_match > match_block.start_subject)
4248 {
4249 while (start_match < end_subject && start_match[-1] != '\n')
4250 start_match++;
4251 }
4252 }
4253
4254 /* Or to a non-unique first char */
4255
4256 else if (start_bits != NULL)
4257 {
4258 while (start_match < end_subject)
4259 {
4260 register int c = *start_match;
4261 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4262 }
4263 }
4264
4265 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4266 printf(">>>> Match against: ");
4267 pchars(start_match, end_subject - start_match, TRUE, &match_block);
4268 printf("\n");
4269 #endif
4270
4271 /* When a match occurs, substrings will be set for all internal extractions;
4272 we just need to set up the whole thing as substring 0 before returning. If
4273 there were too many extractions, set the return code to zero. In the case
4274 where we had to get some local store to hold offsets for backreferences, copy
4275 those back references that we can. In this case there need not be overflow
4276 if certain parts of the pattern were not used. */
4277
4278 if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))
4279 continue;
4280
4281 /* Copy the offset information from temporary store if necessary */
4282
4283 if (using_temporary_offsets)
4284 {
4285 if (offsetcount >= 4)
4286 {
4287 memcpy(offsets + 2, match_block.offset_vector + 2,
4288 (offsetcount - 2) * sizeof(int));
4289 DPRINTF(("Copied offsets from temporary memory\n"));
4290 }
4291 if (match_block.end_offset_top > offsetcount)
4292 match_block.offset_overflow = TRUE;
4293
4294 DPRINTF(("Freeing temporary memory\n"));
4295 (pcre_free)(match_block.offset_vector);
4296 }
4297
4298 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4299
4300 if (match_block.offset_end < 2) rc = 0; else
4301 {
4302 offsets[0] = start_match - match_block.start_subject;
4303 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4304 }
4305
4306 DPRINTF((">>>> returning %d\n", rc));
4307 return rc;
4308 }
4309
4310 /* This "while" is the end of the "do" above */
4311
4312 while (!anchored &&
4313 match_block.errorcode == PCRE_ERROR_NOMATCH &&
4314 start_match++ < end_subject);
4315
4316 if (using_temporary_offsets)
4317 {
4318 DPRINTF(("Freeing temporary memory\n"));
4319 (pcre_free)(match_block.offset_vector);
4320 }
4321
4322 DPRINTF((">>>> returning %d\n", match_block.errorcode));
4323
4324 return match_block.errorcode;
4325 }
4326
4327 /* End of pcre.c */

  ViewVC Help
Powered by ViewVC 1.1.5