/[pcre]/code/trunk/pcre.c
ViewVC logotype

Contents of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 13 - (show annotations)
Sat Feb 24 21:38:21 2007 UTC (12 years, 5 months ago) by nigel
File MIME type: text/plain
File size: 103422 byte(s)
Load pcre-1.05 into code/trunk.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1997 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28 -----------------------------------------------------------------------------
29 */
30
31
32 /* Define DEBUG to get debugging output on stdout. */
33
34 /* #define DEBUG */
35
36 /* Use a macro for debugging printing, 'cause that eliminates the the use
37 of #ifdef inline, and there are *still* stupid compilers about that don't like
38 indented pre-processor statements. I suppose it's only been 10 years... */
39
40 #ifdef DEBUG
41 #define DPRINTF(p) printf p
42 #else
43 #define DPRINTF(p) /*nothing*/
44 #endif
45
46 /* Include the internals header, which itself includes Standard C headers plus
47 the external pcre header. */
48
49 #include "internal.h"
50
51
52 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
53
54 static char rep_min[] = { 0, 0, 1, 1, 0, 0 };
55 static char rep_max[] = { 0, 0, 0, 0, 1, 1 };
56
57 /* Text forms of OP_ values and things, for debugging (not all used) */
58
59 #ifdef DEBUG
60 static const char *OP_names[] = {
61 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
62 "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z", "^", "$", "Any", "chars",
63 "not",
64 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
65 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
66 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
67 "*", "*?", "+", "+?", "?", "??", "{", "{",
68 "class", "negclass", "Ref",
69 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
70 "Brazero", "Braminzero", "Bra"
71 };
72 #endif
73
74 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
75 are simple data values; negative values are for special things like \d and so
76 on. Zero means further processing is needed (for things like \x), or the escape
77 is invalid. */
78
79 static short int escapes[] = {
80 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
81 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
82 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
83 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
84 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
85 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
86 '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
87 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
88 0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
89 0, 0, 0 /* x - z */
90 };
91
92 /* Definition to allow mutual recursion */
93
94 static BOOL
95 compile_regex(int, int *, uschar **, const uschar **, const char **);
96
97 /* Structure for passing "static" information around between the functions
98 doing the matching, so that they are thread-safe. */
99
100 typedef struct match_data {
101 int errorcode; /* As it says */
102 int *offset_vector; /* Offset vector */
103 int offset_end; /* One past the end */
104 BOOL offset_overflow; /* Set if too many extractions */
105 BOOL caseless; /* Case-independent flag */
106 BOOL runtime_caseless; /* Caseless forced at run time */
107 BOOL multiline; /* Multiline flag */
108 BOOL notbol; /* NOTBOL flag */
109 BOOL noteol; /* NOTEOL flag */
110 BOOL dotall; /* Dot matches any char */
111 BOOL endonly; /* Dollar not before final \n */
112 const uschar *start_subject; /* Start of the subject string */
113 const uschar *end_subject; /* End of the subject string */
114 jmp_buf fail_env; /* Environment for longjump() break out */
115 const uschar *end_match_ptr; /* Subject position at end match */
116 int end_offset_top; /* Highwater mark at end of match */
117 } match_data;
118
119
120
121 /*************************************************
122 * Global variables *
123 *************************************************/
124
125 /* PCRE is thread-clean and doesn't use any global variables in the normal
126 sense. However, it calls memory allocation and free functions via the two
127 indirections below, which are can be changed by the caller, but are shared
128 between all threads. */
129
130 void *(*pcre_malloc)(size_t) = malloc;
131 void (*pcre_free)(void *) = free;
132
133
134
135
136 /*************************************************
137 * Return version string *
138 *************************************************/
139
140 const char *
141 pcre_version(void)
142 {
143 return PCRE_VERSION;
144 }
145
146
147
148
149 /*************************************************
150 * Return info about a compiled pattern *
151 *************************************************/
152
153 /* This function picks potentially useful data out of the private
154 structure.
155
156 Arguments:
157 external_re points to compiled code
158 optptr where to pass back the options
159 first_char where to pass back the first character,
160 or -1 if multiline and all branches start ^,
161 or -2 otherwise
162
163 Returns: number of identifying extraction brackets
164 or negative values on error
165 */
166
167 int
168 pcre_info(const pcre *external_re, int *optptr, int *first_char)
169 {
170 const real_pcre *re = (const real_pcre *)external_re;
171 if (re == NULL) return PCRE_ERROR_NULL;
172 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
173 if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
174 if (first_char != NULL)
175 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
176 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
177 return re->top_bracket;
178 }
179
180
181
182
183 #ifdef DEBUG
184 /*************************************************
185 * Debugging function to print chars *
186 *************************************************/
187
188 /* Print a sequence of chars in printable format, stopping at the end of the
189 subject if the requested.
190
191 Arguments:
192 p points to characters
193 length number to print
194 is_subject TRUE if printing from within md->start_subject
195 md pointer to matching data block, if is_subject is TRUE
196
197 Returns: nothing
198 */
199
200 static void
201 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
202 {
203 int c;
204 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
205 while (length-- > 0)
206 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
207 }
208 #endif
209
210
211
212
213 /*************************************************
214 * Check subpattern for empty operand *
215 *************************************************/
216
217 /* This function checks a bracketed subpattern to see if any of the paths
218 through it could match an empty string. This is used to diagnose an error if
219 such a subpattern is followed by a quantifier with an unlimited upper bound.
220
221 Argument:
222 code points to the opening bracket
223
224 Returns: TRUE or FALSE
225 */
226
227 static BOOL
228 could_be_empty(uschar *code)
229 {
230 do {
231 uschar *cc = code + 3;
232
233 /* Scan along the opcodes for this branch; as soon as we find something
234 that matches a non-empty string, break out and advance to test the next
235 branch. If we get to the end of the branch, return TRUE for the whole
236 sub-expression. */
237
238 for (;;)
239 {
240 /* Test an embedded subpattern; if it could not be empty, break the
241 loop. Otherwise carry on in the branch. */
242
243 if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
244 {
245 if (!could_be_empty(cc)) break;
246 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
247 cc += 3;
248 }
249
250 else switch (*cc)
251 {
252 /* Reached end of a branch: the subpattern may match the empty string */
253
254 case OP_ALT:
255 case OP_KET:
256 case OP_KETRMAX:
257 case OP_KETRMIN:
258 return TRUE;
259
260 /* Skip over assertive subpatterns */
261
262 case OP_ASSERT:
263 case OP_ASSERT_NOT:
264 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
265 cc += 3;
266 break;
267
268 /* Skip over things that don't match chars */
269
270 case OP_SOD:
271 case OP_EOD:
272 case OP_CIRC:
273 case OP_DOLL:
274 case OP_BRAZERO:
275 case OP_BRAMINZERO:
276 case OP_NOT_WORD_BOUNDARY:
277 case OP_WORD_BOUNDARY:
278 cc++;
279 break;
280
281 /* Skip over simple repeats with zero lower bound */
282
283 case OP_STAR:
284 case OP_MINSTAR:
285 case OP_QUERY:
286 case OP_MINQUERY:
287 case OP_NOTSTAR:
288 case OP_NOTMINSTAR:
289 case OP_NOTQUERY:
290 case OP_NOTMINQUERY:
291 case OP_TYPESTAR:
292 case OP_TYPEMINSTAR:
293 case OP_TYPEQUERY:
294 case OP_TYPEMINQUERY:
295 cc += 2;
296 break;
297
298 /* Skip over UPTOs (lower bound is zero) */
299
300 case OP_UPTO:
301 case OP_MINUPTO:
302 case OP_TYPEUPTO:
303 case OP_TYPEMINUPTO:
304 cc += 4;
305 break;
306
307 /* Check a class or a back reference for a zero minimum */
308
309 case OP_CLASS:
310 case OP_NEGCLASS:
311 case OP_REF:
312 cc += (*cc == OP_REF)? 2 : 33;
313
314 switch (*cc)
315 {
316 case OP_CRSTAR:
317 case OP_CRMINSTAR:
318 case OP_CRQUERY:
319 case OP_CRMINQUERY:
320 cc++;
321 break;
322
323 case OP_CRRANGE:
324 case OP_CRMINRANGE:
325 if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
326 cc += 3;
327 break;
328
329 default:
330 goto NEXT_BRANCH;
331 }
332 break;
333
334 /* Anything else matches at least one character */
335
336 default:
337 goto NEXT_BRANCH;
338 }
339 }
340
341 NEXT_BRANCH:
342 code += (code[1] << 8) + code[2];
343 }
344 while (*code == OP_ALT);
345
346 /* No branches match the empty string */
347
348 return FALSE;
349 }
350
351
352
353 /*************************************************
354 * Handle escapes *
355 *************************************************/
356
357 /* This function is called when a \ has been encountered. It either returns a
358 positive value for a simple escape such as \n, or a negative value which
359 encodes one of the more complicated things such as \d. On entry, ptr is
360 pointing at the \. On exit, it is on the final character of the escape
361 sequence.
362
363 Arguments:
364 ptrptr points to the pattern position pointer
365 errorptr points to the pointer to the error message
366 bracount number of previous extracting brackets
367 options the options bits
368 isclass TRUE if inside a character class
369
370 Returns: zero or positive => a data character
371 negative => a special escape sequence
372 on error, errorptr is set
373 */
374
375 static int
376 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
377 int options, BOOL isclass)
378 {
379 const uschar *ptr = *ptrptr;
380 int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
381 int i;
382
383 if (c == 0) *errorptr = ERR1;
384
385 /* Digits or letters may have special meaning; all others are literals. */
386
387 else if (c < '0' || c > 'z') {}
388
389 /* Do an initial lookup in a table. A non-zero result is something that can be
390 returned immediately. Otherwise further processing may be required. */
391
392 else if ((i = escapes[c - '0']) != 0) c = i;
393
394 /* Escapes that need further processing, or are illegal. */
395
396 else
397 {
398 const uschar *oldptr;
399 switch (c)
400 {
401 /* The handling of escape sequences consisting of a string of digits
402 starting with one that is not zero is not straightforward. By experiment,
403 the way Perl works seems to be as follows:
404
405 Outside a character class, the digits are read as a decimal number. If the
406 number is less than 10, or if there are that many previous extracting
407 left brackets, then it is a back reference. Otherwise, up to three octal
408 digits are read to form an escaped byte. Thus \123 is likely to be octal
409 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
410 value is greater than 377, the least significant 8 bits are taken. Inside a
411 character class, \ followed by a digit is always an octal number. */
412
413 case '1': case '2': case '3': case '4': case '5':
414 case '6': case '7': case '8': case '9':
415
416 if (!isclass)
417 {
418 oldptr = ptr;
419 c -= '0';
420 while ((pcre_ctypes[ptr[1]] & ctype_digit) != 0)
421 c = c * 10 + *(++ptr) - '0';
422 if (c < 10 || c <= bracount)
423 {
424 c = -(ESC_REF + c);
425 break;
426 }
427 ptr = oldptr; /* Put the pointer back and fall through */
428 }
429
430 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
431 generates a binary zero byte and treats the digit as a following literal.
432 Thus we have to pull back the pointer by one. */
433
434 if ((c = *ptr) >= '8')
435 {
436 ptr--;
437 c = 0;
438 break;
439 }
440
441 /* \0 always starts an octal number, but we may drop through to here with a
442 larger first octal digit */
443
444 case '0':
445 c -= '0';
446 while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
447 ptr[1] != '8' && ptr[1] != '9')
448 c = c * 8 + *(++ptr) - '0';
449 break;
450
451 /* Special escapes not starting with a digit are straightforward */
452
453 case 'x':
454 c = 0;
455 while (i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
456 {
457 ptr++;
458 c = c * 16 + pcre_lcc[*ptr] -
459 (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
460 }
461 break;
462
463 case 'c':
464 c = *(++ptr);
465 if (c == 0)
466 {
467 *errorptr = ERR2;
468 return 0;
469 }
470
471 /* A letter is upper-cased; then the 0x40 bit is flipped */
472
473 if (c >= 'a' && c <= 'z') c = pcre_fcc[c];
474 c ^= 0x40;
475 break;
476
477 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
478 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
479 for Perl compatibility, it is a literal. */
480
481 default:
482 if ((options & PCRE_EXTRA) != 0) switch(c)
483 {
484 case 'X':
485 c = -ESC_X; /* This could be a lookup if it ever got into Perl */
486 break;
487
488 default:
489 *errorptr = ERR3;
490 break;
491 }
492 break;
493 }
494 }
495
496 *ptrptr = ptr;
497 return c;
498 }
499
500
501
502 /*************************************************
503 * Check for counted repeat *
504 *************************************************/
505
506 /* This function is called when a '{' is encountered in a place where it might
507 start a quantifier. It looks ahead to see if it really is a quantifier or not.
508 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
509 where the ddds are digits.
510
511 Arguments:
512 p pointer to the first char after '{'
513
514 Returns: TRUE or FALSE
515 */
516
517 static BOOL
518 is_counted_repeat(const uschar *p)
519 {
520 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
521 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
522 if (*p == '}') return TRUE;
523
524 if (*p++ != ',') return FALSE;
525 if (*p == '}') return TRUE;
526
527 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
528 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
529 return (*p == '}');
530 }
531
532
533
534 /*************************************************
535 * Read repeat counts *
536 *************************************************/
537
538 /* Read an item of the form {n,m} and return the values. This is called only
539 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
540 so the syntax is guaranteed to be correct, but we need to check the values.
541
542 Arguments:
543 p pointer to first char after '{'
544 minp pointer to int for min
545 maxp pointer to int for max
546 returned as -1 if no max
547 errorptr points to pointer to error message
548
549 Returns: pointer to '}' on success;
550 current ptr on error, with errorptr set
551 */
552
553 static const uschar *
554 read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
555 {
556 int min = 0;
557 int max = -1;
558
559 while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
560
561 if (*p == '}') max = min; else
562 {
563 if (*(++p) != '}')
564 {
565 max = 0;
566 while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
567 if (max < min)
568 {
569 *errorptr = ERR4;
570 return p;
571 }
572 }
573 }
574
575 /* Do paranoid checks, then fill in the required variables, and pass back the
576 pointer to the terminating '}'. */
577
578 if (min > 65535 || max > 65535)
579 *errorptr = ERR5;
580 else
581 {
582 *minp = min;
583 *maxp = max;
584 }
585 return p;
586 }
587
588
589
590 /*************************************************
591 * Compile one branch *
592 *************************************************/
593
594 /* Scan the pattern, compiling it into the code vector.
595
596 Arguments:
597 options the option bits
598 bracket points to number of brackets used
599 code points to the pointer to the current code point
600 ptrptr points to the current pattern pointer
601 errorptr points to pointer to error message
602
603 Returns: TRUE on success
604 FALSE, with *errorptr set on error
605 */
606
607 static BOOL
608 compile_branch(int options, int *brackets, uschar **codeptr,
609 const uschar **ptrptr, const char **errorptr)
610 {
611 int repeat_type, op_type;
612 int repeat_min, repeat_max;
613 int bravalue, length;
614 register int c;
615 register uschar *code = *codeptr;
616 const uschar *ptr = *ptrptr;
617 const uschar *oldptr;
618 uschar *previous = NULL;
619 uschar class[32];
620
621 /* Switch on next character until the end of the branch */
622
623 for (;; ptr++)
624 {
625 BOOL negate_class;
626 int class_charcount;
627 int class_lastchar;
628
629 c = *ptr;
630 if ((options & PCRE_EXTENDED) != 0)
631 {
632 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
633 if (c == '#')
634 {
635 while ((c = *(++ptr)) != 0 && c != '\n');
636 continue;
637 }
638 }
639
640 switch(c)
641 {
642 /* The branch terminates at end of string, |, or ). */
643
644 case 0:
645 case '|':
646 case ')':
647 *codeptr = code;
648 *ptrptr = ptr;
649 return TRUE;
650
651 /* Handle single-character metacharacters */
652
653 case '^':
654 previous = NULL;
655 *code++ = OP_CIRC;
656 break;
657
658 case '$':
659 previous = NULL;
660 *code++ = OP_DOLL;
661 break;
662
663 case '.':
664 previous = code;
665 *code++ = OP_ANY;
666 break;
667
668 /* Character classes. These always build a 32-byte bitmap of the permitted
669 characters, except in the special case where there is only one character.
670 For negated classes, we build the map as usual, then invert it at the end.
671 */
672
673 case '[':
674 previous = code;
675
676 /* If the first character is '^', set the negation flag, and use a
677 different opcode. This only matters if caseless matching is specified at
678 runtime. */
679
680 if ((c = *(++ptr)) == '^')
681 {
682 negate_class = TRUE;
683 *code++ = OP_NEGCLASS;
684 c = *(++ptr);
685 }
686 else
687 {
688 negate_class = FALSE;
689 *code++ = OP_CLASS;
690 }
691
692 /* Keep a count of chars so that we can optimize the case of just a single
693 character. */
694
695 class_charcount = 0;
696 class_lastchar = -1;
697
698 /* Initialize the 32-char bit map to all zeros. We have to build the
699 map in a temporary bit of store, in case the class contains only 1
700 character, because in that case the compiled code doesn't use the
701 bit map. */
702
703 memset(class, 0, 32 * sizeof(uschar));
704
705 /* Process characters until ] is reached. By writing this as a "do" it
706 means that an initial ] is taken as a data character. */
707
708 do
709 {
710 if (c == 0)
711 {
712 *errorptr = ERR6;
713 goto FAILED;
714 }
715
716 /* Backslash may introduce a single character, or it may introduce one
717 of the specials, which just set a flag. Escaped items are checked for
718 validity in the pre-compiling pass. The sequence \b is a special case.
719 Inside a class (and only there) it is treated as backspace. Elsewhere
720 it marks a word boundary. Other escapes have preset maps ready to
721 or into the one we are building. We assume they have more than one
722 character in them, so set class_count bigger than one. */
723
724 if (c == '\\')
725 {
726 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
727 if (-c == ESC_b) c = '\b';
728 else if (c < 0)
729 {
730 class_charcount = 10;
731 switch (-c)
732 {
733 case ESC_d:
734 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
735 continue;
736
737 case ESC_D:
738 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
739 continue;
740
741 case ESC_w:
742 for (c = 0; c < 32; c++)
743 class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
744 continue;
745
746 case ESC_W:
747 for (c = 0; c < 32; c++)
748 class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
749 continue;
750
751 case ESC_s:
752 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
753 continue;
754
755 case ESC_S:
756 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
757 continue;
758
759 default:
760 *errorptr = ERR7;
761 goto FAILED;
762 }
763 }
764 /* Fall through if single character */
765 }
766
767 /* A single character may be followed by '-' to form a range. However,
768 Perl does not permit ']' to be the end of the range. A '-' character
769 here is treated as a literal. */
770
771 if (ptr[1] == '-' && ptr[2] != ']')
772 {
773 int d;
774 ptr += 2;
775 d = *ptr;
776
777 if (d == 0)
778 {
779 *errorptr = ERR6;
780 goto FAILED;
781 }
782
783 /* The second part of a range can be a single-character escape, but
784 not any of the other escapes. */
785
786 if (d == '\\')
787 {
788 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
789 if (d < 0)
790 {
791 if (d == -ESC_b) d = '\b'; else
792 {
793 *errorptr = ERR7;
794 goto FAILED;
795 }
796 }
797 }
798
799 if (d < c)
800 {
801 *errorptr = ERR8;
802 goto FAILED;
803 }
804
805 for (; c <= d; c++)
806 {
807 class[c/8] |= (1 << (c&7));
808 if ((options & PCRE_CASELESS) != 0)
809 {
810 int uc = pcre_fcc[c]; /* flip case */
811 class[uc/8] |= (1 << (uc&7));
812 }
813 class_charcount++; /* in case a one-char range */
814 class_lastchar = c;
815 }
816 continue; /* Go get the next char in the class */
817 }
818
819 /* Handle a lone single character - we can get here for a normal
820 non-escape char, or after \ that introduces a single character. */
821
822 class [c/8] |= (1 << (c&7));
823 if ((options & PCRE_CASELESS) != 0)
824 {
825 c = pcre_fcc[c]; /* flip case */
826 class[c/8] |= (1 << (c&7));
827 }
828 class_charcount++;
829 class_lastchar = c;
830 }
831
832 /* Loop until ']' reached; the check for end of string happens inside the
833 loop. This "while" is the end of the "do" above. */
834
835 while ((c = *(++ptr)) != ']');
836
837 /* If class_charcount is 1 and class_lastchar is not negative, we saw
838 precisely one character. This doesn't need the whole 32-byte bit map.
839 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
840 it's negative. */
841
842 if (class_charcount == 1 && class_lastchar >= 0)
843 {
844 if (negate_class)
845 {
846 code[-1] = OP_NOT;
847 }
848 else
849 {
850 code[-1] = OP_CHARS;
851 *code++ = 1;
852 }
853 *code++ = class_lastchar;
854 }
855
856 /* Otherwise, negate the 32-byte map if necessary, and copy it into
857 the code vector. */
858
859 else
860 {
861 if (negate_class)
862 for (c = 0; c < 32; c++) code[c] = ~class[c];
863 else
864 memcpy(code, class, 32);
865 code += 32;
866 }
867 break;
868
869 /* Various kinds of repeat */
870
871 case '{':
872 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
873 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
874 if (*errorptr != NULL) goto FAILED;
875 goto REPEAT;
876
877 case '*':
878 repeat_min = 0;
879 repeat_max = -1;
880 goto REPEAT;
881
882 case '+':
883 repeat_min = 1;
884 repeat_max = -1;
885 goto REPEAT;
886
887 case '?':
888 repeat_min = 0;
889 repeat_max = 1;
890
891 REPEAT:
892 if (previous == NULL)
893 {
894 *errorptr = ERR9;
895 goto FAILED;
896 }
897
898 /* If the next character is '?' this is a minimizing repeat. Advance to the
899 next character. */
900
901 if (ptr[1] == '?') { repeat_type = 1; ptr++; } else repeat_type = 0;
902
903 /* If the maximum is zero then the minimum must also be zero; Perl allows
904 this case, so we do too - by simply omitting the item altogether. */
905
906 if (repeat_max == 0) code = previous;
907
908 /* If previous was a string of characters, chop off the last one and use it
909 as the subject of the repeat. If there was only one character, we can
910 abolish the previous item altogether. */
911
912 else if (*previous == OP_CHARS)
913 {
914 int len = previous[1];
915 if (len == 1)
916 {
917 c = previous[2];
918 code = previous;
919 }
920 else
921 {
922 c = previous[len+1];
923 previous[1]--;
924 code--;
925 }
926 op_type = 0; /* Use single-char op codes */
927 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
928 }
929
930 /* If previous was a single negated character ([^a] or similar), we use
931 one of the special opcodes, replacing it. The code is shared with single-
932 character repeats by adding a suitable offset into repeat_type. */
933
934 else if ((int)*previous == OP_NOT)
935 {
936 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
937 c = previous[1];
938 code = previous;
939 goto OUTPUT_SINGLE_REPEAT;
940 }
941
942 /* If previous was a character type match (\d or similar), abolish it and
943 create a suitable repeat item. The code is shared with single-character
944 repeats by adding a suitable offset into repeat_type. */
945
946 else if ((int)*previous < OP_EOD || *previous == OP_ANY)
947 {
948 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
949 c = *previous;
950 code = previous;
951
952 OUTPUT_SINGLE_REPEAT:
953 repeat_type += op_type; /* Combine both values for many cases */
954
955 /* A minimum of zero is handled either as the special case * or ?, or as
956 an UPTO, with the maximum given. */
957
958 if (repeat_min == 0)
959 {
960 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
961 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
962 else
963 {
964 *code++ = OP_UPTO + repeat_type;
965 *code++ = repeat_max >> 8;
966 *code++ = (repeat_max & 255);
967 }
968 }
969
970 /* The case {1,} is handled as the special case + */
971
972 else if (repeat_min == 1 && repeat_max == -1)
973 *code++ = OP_PLUS + repeat_type;
974
975 /* The case {n,n} is just an EXACT, while the general case {n,m} is
976 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
977
978 else
979 {
980 if (repeat_min != 1)
981 {
982 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
983 *code++ = repeat_min >> 8;
984 *code++ = (repeat_min & 255);
985 }
986
987 /* If the mininum is 1 and the previous item was a character string,
988 we either have to put back the item that got cancelled if the string
989 length was 1, or add the character back onto the end of a longer
990 string. For a character type nothing need be done; it will just get put
991 back naturally. */
992
993 else if (*previous == OP_CHARS)
994 {
995 if (code == previous) code += 2; else previous[1]++;
996 }
997
998 /* If the maximum is unlimited, insert an OP_STAR. */
999
1000 if (repeat_max < 0)
1001 {
1002 *code++ = c;
1003 *code++ = OP_STAR + repeat_type;
1004 }
1005
1006 /* Else insert an UPTO if the max is greater than the min. */
1007
1008 else if (repeat_max != repeat_min)
1009 {
1010 *code++ = c;
1011 repeat_max -= repeat_min;
1012 *code++ = OP_UPTO + repeat_type;
1013 *code++ = repeat_max >> 8;
1014 *code++ = (repeat_max & 255);
1015 }
1016 }
1017
1018 /* The character or character type itself comes last in all cases. */
1019
1020 *code++ = c;
1021 }
1022
1023 /* If previous was a character class or a back reference, we put the repeat
1024 stuff after it. */
1025
1026 else if (*previous == OP_CLASS || *previous == OP_NEGCLASS ||
1027 *previous == OP_REF)
1028 {
1029 if (repeat_min == 0 && repeat_max == -1)
1030 *code++ = OP_CRSTAR + repeat_type;
1031 else if (repeat_min == 1 && repeat_max == -1)
1032 *code++ = OP_CRPLUS + repeat_type;
1033 else if (repeat_min == 0 && repeat_max == 1)
1034 *code++ = OP_CRQUERY + repeat_type;
1035 else
1036 {
1037 *code++ = OP_CRRANGE + repeat_type;
1038 *code++ = repeat_min >> 8;
1039 *code++ = repeat_min & 255;
1040 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1041 *code++ = repeat_max >> 8;
1042 *code++ = repeat_max & 255;
1043 }
1044 }
1045
1046 /* If previous was a bracket group, we may have to replicate it in certain
1047 cases. If the maximum repeat count is unlimited, check that the bracket
1048 group cannot match the empty string, and diagnose an error if it can. */
1049
1050 else if ((int)*previous >= OP_BRA)
1051 {
1052 int i;
1053 int len = code - previous;
1054
1055 if (repeat_max == -1 && could_be_empty(previous))
1056 {
1057 *errorptr = ERR10;
1058 goto FAILED;
1059 }
1060
1061 /* If the minimum is greater than zero, and the maximum is unlimited or
1062 equal to the minimum, the first copy remains where it is, and is
1063 replicated up to the minimum number of times. This case includes the +
1064 repeat, but of course no replication is needed in that case. */
1065
1066 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1067 {
1068 for (i = 1; i < repeat_min; i++)
1069 {
1070 memcpy(code, previous, len);
1071 code += len;
1072 }
1073 }
1074
1075 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1076 Then, if there is a fixed upper limit, replicated up to that many times,
1077 sticking BRAZERO in front of all the optional ones. */
1078
1079 else
1080 {
1081 if (repeat_min == 0)
1082 {
1083 memmove(previous+1, previous, len);
1084 code++;
1085 *previous++ = OP_BRAZERO + repeat_type;
1086 }
1087
1088 for (i = 1; i < repeat_min; i++)
1089 {
1090 memcpy(code, previous, len);
1091 code += len;
1092 }
1093
1094 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1095 {
1096 *code++ = OP_BRAZERO + repeat_type;
1097 memcpy(code, previous, len);
1098 code += len;
1099 }
1100 }
1101
1102 /* If the maximum is unlimited, set a repeater in the final copy. */
1103
1104 if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1105 }
1106
1107 /* Else there's some kind of shambles */
1108
1109 else
1110 {
1111 *errorptr = ERR11;
1112 goto FAILED;
1113 }
1114
1115 /* In all case we no longer have a previous item. */
1116
1117 previous = NULL;
1118 break;
1119
1120
1121 /* Start of nested bracket sub-expression, or comment or lookahead.
1122 First deal with special things that can come after a bracket; all are
1123 introduced by ?, and the appearance of any of them means that this is not a
1124 referencing group. They were checked for validity in the first pass over
1125 the string, so we don't have to check for syntax errors here. */
1126
1127 case '(':
1128 previous = code; /* Only real brackets can be repeated */
1129 if (*(++ptr) == '?')
1130 {
1131 bravalue = OP_BRA;
1132
1133 switch (*(++ptr))
1134 {
1135 case '#':
1136 case 'i':
1137 case 'm':
1138 case 's':
1139 case 'x':
1140 ptr++;
1141 while (*ptr != ')') ptr++;
1142 previous = NULL;
1143 continue;
1144
1145 case ':': /* Non-extracting bracket */
1146 ptr++;
1147 break;
1148
1149 case '=': /* Assertions can't be repeated */
1150 bravalue = OP_ASSERT;
1151 ptr++;
1152 previous = NULL;
1153 break;
1154
1155 case '!':
1156 bravalue = OP_ASSERT_NOT;
1157 ptr++;
1158 previous = NULL;
1159 break;
1160
1161 case '>': /* "Match once" brackets */
1162 if ((options & PCRE_EXTRA) != 0) /* Not yet standard */
1163 {
1164 bravalue = OP_ONCE;
1165 ptr++;
1166 previous = NULL;
1167 break;
1168 }
1169 /* Else fall through */
1170
1171 default:
1172 *errorptr = ERR12;
1173 goto FAILED;
1174 }
1175 }
1176
1177 /* Else we have a referencing group */
1178
1179 else
1180 {
1181 if (++(*brackets) > EXTRACT_MAX)
1182 {
1183 *errorptr = ERR13;
1184 goto FAILED;
1185 }
1186 bravalue = OP_BRA + *brackets;
1187 }
1188
1189 /* Process nested bracketed re; at end pointer is on the bracket. We copy
1190 code into a non-register variable in order to be able to pass its address
1191 because some compilers complain otherwise. */
1192
1193 *code = bravalue;
1194 {
1195 uschar *mcode = code;
1196 if (!compile_regex(options, brackets, &mcode, &ptr, errorptr))
1197 goto FAILED;
1198 code = mcode;
1199 }
1200
1201 if (*ptr != ')')
1202 {
1203 *errorptr = ERR14;
1204 goto FAILED;
1205 }
1206 break;
1207
1208 /* Check \ for being a real metacharacter; if not, fall through and handle
1209 it as a data character at the start of a string. Escape items are checked
1210 for validity in the pre-compiling pass. */
1211
1212 case '\\':
1213 oldptr = ptr;
1214 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1215
1216 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1217 are arranged to be the negation of the corresponding OP_values. For the
1218 back references, the values are ESC_REF plus the reference number. Only
1219 back references and those types that consume a character may be repeated.
1220 We can test for values between ESC_b and ESC_Z for the latter; this may
1221 have to change if any new ones are ever created. */
1222
1223 if (c < 0)
1224 {
1225 if (-c >= ESC_REF)
1226 {
1227 int refnum = -c - ESC_REF;
1228 if (*brackets < refnum)
1229 {
1230 *errorptr = ERR15;
1231 goto FAILED;
1232 }
1233 previous = code;
1234 *code++ = OP_REF;
1235 *code++ = refnum;
1236 }
1237 else
1238 {
1239 previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1240 *code++ = -c;
1241 }
1242 continue;
1243 }
1244
1245 /* Data character: reset and fall through */
1246
1247 ptr = oldptr;
1248 c = '\\';
1249
1250 /* Handle a run of data characters until a metacharacter is encountered.
1251 The first character is guaranteed not to be whitespace or # when the
1252 extended flag is set. */
1253
1254 NORMAL_CHAR:
1255 default:
1256 previous = code;
1257 *code = OP_CHARS;
1258 code += 2;
1259 length = 0;
1260
1261 do
1262 {
1263 if ((options & PCRE_EXTENDED) != 0)
1264 {
1265 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1266 if (c == '#')
1267 {
1268 while ((c = *(++ptr)) != 0 && c != '\n');
1269 if (c == 0) break;
1270 continue;
1271 }
1272 }
1273
1274 /* Backslash may introduce a data char or a metacharacter. Escaped items
1275 are checked for validity in the pre-compiling pass. Stop the string
1276 before a metaitem. */
1277
1278 if (c == '\\')
1279 {
1280 oldptr = ptr;
1281 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1282 if (c < 0) { ptr = oldptr; break; }
1283 }
1284
1285 /* Ordinary character or single-char escape */
1286
1287 *code++ = c;
1288 length++;
1289 }
1290
1291 /* This "while" is the end of the "do" above. */
1292
1293 while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1294
1295 /* Compute the length and set it in the data vector, and advance to
1296 the next state. */
1297
1298 previous[1] = length;
1299 ptr--;
1300 break;
1301 }
1302 } /* end of big loop */
1303
1304 /* Control never reaches here by falling through, only by a goto for all the
1305 error states. Pass back the position in the pattern so that it can be displayed
1306 to the user for diagnosing the error. */
1307
1308 FAILED:
1309 *ptrptr = ptr;
1310 return FALSE;
1311 }
1312
1313
1314
1315
1316 /*************************************************
1317 * Compile sequence of alternatives *
1318 *************************************************/
1319
1320 /* On entry, ptr is pointing past the bracket character, but on return
1321 it points to the closing bracket, or vertical bar, or end of string.
1322 The code variable is pointing at the byte into which the BRA operator has been
1323 stored.
1324
1325 Argument:
1326 options the option bits
1327 brackets -> int containing the number of extracting brackets used
1328 codeptr -> the address of the current code pointer
1329 ptrptr -> the address of the current pattern pointer
1330 errorptr -> pointer to error message
1331
1332 Returns: TRUE on success
1333 */
1334
1335 static BOOL
1336 compile_regex(int options, int *brackets, uschar **codeptr,
1337 const uschar **ptrptr, const char **errorptr)
1338 {
1339 const uschar *ptr = *ptrptr;
1340 uschar *code = *codeptr;
1341 uschar *start_bracket = code;
1342
1343 for (;;)
1344 {
1345 int length;
1346 uschar *last_branch = code;
1347
1348 code += 3;
1349 if (!compile_branch(options, brackets, &code, &ptr, errorptr))
1350 {
1351 *ptrptr = ptr;
1352 return FALSE;
1353 }
1354
1355 /* Fill in the length of the last branch */
1356
1357 length = code - last_branch;
1358 last_branch[1] = length >> 8;
1359 last_branch[2] = length & 255;
1360
1361 /* Reached end of expression, either ')' or end of pattern. Insert a
1362 terminating ket and the length of the whole bracketed item, and return,
1363 leaving the pointer at the terminating char. */
1364
1365 if (*ptr != '|')
1366 {
1367 length = code - start_bracket;
1368 *code++ = OP_KET;
1369 *code++ = length >> 8;
1370 *code++ = length & 255;
1371 *codeptr = code;
1372 *ptrptr = ptr;
1373 return TRUE;
1374 }
1375
1376 /* Another branch follows; insert an "or" node and advance the pointer. */
1377
1378 *code = OP_ALT;
1379 ptr++;
1380 }
1381 /* Control never reaches here */
1382 }
1383
1384
1385
1386 /*************************************************
1387 * Check for anchored expression *
1388 *************************************************/
1389
1390 /* Try to find out if this is an anchored regular expression. Consider each
1391 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
1392 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
1393 it's anchored. However, if this is a multiline pattern, then only OP_SOD
1394 counts, since OP_CIRC can match in the middle.
1395
1396 A branch is also implicitly anchored if it starts with .* because that will try
1397 the rest of the pattern at all possible matching points, so there is no point
1398 trying them again.
1399
1400 Argument: points to start of expression (the bracket)
1401 Returns: TRUE or FALSE
1402 */
1403
1404 static BOOL
1405 is_anchored(register const uschar *code, BOOL multiline)
1406 {
1407 do {
1408 int op = (int)code[3];
1409 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
1410 { if (!is_anchored(code+3, multiline)) return FALSE; }
1411 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1412 { if (code[4] != OP_ANY) return FALSE; }
1413 else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
1414 code += (code[1] << 8) + code[2];
1415 }
1416 while (*code == OP_ALT);
1417 return TRUE;
1418 }
1419
1420
1421
1422 /*************************************************
1423 * Check for start with \n line expression *
1424 *************************************************/
1425
1426 /* This is called for multiline expressions to try to find out if every branch
1427 starts with ^ so that "first char" processing can be done to speed things up.
1428
1429 Argument: points to start of expression (the bracket)
1430 Returns: TRUE or FALSE
1431 */
1432
1433 static BOOL
1434 is_startline(const uschar *code)
1435 {
1436 do {
1437 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
1438 { if (!is_startline(code+3)) return FALSE; }
1439 else if (code[3] != OP_CIRC) return FALSE;
1440 code += (code[1] << 8) + code[2];
1441 }
1442 while (*code == OP_ALT);
1443 return TRUE;
1444 }
1445
1446
1447
1448 /*************************************************
1449 * Check for fixed first char *
1450 *************************************************/
1451
1452 /* Try to find out if there is a fixed first character. This is called for
1453 unanchored expressions, as it speeds up their processing quite considerably.
1454 Consider each alternative branch. If they all start with the same char, or with
1455 a bracket all of whose alternatives start with the same char (recurse ad lib),
1456 then we return that char, otherwise -1.
1457
1458 Argument: points to start of expression (the bracket)
1459 Returns: -1 or the fixed first char
1460 */
1461
1462 static int
1463 find_firstchar(uschar *code)
1464 {
1465 register int c = -1;
1466 do
1467 {
1468 register int charoffset = 4;
1469
1470 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
1471 {
1472 register int d;
1473 if ((d = find_firstchar(code+3)) < 0) return -1;
1474 if (c < 0) c = d; else if (c != d) return -1;
1475 }
1476
1477 else switch(code[3])
1478 {
1479 default:
1480 return -1;
1481
1482 case OP_EXACT: /* Fall through */
1483 charoffset++;
1484
1485 case OP_CHARS: /* Fall through */
1486 charoffset++;
1487
1488 case OP_PLUS:
1489 case OP_MINPLUS:
1490 if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
1491 break;
1492 }
1493 code += (code[1] << 8) + code[2];
1494 }
1495 while (*code == OP_ALT);
1496 return c;
1497 }
1498
1499
1500
1501 /*************************************************
1502 * Compile a Regular Expression *
1503 *************************************************/
1504
1505 /* This function takes a string and returns a pointer to a block of store
1506 holding a compiled version of the expression.
1507
1508 Arguments:
1509 pattern the regular expression
1510 options various option bits
1511 errorptr pointer to pointer to error text
1512 erroroffset ptr offset in pattern where error was detected
1513
1514 Returns: pointer to compiled data block, or NULL on error,
1515 with errorptr and erroroffset set
1516 */
1517
1518 pcre *
1519 pcre_compile(const char *pattern, int options, const char **errorptr,
1520 int *erroroffset)
1521 {
1522 real_pcre *re;
1523 int spaces = 0;
1524 int length = 3; /* For initial BRA plus length */
1525 int runlength;
1526 int c, size;
1527 int bracount = 0;
1528 int brastack[200];
1529 int top_backref = 0;
1530 unsigned int brastackptr = 0;
1531 uschar *code;
1532 const uschar *ptr;
1533
1534 #ifdef DEBUG
1535 uschar *code_base, *code_end;
1536 #endif
1537
1538 /* We can't pass back an error message if errorptr is NULL; I guess the best we
1539 can do is just return NULL. */
1540
1541 if (errorptr == NULL) return NULL;
1542 *errorptr = NULL;
1543
1544 /* However, we can give a message for this error */
1545
1546 if (erroroffset == NULL)
1547 {
1548 *errorptr = ERR16;
1549 return NULL;
1550 }
1551 *erroroffset = 0;
1552
1553 if ((options & ~PUBLIC_OPTIONS) != 0)
1554 {
1555 *errorptr = ERR17;
1556 return NULL;
1557 }
1558
1559 DPRINTF(("------------------------------------------------------------------\n"));
1560 DPRINTF(("%s\n", pattern));
1561
1562 /* The first thing to do is to make a pass over the pattern to compute the
1563 amount of store required to hold the compiled code. This does not have to be
1564 perfect as long as errors are overestimates. At the same time we can detect any
1565 internal flag settings. Make an attempt to correct for any counted white space
1566 if an "extended" flag setting appears late in the pattern. We can't be so
1567 clever for #-comments. */
1568
1569 ptr = (const uschar *)(pattern - 1);
1570 while ((c = *(++ptr)) != 0)
1571 {
1572 int min, max;
1573 int class_charcount;
1574
1575 if ((pcre_ctypes[c] & ctype_space) != 0)
1576 {
1577 if ((options & PCRE_EXTENDED) != 0) continue;
1578 spaces++;
1579 }
1580
1581 if (c == '#' && (options & PCRE_EXTENDED) != 0)
1582 {
1583 while ((c = *(++ptr)) != 0 && c != '\n');
1584 continue;
1585 }
1586
1587 switch(c)
1588 {
1589 /* A backslashed item may be an escaped "normal" character or a
1590 character type. For a "normal" character, put the pointers and
1591 character back so that tests for whitespace etc. in the input
1592 are done correctly. */
1593
1594 case '\\':
1595 {
1596 const uschar *save_ptr = ptr;
1597 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
1598 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1599 if (c >= 0)
1600 {
1601 ptr = save_ptr;
1602 c = '\\';
1603 goto NORMAL_CHAR;
1604 }
1605 }
1606 length++;
1607
1608 /* A back reference needs an additional char, plus either one or 5
1609 bytes for a repeat. We also need to keep the value of the highest
1610 back reference. */
1611
1612 if (c <= -ESC_REF)
1613 {
1614 int refnum = -c - ESC_REF;
1615 if (refnum > top_backref) top_backref = refnum;
1616 length++; /* For single back reference */
1617 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
1618 {
1619 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1620 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1621 if ((min == 0 && (max == 1 || max == -1)) ||
1622 (min == 1 && max == -1))
1623 length++;
1624 else length += 5;
1625 if (ptr[1] == '?') ptr++;
1626 }
1627 }
1628 continue;
1629
1630 case '^':
1631 case '.':
1632 case '$':
1633 case '*': /* These repeats won't be after brackets; */
1634 case '+': /* those are handled separately */
1635 case '?':
1636 length++;
1637 continue;
1638
1639 /* This covers the cases of repeats after a single char, metachar, class,
1640 or back reference. */
1641
1642 case '{':
1643 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
1644 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
1645 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1646 if ((min == 0 && (max == 1 || max == -1)) ||
1647 (min == 1 && max == -1))
1648 length++;
1649 else
1650 {
1651 length--; /* Uncount the original char or metachar */
1652 if (min == 1) length++; else if (min > 0) length += 4;
1653 if (max > 0) length += 4; else length += 2;
1654 }
1655 if (ptr[1] == '?') ptr++;
1656 continue;
1657
1658 /* An alternation contains an offset to the next branch or ket. */
1659 case '|':
1660 length += 3;
1661 continue;
1662
1663 /* A character class uses 33 characters. Don't worry about character types
1664 that aren't allowed in classes - they'll get picked up during the compile.
1665 A character class that contains only one character uses 2 or 3 bytes,
1666 depending on whether it is negated or not. Notice this where we can. */
1667
1668 case '[':
1669 class_charcount = 0;
1670 if (*(++ptr) == '^') ptr++;
1671 do
1672 {
1673 if (*ptr == '\\')
1674 {
1675 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);
1676 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1677 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
1678 }
1679 else class_charcount++;
1680 ptr++;
1681 }
1682 while (*ptr != 0 && *ptr != ']');
1683
1684 /* Repeats for negated single chars are handled by the general code */
1685
1686 if (class_charcount == 1) length += 3; else
1687 {
1688 length += 33;
1689
1690 /* A repeat needs either 1 or 5 bytes. */
1691
1692 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
1693 {
1694 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1695 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1696 if ((min == 0 && (max == 1 || max == -1)) ||
1697 (min == 1 && max == -1))
1698 length++;
1699 else length += 5;
1700 if (ptr[1] == '?') ptr++;
1701 }
1702 }
1703 continue;
1704
1705 /* Brackets may be genuine groups or special things */
1706
1707 case '(':
1708
1709 /* Handle special forms of bracket, which all start (? */
1710
1711 if (ptr[1] == '?') switch (c = ptr[2])
1712 {
1713 /* Skip over comments entirely */
1714 case '#':
1715 ptr += 3;
1716 while (*ptr != 0 && *ptr != ')') ptr++;
1717 if (*ptr == 0)
1718 {
1719 *errorptr = ERR18;
1720 goto PCRE_ERROR_RETURN;
1721 }
1722 continue;
1723
1724 /* Non-referencing groups and lookaheads just move the pointer on, and
1725 then behave like a non-special bracket, except that they don't increment
1726 the count of extracting brackets. */
1727
1728 case ':':
1729 case '=':
1730 case '!':
1731 ptr += 2;
1732 break;
1733
1734 /* Ditto for the "once only" bracket, allowed only if the extra bit
1735 is set. */
1736
1737 case '>':
1738 if ((options & PCRE_EXTRA) != 0)
1739 {
1740 ptr += 2;
1741 break;
1742 }
1743 /* Else fall thourh */
1744
1745 /* Else loop setting valid options until ) is met. Anything else is an
1746 error. */
1747
1748 default:
1749 ptr += 2;
1750 for (;; ptr++)
1751 {
1752 if ((c = *ptr) == 'i')
1753 {
1754 options |= PCRE_CASELESS;
1755 continue;
1756 }
1757 else if ((c = *ptr) == 'm')
1758 {
1759 options |= PCRE_MULTILINE;
1760 continue;
1761 }
1762 else if (c == 's')
1763 {
1764 options |= PCRE_DOTALL;
1765 continue;
1766 }
1767 else if (c == 'x')
1768 {
1769 options |= PCRE_EXTENDED;
1770 length -= spaces; /* Already counted spaces */
1771 continue;
1772 }
1773 else if (c == ')') break;
1774
1775 *errorptr = ERR12;
1776 goto PCRE_ERROR_RETURN;
1777 }
1778 continue; /* End of this bracket handling */
1779 }
1780
1781 /* Extracting brackets must be counted so we can process escapes in a
1782 Perlish way. */
1783
1784 else bracount++;
1785
1786 /* Non-special forms of bracket. Save length for computing whole length
1787 at end if there's a repeat that requires duplication of the group. */
1788
1789 if (brastackptr >= sizeof(brastack)/sizeof(int))
1790 {
1791 *errorptr = ERR19;
1792 goto PCRE_ERROR_RETURN;
1793 }
1794
1795 brastack[brastackptr++] = length;
1796 length += 3;
1797 continue;
1798
1799 /* Handle ket. Look for subsequent max/min; for certain sets of values we
1800 have to replicate this bracket up to that many times. If brastackptr is
1801 0 this is an unmatched bracket which will generate an error, but take care
1802 not to try to access brastack[-1]. */
1803
1804 case ')':
1805 length += 3;
1806 {
1807 int minval = 1;
1808 int maxval = 1;
1809 int duplength = (brastackptr > 0)? length - brastack[--brastackptr] : 0;
1810
1811 /* Leave ptr at the final char; for read_repeat_counts this happens
1812 automatically; for the others we need an increment. */
1813
1814 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
1815 {
1816 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);
1817 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1818 }
1819 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
1820 else if (c == '+') { maxval = -1; ptr++; }
1821 else if (c == '?') { minval = 0; ptr++; }
1822
1823 /* If there is a minimum > 1 we have to replicate up to minval-1 times;
1824 if there is a limited maximum we have to replicate up to maxval-1 times
1825 and allow for a BRAZERO item before each optional copy, as we also have
1826 to do before the first copy if the minimum is zero. */
1827
1828 if (minval == 0) length++;
1829 else if (minval > 1) length += (minval - 1) * duplength;
1830 if (maxval > minval) length += (maxval - minval) * (duplength + 1);
1831 }
1832 continue;
1833
1834 /* Non-special character. For a run of such characters the length required
1835 is the number of characters + 2, except that the maximum run length is 255.
1836 We won't get a skipped space or a non-data escape or the start of a #
1837 comment as the first character, so the length can't be zero. */
1838
1839 NORMAL_CHAR:
1840 default:
1841 length += 2;
1842 runlength = 0;
1843 do
1844 {
1845 if ((pcre_ctypes[c] & ctype_space) != 0)
1846 {
1847 if ((options & PCRE_EXTENDED) != 0) continue;
1848 spaces++;
1849 }
1850
1851 if (c == '#' && (options & PCRE_EXTENDED) != 0)
1852 {
1853 while ((c = *(++ptr)) != 0 && c != '\n');
1854 continue;
1855 }
1856
1857 /* Backslash may introduce a data char or a metacharacter; stop the
1858 string before the latter. */
1859
1860 if (c == '\\')
1861 {
1862 const uschar *saveptr = ptr;
1863 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
1864 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1865 if (c < 0) { ptr = saveptr; break; }
1866 }
1867
1868 /* Ordinary character or single-char escape */
1869
1870 runlength++;
1871 }
1872
1873 /* This "while" is the end of the "do" above. */
1874
1875 while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1876
1877 ptr--;
1878 length += runlength;
1879 continue;
1880 }
1881 }
1882
1883 length += 4; /* For final KET and END */
1884
1885 if (length > 65539)
1886 {
1887 *errorptr = ERR20;
1888 return NULL;
1889 }
1890
1891 /* Compute the size of data block needed and get it, either from malloc or
1892 externally provided function. We specify "code[0]" in the offsetof() expression
1893 rather than just "code", because it has been reported that one broken compiler
1894 fails on "code" because it is also an independent variable. It should make no
1895 difference to the value of the offsetof(). */
1896
1897 size = length + offsetof(real_pcre, code[0]);
1898 re = (real_pcre *)(pcre_malloc)(size);
1899
1900 if (re == NULL)
1901 {
1902 *errorptr = ERR21;
1903 return NULL;
1904 }
1905
1906 /* Put in the magic number and the options. */
1907
1908 re->magic_number = MAGIC_NUMBER;
1909 re->options = options;
1910
1911 /* Set up a starting, non-extracting bracket, then compile the expression. On
1912 error, *errorptr will be set non-NULL, so we don't need to look at the result
1913 of the function here. */
1914
1915 ptr = (const uschar *)pattern;
1916 code = re->code;
1917 *code = OP_BRA;
1918 bracount = 0;
1919 (void)compile_regex(options, &bracount, &code, &ptr, errorptr);
1920 re->top_bracket = bracount;
1921 re->top_backref = top_backref;
1922
1923 /* If not reached end of pattern on success, there's an excess bracket. */
1924
1925 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
1926
1927 /* Fill in the terminating state and check for disastrous overflow, but
1928 if debugging, leave the test till after things are printed out. */
1929
1930 *code++ = OP_END;
1931
1932 #ifndef DEBUG
1933 if (code - re->code > length) *errorptr = ERR23;
1934 #endif
1935
1936 /* Failed to compile */
1937
1938 if (*errorptr != NULL)
1939 {
1940 (pcre_free)(re);
1941 PCRE_ERROR_RETURN:
1942 *erroroffset = ptr - (const uschar *)pattern;
1943 return NULL;
1944 }
1945
1946 /* If the anchored option was not passed, set flag if we can determine that it
1947 is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
1948 we can determine what the first character has to be, because that speeds up
1949 unanchored matches no end. In the case of multiline matches, an alternative is
1950 to set the PCRE_STARTLINE flag if all branches start with ^. */
1951
1952 if ((options & PCRE_ANCHORED) == 0)
1953 {
1954 if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
1955 re->options |= PCRE_ANCHORED;
1956 else
1957 {
1958 int ch = find_firstchar(re->code);
1959 if (ch >= 0)
1960 {
1961 re->first_char = ch;
1962 re->options |= PCRE_FIRSTSET;
1963 }
1964 else if (is_startline(re->code))
1965 re->options |= PCRE_STARTLINE;
1966 }
1967 }
1968
1969 /* Print out the compiled data for debugging */
1970
1971 #ifdef DEBUG
1972
1973 printf("Length = %d top_bracket = %d top_backref=%d\n",
1974 length, re->top_bracket, re->top_backref);
1975
1976 if (re->options != 0)
1977 {
1978 printf("%s%s%s%s%s%s%s\n",
1979 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
1980 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
1981 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
1982 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
1983 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
1984 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
1985 ((re->options & PCRE_EXTRA) != 0)? "extra " : "");
1986 }
1987
1988 if ((re->options & PCRE_FIRSTSET) != 0)
1989 {
1990 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
1991 else printf("First char = \\x%02x\n", re->first_char);
1992 }
1993
1994 code_end = code;
1995 code_base = code = re->code;
1996
1997 while (code < code_end)
1998 {
1999 int charlength;
2000
2001 printf("%3d ", code - code_base);
2002
2003 if (*code >= OP_BRA)
2004 {
2005 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2006 code += 2;
2007 }
2008
2009 else switch(*code)
2010 {
2011 case OP_CHARS:
2012 charlength = *(++code);
2013 printf("%3d ", charlength);
2014 while (charlength-- > 0)
2015 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2016 break;
2017
2018 case OP_KETRMAX:
2019 case OP_KETRMIN:
2020 case OP_ALT:
2021 case OP_KET:
2022 case OP_ASSERT:
2023 case OP_ASSERT_NOT:
2024 case OP_ONCE:
2025 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2026 code += 2;
2027 break;
2028
2029 case OP_STAR:
2030 case OP_MINSTAR:
2031 case OP_PLUS:
2032 case OP_MINPLUS:
2033 case OP_QUERY:
2034 case OP_MINQUERY:
2035 case OP_TYPESTAR:
2036 case OP_TYPEMINSTAR:
2037 case OP_TYPEPLUS:
2038 case OP_TYPEMINPLUS:
2039 case OP_TYPEQUERY:
2040 case OP_TYPEMINQUERY:
2041 if (*code >= OP_TYPESTAR)
2042 printf(" %s", OP_names[code[1]]);
2043 else if (isprint(c = code[1])) printf(" %c", c);
2044 else printf(" \\x%02x", c);
2045 printf("%s", OP_names[*code++]);
2046 break;
2047
2048 case OP_EXACT:
2049 case OP_UPTO:
2050 case OP_MINUPTO:
2051 if (isprint(c = code[3])) printf(" %c{", c);
2052 else printf(" \\x%02x{", c);
2053 if (*code != OP_EXACT) printf("0,");
2054 printf("%d}", (code[1] << 8) + code[2]);
2055 if (*code == OP_MINUPTO) printf("?");
2056 code += 3;
2057 break;
2058
2059 case OP_TYPEEXACT:
2060 case OP_TYPEUPTO:
2061 case OP_TYPEMINUPTO:
2062 printf(" %s{", OP_names[code[3]]);
2063 if (*code != OP_TYPEEXACT) printf(",");
2064 printf("%d}", (code[1] << 8) + code[2]);
2065 if (*code == OP_TYPEMINUPTO) printf("?");
2066 code += 3;
2067 break;
2068
2069 case OP_NOT:
2070 if (isprint(c = *(++code))) printf(" [^%c]", c);
2071 else printf(" [^\\x%02x]", c);
2072 break;
2073
2074 case OP_NOTSTAR:
2075 case OP_NOTMINSTAR:
2076 case OP_NOTPLUS:
2077 case OP_NOTMINPLUS:
2078 case OP_NOTQUERY:
2079 case OP_NOTMINQUERY:
2080 if (isprint(c = code[1])) printf(" [^%c]", c);
2081 else printf(" [^\\x%02x]", c);
2082 printf("%s", OP_names[*code++]);
2083 break;
2084
2085 case OP_NOTEXACT:
2086 case OP_NOTUPTO:
2087 case OP_NOTMINUPTO:
2088 if (isprint(c = code[3])) printf(" [^%c]{", c);
2089 else printf(" [^\\x%02x]{", c);
2090 if (*code != OP_NOTEXACT) printf(",");
2091 printf("%d}", (code[1] << 8) + code[2]);
2092 if (*code == OP_NOTMINUPTO) printf("?");
2093 code += 3;
2094 break;
2095
2096 case OP_REF:
2097 printf(" \\%d", *(++code));
2098 code ++;
2099 goto CLASS_REF_REPEAT;
2100
2101 case OP_CLASS:
2102 case OP_NEGCLASS:
2103 {
2104 int i, min, max;
2105
2106 if (*code++ == OP_CLASS) printf(" [");
2107 else printf(" ^[");
2108
2109 for (i = 0; i < 256; i++)
2110 {
2111 if ((code[i/8] & (1 << (i&7))) != 0)
2112 {
2113 int j;
2114 for (j = i+1; j < 256; j++)
2115 if ((code[j/8] & (1 << (j&7))) == 0) break;
2116 if (i == '-' || i == ']') printf("\\");
2117 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2118 if (--j > i)
2119 {
2120 printf("-");
2121 if (j == '-' || j == ']') printf("\\");
2122 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2123 }
2124 i = j;
2125 }
2126 }
2127 printf("]");
2128 code += 32;
2129
2130 CLASS_REF_REPEAT:
2131
2132 switch(*code)
2133 {
2134 case OP_CRSTAR:
2135 case OP_CRMINSTAR:
2136 case OP_CRPLUS:
2137 case OP_CRMINPLUS:
2138 case OP_CRQUERY:
2139 case OP_CRMINQUERY:
2140 printf("%s", OP_names[*code]);
2141 break;
2142
2143 case OP_CRRANGE:
2144 case OP_CRMINRANGE:
2145 min = (code[1] << 8) + code[2];
2146 max = (code[3] << 8) + code[4];
2147 if (max == 0) printf("{%d,}", min);
2148 else printf("{%d,%d}", min, max);
2149 if (*code == OP_CRMINRANGE) printf("?");
2150 code += 4;
2151 break;
2152
2153 default:
2154 code--;
2155 }
2156 }
2157 break;
2158
2159 /* Anything else is just a one-node item */
2160
2161 default:
2162 printf(" %s", OP_names[*code]);
2163 break;
2164 }
2165
2166 code++;
2167 printf("\n");
2168 }
2169 printf("------------------------------------------------------------------\n");
2170
2171 /* This check is done here in the debugging case so that the code that
2172 was compiled can be seen. */
2173
2174 if (code - re->code > length)
2175 {
2176 *errorptr = ERR23;
2177 (pcre_free)(re);
2178 *erroroffset = ptr - (uschar *)pattern;
2179 return NULL;
2180 }
2181 #endif
2182
2183 return (pcre *)re;
2184 }
2185
2186
2187
2188 /*************************************************
2189 * Match a character type *
2190 *************************************************/
2191
2192 /* Not used in all the places it might be as it's sometimes faster
2193 to put the code inline.
2194
2195 Arguments:
2196 type the character type
2197 c the character
2198 dotall the dotall flag
2199
2200 Returns: TRUE if character is of the type
2201 */
2202
2203 static BOOL
2204 match_type(int type, int c, BOOL dotall)
2205 {
2206
2207 #ifdef DEBUG
2208 if (isprint(c)) printf("matching subject %c against ", c);
2209 else printf("matching subject \\x%02x against ", c);
2210 printf("%s\n", OP_names[type]);
2211 #endif
2212
2213 switch(type)
2214 {
2215 case OP_ANY: return dotall || c != '\n';
2216 case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2217 case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2218 case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2219 case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
2220 case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
2221 case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
2222 }
2223 return FALSE;
2224 }
2225
2226
2227
2228 /*************************************************
2229 * Match a back-reference *
2230 *************************************************/
2231
2232 /* If a back reference hasn't been set, the match fails.
2233
2234 Arguments:
2235 number reference number
2236 eptr points into the subject
2237 length length to be matched
2238 md points to match data block
2239
2240 Returns: TRUE if matched
2241 */
2242
2243 static BOOL
2244 match_ref(int number, register const uschar *eptr, int length, match_data *md)
2245 {
2246 const uschar *p = md->start_subject + md->offset_vector[number];
2247
2248 #ifdef DEBUG
2249 if (eptr >= md->end_subject)
2250 printf("matching subject <null>");
2251 else
2252 {
2253 printf("matching subject ");
2254 pchars(eptr, length, TRUE, md);
2255 }
2256 printf(" against backref ");
2257 pchars(p, length, FALSE, md);
2258 printf("\n");
2259 #endif
2260
2261 /* Always fail if not enough characters left */
2262
2263 if (length > md->end_subject - p) return FALSE;
2264
2265 /* Separate the caselesss case for speed */
2266
2267 if (md->caseless)
2268 { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
2269 else
2270 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2271
2272 return TRUE;
2273 }
2274
2275
2276
2277 /*************************************************
2278 * Match from current position *
2279 *************************************************/
2280
2281 /* On entry ecode points to the first opcode, and eptr to the first character.
2282
2283 Arguments:
2284 eptr pointer in subject
2285 ecode position in code
2286 offset_top current top pointer
2287 md pointer to "static" info for the match
2288
2289 Returns: TRUE if matched
2290 */
2291
2292 static BOOL
2293 match(register const uschar *eptr, register const uschar *ecode, int offset_top,
2294 match_data *md)
2295 {
2296 for (;;)
2297 {
2298 int min, max, ctype;
2299 register int i;
2300 register int c;
2301 BOOL minimize = FALSE;
2302
2303 /* Opening bracket. Check the alternative branches in turn, failing if none
2304 match. We have to set the start offset if required and there is space
2305 in the offset vector so that it is available for subsequent back references
2306 if the bracket matches. However, if the bracket fails, we must put back the
2307 previous value of both offsets in case they were set by a previous copy of
2308 the same bracket. Don't worry about setting the flag for the error case here;
2309 that is handled in the code for KET. */
2310
2311 if ((int)*ecode >= OP_BRA)
2312 {
2313 int number = (*ecode - OP_BRA) << 1;
2314 int save_offset1 = 0, save_offset2 = 0;
2315
2316 DPRINTF(("start bracket %d\n", number/2));
2317
2318 if (number > 0 && number < md->offset_end)
2319 {
2320 save_offset1 = md->offset_vector[number];
2321 save_offset2 = md->offset_vector[number+1];
2322 md->offset_vector[number] = eptr - md->start_subject;
2323
2324 DPRINTF(("saving %d %d\n", save_offset1, save_offset2));
2325 }
2326
2327 /* Recurse for all the alternatives. */
2328
2329 do
2330 {
2331 if (match(eptr, ecode+3, offset_top, md)) return TRUE;
2332 ecode += (ecode[1] << 8) + ecode[2];
2333 }
2334 while (*ecode == OP_ALT);
2335
2336 DPRINTF(("bracket %d failed\n", number/2));
2337
2338 if (number > 0 && number < md->offset_end)
2339 {
2340 md->offset_vector[number] = save_offset1;
2341 md->offset_vector[number+1] = save_offset2;
2342 }
2343
2344 return FALSE;
2345 }
2346
2347 /* Other types of node can be handled by a switch */
2348
2349 switch(*ecode)
2350 {
2351 case OP_END:
2352 md->end_match_ptr = eptr; /* Record where we ended */
2353 md->end_offset_top = offset_top; /* and how many extracts were taken */
2354 return TRUE;
2355
2356 /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
2357 whole thing doesn't match, so we have to get out via a longjmp(). */
2358
2359 case OP_CUT:
2360 if (match(eptr, ecode+1, offset_top, md)) return TRUE;
2361 longjmp(md->fail_env, 1);
2362
2363 /* Assertion brackets. Check the alternative branches in turn - the
2364 matching won't pass the KET for an assertion. If any one branch matches,
2365 the assertion is true. */
2366
2367 case OP_ASSERT:
2368 do
2369 {
2370 if (match(eptr, ecode+3, offset_top, md)) break;
2371 ecode += (ecode[1] << 8) + ecode[2];
2372 }
2373 while (*ecode == OP_ALT);
2374 if (*ecode == OP_KET) return FALSE;
2375
2376 /* Continue from after the assertion, updating the offsets high water
2377 mark, since extracts may have been taken during the assertion. */
2378
2379 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2380 ecode += 3;
2381 offset_top = md->end_offset_top;
2382 continue;
2383
2384 /* Negative assertion: all branches must fail to match */
2385
2386 case OP_ASSERT_NOT:
2387 do
2388 {
2389 if (match(eptr, ecode+3, offset_top, md)) return FALSE;
2390 ecode += (ecode[1] << 8) + ecode[2];
2391 }
2392 while (*ecode == OP_ALT);
2393 ecode += 3;
2394 continue;
2395
2396 /* "Once" brackets are like assertion brackets except that after a match,
2397 the point in the subject string is not moved back. Thus there can never be
2398 a move back into the brackets. Check the alternative branches in turn - the
2399 matching won't pass the KET for this kind of subpattern. If any one branch
2400 matches, we carry on, leaving the subject pointer. */
2401
2402 case OP_ONCE:
2403 do
2404 {
2405 if (match(eptr, ecode+3, offset_top, md)) break;
2406 ecode += (ecode[1] << 8) + ecode[2];
2407 }
2408 while (*ecode == OP_ALT);
2409 if (*ecode == OP_KET) return FALSE;
2410
2411 /* Continue as from after the assertion, updating the offsets high water
2412 mark, since extracts may have been taken. */
2413
2414 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2415 ecode += 3;
2416 offset_top = md->end_offset_top;
2417 eptr = md->end_match_ptr;
2418 continue;
2419
2420 /* An alternation is the end of a branch; scan along to find the end of the
2421 bracketed group and go to there. */
2422
2423 case OP_ALT:
2424 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2425 break;
2426
2427 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
2428 that it may occur zero times. It may repeat infinitely, or not at all -
2429 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
2430 repeat limits are compiled as a number of copies, with the optional ones
2431 preceded by BRAZERO or BRAMINZERO. */
2432
2433 case OP_BRAZERO:
2434 {
2435 const uschar *next = ecode+1;
2436 if (match(eptr, next, offset_top, md)) return TRUE;
2437 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
2438 ecode = next + 3;
2439 }
2440 break;
2441
2442 case OP_BRAMINZERO:
2443 {
2444 const uschar *next = ecode+1;
2445 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
2446 if (match(eptr, next+3, offset_top, md)) return TRUE;
2447 ecode++;
2448 }
2449 break;;
2450
2451 /* End of a group, repeated or non-repeating. If we are at the end of
2452 an assertion "group", stop matching and return TRUE, but record the
2453 current high water mark for use by positive assertions. */
2454
2455 case OP_KET:
2456 case OP_KETRMIN:
2457 case OP_KETRMAX:
2458 {
2459 int number;
2460 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
2461
2462 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
2463 {
2464 md->end_match_ptr = eptr; /* For ONCE */
2465 md->end_offset_top = offset_top;
2466 return TRUE;
2467 }
2468
2469 /* In all other cases we have to check the group number back at the
2470 start and if necessary complete handling an extraction by setting the
2471 final offset and bumping the high water mark. */
2472
2473 number = (*prev - OP_BRA) << 1;
2474
2475 DPRINTF(("end bracket %d\n", number/2));
2476
2477 if (number > 0)
2478 {
2479 if (number >= md->offset_end) md->offset_overflow = TRUE; else
2480 {
2481 md->offset_vector[number+1] = eptr - md->start_subject;
2482 if (offset_top <= number) offset_top = number + 2;
2483 }
2484 }
2485
2486 /* For a non-repeating ket, just advance to the next node and continue at
2487 this level. */
2488
2489 if (*ecode == OP_KET)
2490 {
2491 ecode += 3;
2492 break;
2493 }
2494
2495 /* The repeating kets try the rest of the pattern or restart from the
2496 preceding bracket, in the appropriate order. */
2497
2498 if (*ecode == OP_KETRMIN)
2499 {
2500 if (match(eptr, ecode+3, offset_top, md) ||
2501 match(eptr, prev, offset_top, md)) return TRUE;
2502 }
2503 else /* OP_KETRMAX */
2504 {
2505 if (match(eptr, prev, offset_top, md) ||
2506 match(eptr, ecode+3, offset_top, md)) return TRUE;
2507 }
2508 }
2509 return FALSE;
2510
2511 /* Start of subject unless notbol, or after internal newline if multiline */
2512
2513 case OP_CIRC:
2514 if (md->notbol && eptr == md->start_subject) return FALSE;
2515 if (md->multiline)
2516 {
2517 if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
2518 ecode++;
2519 break;
2520 }
2521 /* ... else fall through */
2522
2523 /* Start of subject assertion */
2524
2525 case OP_SOD:
2526 if (eptr != md->start_subject) return FALSE;
2527 ecode++;
2528 break;
2529
2530 /* Assert before internal newline if multiline, or before
2531 a terminating newline unless endonly is set, else end of subject unless
2532 noteol is set. */
2533
2534 case OP_DOLL:
2535 if (md->noteol && eptr >= md->end_subject) return FALSE;
2536 if (md->multiline)
2537 {
2538 if (eptr < md->end_subject && *eptr != '\n') return FALSE;
2539 ecode++;
2540 break;
2541 }
2542 else if (!md->endonly)
2543 {
2544 if (eptr < md->end_subject - 1 ||
2545 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
2546 ecode++;
2547 break;
2548 }
2549 /* ... else fall through */
2550
2551 /* End of subject assertion */
2552
2553 case OP_EOD:
2554 if (eptr < md->end_subject) return FALSE;
2555 ecode++;
2556 break;
2557
2558 /* Word boundary assertions */
2559
2560 case OP_NOT_WORD_BOUNDARY:
2561 case OP_WORD_BOUNDARY:
2562 {
2563 BOOL prev_is_word = (eptr != md->start_subject) &&
2564 ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
2565 BOOL cur_is_word = (eptr < md->end_subject) &&
2566 ((pcre_ctypes[*eptr] & ctype_word) != 0);
2567 if ((*ecode++ == OP_WORD_BOUNDARY)?
2568 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2569 return FALSE;
2570 }
2571 break;
2572
2573 /* Match a single character type; inline for speed */
2574
2575 case OP_ANY:
2576 if (!md->dotall && eptr < md->end_subject && *eptr == '\n') return FALSE;
2577 if (eptr++ >= md->end_subject) return FALSE;
2578 ecode++;
2579 break;
2580
2581 case OP_NOT_DIGIT:
2582 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
2583 return FALSE;
2584 ecode++;
2585 break;
2586
2587 case OP_DIGIT:
2588 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
2589 return FALSE;
2590 ecode++;
2591 break;
2592
2593 case OP_NOT_WHITESPACE:
2594 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
2595 return FALSE;
2596 ecode++;
2597 break;
2598
2599 case OP_WHITESPACE:
2600 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
2601 return FALSE;
2602 ecode++;
2603 break;
2604
2605 case OP_NOT_WORDCHAR:
2606 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
2607 return FALSE;
2608 ecode++;
2609 break;
2610
2611 case OP_WORDCHAR:
2612 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
2613 return FALSE;
2614 ecode++;
2615 break;
2616
2617 /* Match a back reference, possibly repeatedly. Look past the end of the
2618 item to see if there is repeat information following. The code is similar
2619 to that for character classes, but repeated for efficiency. Then obey
2620 similar code to character type repeats - written out again for speed.
2621 However, if the referenced string is the empty string, always treat
2622 it as matched, any number of times (otherwise there could be infinite
2623 loops). */
2624
2625 case OP_REF:
2626 {
2627 int length;
2628 int number = ecode[1] << 1; /* Doubled reference number */
2629 ecode += 2; /* Advance past the item */
2630
2631 if (number >= offset_top || md->offset_vector[number] < 0)
2632 {
2633 md->errorcode = PCRE_ERROR_BADREF;
2634 return FALSE;
2635 }
2636
2637 length = md->offset_vector[number+1] - md->offset_vector[number];
2638
2639 switch (*ecode)
2640 {
2641 case OP_CRSTAR:
2642 case OP_CRMINSTAR:
2643 case OP_CRPLUS:
2644 case OP_CRMINPLUS:
2645 case OP_CRQUERY:
2646 case OP_CRMINQUERY:
2647 c = *ecode++ - OP_CRSTAR;
2648 minimize = (c & 1) != 0;
2649 min = rep_min[c]; /* Pick up values from tables; */
2650 max = rep_max[c]; /* zero for max => infinity */
2651 if (max == 0) max = INT_MAX;
2652 break;
2653
2654 case OP_CRRANGE:
2655 case OP_CRMINRANGE:
2656 minimize = (*ecode == OP_CRMINRANGE);
2657 min = (ecode[1] << 8) + ecode[2];
2658 max = (ecode[3] << 8) + ecode[4];
2659 if (max == 0) max = INT_MAX;
2660 ecode += 5;
2661 break;
2662
2663 default: /* No repeat follows */
2664 if (!match_ref(number, eptr, length, md)) return FALSE;
2665 eptr += length;
2666 continue; /* With the main loop */
2667 }
2668
2669 /* If the length of the reference is zero, just continue with the
2670 main loop. */
2671
2672 if (length == 0) continue;
2673
2674 /* First, ensure the minimum number of matches are present. We get back
2675 the length of the reference string explicitly rather than passing the
2676 address of eptr, so that eptr can be a register variable. */
2677
2678 for (i = 1; i <= min; i++)
2679 {
2680 if (!match_ref(number, eptr, length, md)) return FALSE;
2681 eptr += length;
2682 }
2683
2684 /* If min = max, continue at the same level without recursion.
2685 They are not both allowed to be zero. */
2686
2687 if (min == max) continue;
2688
2689 /* If minimizing, keep trying and advancing the pointer */
2690
2691 if (minimize)
2692 {
2693 for (i = min;; i++)
2694 {
2695 if (match(eptr, ecode, offset_top, md)) return TRUE;
2696 if (i >= max || !match_ref(number, eptr, length, md))
2697 return FALSE;
2698 eptr += length;
2699 }
2700 /* Control never gets here */
2701 }
2702
2703 /* If maximizing, find the longest string and work backwards */
2704
2705 else
2706 {
2707 const uschar *pp = eptr;
2708 for (i = min; i < max; i++)
2709 {
2710 if (!match_ref(number, eptr, length, md)) break;
2711 eptr += length;
2712 }
2713 while (eptr >= pp)
2714 {
2715 if (match(eptr, ecode, offset_top, md)) return TRUE;
2716 eptr -= length;
2717 }
2718 return FALSE;
2719 }
2720 }
2721 /* Control never gets here */
2722
2723 /* Match a character class, possibly repeatedly. Look past the end of the
2724 item to see if there is repeat information following. Then obey similar
2725 code to character type repeats - written out again for speed. If caseless
2726 matching was set at runtime but not at compile time, we have to check both
2727 versions of a character, and we have to behave differently for positive and
2728 negative classes. This is the only time where OP_CLASS and OP_NEGCLASS are
2729 treated differently. */
2730
2731 case OP_CLASS:
2732 case OP_NEGCLASS:
2733 {
2734 BOOL nasty_case = *ecode == OP_NEGCLASS && md->runtime_caseless;
2735 const uschar *data = ecode + 1; /* Save for matching */
2736 ecode += 33; /* Advance past the item */
2737
2738 switch (*ecode)
2739 {
2740 case OP_CRSTAR:
2741 case OP_CRMINSTAR:
2742 case OP_CRPLUS:
2743 case OP_CRMINPLUS:
2744 case OP_CRQUERY:
2745 case OP_CRMINQUERY:
2746 c = *ecode++ - OP_CRSTAR;
2747 minimize = (c & 1) != 0;
2748 min = rep_min[c]; /* Pick up values from tables; */
2749 max = rep_max[c]; /* zero for max => infinity */
2750 if (max == 0) max = INT_MAX;
2751 break;
2752
2753 case OP_CRRANGE:
2754 case OP_CRMINRANGE:
2755 minimize = (*ecode == OP_CRMINRANGE);
2756 min = (ecode[1] << 8) + ecode[2];
2757 max = (ecode[3] << 8) + ecode[4];
2758 if (max == 0) max = INT_MAX;
2759 ecode += 5;
2760 break;
2761
2762 default: /* No repeat follows */
2763 min = max = 1;
2764 break;
2765 }
2766
2767 /* First, ensure the minimum number of matches are present. */
2768
2769 for (i = 1; i <= min; i++)
2770 {
2771 if (eptr >= md->end_subject) return FALSE;
2772 c = *eptr++;
2773
2774 /* Either not runtime caseless, or it was a positive class. For
2775 runtime caseless, continue if either case is in the map. */
2776
2777 if (!nasty_case)
2778 {
2779 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2780 if (md->runtime_caseless)
2781 {
2782 c = pcre_fcc[c];
2783 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2784 }
2785 }
2786
2787 /* Runtime caseless and it was a negative class. Continue only if
2788 both cases are in the map. */
2789
2790 else
2791 {
2792 if ((data[c/8] & (1 << (c&7))) == 0) return FALSE;
2793 c = pcre_fcc[c];
2794 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2795 }
2796
2797 return FALSE;
2798 }
2799
2800 /* If max == min we can continue with the main loop without the
2801 need to recurse. */
2802
2803 if (min == max) continue;
2804
2805 /* If minimizing, keep testing the rest of the expression and advancing
2806 the pointer while it matches the class. */
2807
2808 if (minimize)
2809 {
2810 for (i = min;; i++)
2811 {
2812 if (match(eptr, ecode, offset_top, md)) return TRUE;
2813 if (i >= max || eptr >= md->end_subject) return FALSE;
2814 c = *eptr++;
2815
2816 /* Either not runtime caseless, or it was a positive class. For
2817 runtime caseless, continue if either case is in the map. */
2818
2819 if (!nasty_case)
2820 {
2821 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2822 if (md->runtime_caseless)
2823 {
2824 c = pcre_fcc[c];
2825 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2826 }
2827 }
2828
2829 /* Runtime caseless and it was a negative class. Continue only if
2830 both cases are in the map. */
2831
2832 else
2833 {
2834 if ((data[c/8] & (1 << (c&7))) == 0) return FALSE;
2835 c = pcre_fcc[c];
2836 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2837 }
2838
2839 return FALSE;
2840 }
2841 /* Control never gets here */
2842 }
2843
2844 /* If maximizing, find the longest possible run, then work backwards. */
2845
2846 else
2847 {
2848 const uschar *pp = eptr;
2849 for (i = min; i < max; eptr++, i++)
2850 {
2851 if (eptr >= md->end_subject) break;
2852 c = *eptr;
2853
2854 /* Either not runtime caseless, or it was a positive class. For
2855 runtime caseless, continue if either case is in the map. */
2856
2857 if (!nasty_case)
2858 {
2859 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2860 if (md->runtime_caseless)
2861 {
2862 c = pcre_fcc[c];
2863 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2864 }
2865 }
2866
2867 /* Runtime caseless and it was a negative class. Continue only if
2868 both cases are in the map. */
2869
2870 else
2871 {
2872 if ((data[c/8] & (1 << (c&7))) == 0) break;
2873 c = pcre_fcc[c];
2874 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2875 }
2876
2877 break;
2878 }
2879
2880 while (eptr >= pp)
2881 if (match(eptr--, ecode, offset_top, md)) return TRUE;
2882 return FALSE;
2883 }
2884 }
2885 /* Control never gets here */
2886
2887 /* Match a run of characters */
2888
2889 case OP_CHARS:
2890 {
2891 register int length = ecode[1];
2892 ecode += 2;
2893
2894 #ifdef DEBUG /* Sigh. Some compilers never learn. */
2895 if (eptr >= md->end_subject)
2896 printf("matching subject <null> against pattern ");
2897 else
2898 {
2899 printf("matching subject ");
2900 pchars(eptr, length, TRUE, md);
2901 printf(" against pattern ");
2902 }
2903 pchars(ecode, length, FALSE, md);
2904 printf("\n");
2905 #endif
2906
2907 if (length > md->end_subject - eptr) return FALSE;
2908 if (md->caseless)
2909 {
2910 while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) return FALSE;
2911 }
2912 else
2913 {
2914 while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
2915 }
2916 }
2917 break;
2918
2919 /* Match a single character repeatedly; different opcodes share code. */
2920
2921 case OP_EXACT:
2922 min = max = (ecode[1] << 8) + ecode[2];
2923 ecode += 3;
2924 goto REPEATCHAR;
2925
2926 case OP_UPTO:
2927 case OP_MINUPTO:
2928 min = 0;
2929 max = (ecode[1] << 8) + ecode[2];
2930 minimize = *ecode == OP_MINUPTO;
2931 ecode += 3;
2932 goto REPEATCHAR;
2933
2934 case OP_STAR:
2935 case OP_MINSTAR:
2936 case OP_PLUS:
2937 case OP_MINPLUS:
2938 case OP_QUERY:
2939 case OP_MINQUERY:
2940 c = *ecode++ - OP_STAR;
2941 minimize = (c & 1) != 0;
2942 min = rep_min[c]; /* Pick up values from tables; */
2943 max = rep_max[c]; /* zero for max => infinity */
2944 if (max == 0) max = INT_MAX;
2945
2946 /* Common code for all repeated single-character matches. We can give
2947 up quickly if there are fewer than the minimum number of characters left in
2948 the subject. */
2949
2950 REPEATCHAR:
2951 if (min > md->end_subject - eptr) return FALSE;
2952 c = *ecode++;
2953
2954 /* The code is duplicated for the caseless and caseful cases, for speed,
2955 since matching characters is likely to be quite common. First, ensure the
2956 minimum number of matches are present. If min = max, continue at the same
2957 level without recursing. Otherwise, if minimizing, keep trying the rest of
2958 the expression and advancing one matching character if failing, up to the
2959 maximum. Alternatively, if maximizing, find the maximum number of
2960 characters and work backwards. */
2961
2962 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
2963 max, eptr));
2964
2965 if (md->caseless)
2966 {
2967 c = pcre_lcc[c];
2968 for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) return FALSE;
2969 if (min == max) continue;
2970 if (minimize)
2971 {
2972 for (i = min;; i++)
2973 {
2974 if (match(eptr, ecode, offset_top, md)) return TRUE;
2975 if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
2976 return FALSE;
2977 }
2978 /* Control never gets here */
2979 }
2980 else
2981 {
2982 const uschar *pp = eptr;
2983 for (i = min; i < max; i++)
2984 {
2985 if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
2986 eptr++;
2987 }
2988 while (eptr >= pp)
2989 if (match(eptr--, ecode, offset_top, md)) return TRUE;
2990 return FALSE;
2991 }
2992 /* Control never gets here */
2993 }
2994
2995 /* Caseful comparisons */
2996
2997 else
2998 {
2999 for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
3000 if (min == max) continue;
3001 if (minimize)
3002 {
3003 for (i = min;; i++)
3004 {
3005 if (match(eptr, ecode, offset_top, md)) return TRUE;
3006 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
3007 }
3008 /* Control never gets here */
3009 }
3010 else
3011 {
3012 const uschar *pp = eptr;
3013 for (i = min; i < max; i++)
3014 {
3015 if (eptr >= md->end_subject || c != *eptr) break;
3016 eptr++;
3017 }
3018 while (eptr >= pp)
3019 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3020 return FALSE;
3021 }
3022 }
3023 /* Control never gets here */
3024
3025 /* Match a negated single character */
3026
3027 case OP_NOT:
3028 if (eptr >= md->end_subject) return FALSE;
3029 ecode++;
3030 if (md->caseless)
3031 {
3032 if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) return FALSE;
3033 }
3034 else
3035 {
3036 if (*ecode++ == *eptr++) return FALSE;
3037 }
3038 break;
3039
3040 /* Match a negated single character repeatedly. This is almost a repeat of
3041 the code for a repeated single character, but I haven't found a nice way of
3042 commoning these up that doesn't require a test of the positive/negative
3043 option for each character match. Maybe that wouldn't add very much to the
3044 time taken, but character matching *is* what this is all about... */
3045
3046 case OP_NOTEXACT:
3047 min = max = (ecode[1] << 8) + ecode[2];
3048 ecode += 3;
3049 goto REPEATNOTCHAR;
3050
3051 case OP_NOTUPTO:
3052 case OP_NOTMINUPTO:
3053 min = 0;
3054 max = (ecode[1] << 8) + ecode[2];
3055 minimize = *ecode == OP_NOTMINUPTO;
3056 ecode += 3;
3057 goto REPEATNOTCHAR;
3058
3059 case OP_NOTSTAR:
3060 case OP_NOTMINSTAR:
3061 case OP_NOTPLUS:
3062 case OP_NOTMINPLUS:
3063 case OP_NOTQUERY:
3064 case OP_NOTMINQUERY:
3065 c = *ecode++ - OP_NOTSTAR;
3066 minimize = (c & 1) != 0;
3067 min = rep_min[c]; /* Pick up values from tables; */
3068 max = rep_max[c]; /* zero for max => infinity */
3069 if (max == 0) max = INT_MAX;
3070
3071 /* Common code for all repeated single-character matches. We can give
3072 up quickly if there are fewer than the minimum number of characters left in
3073 the subject. */
3074
3075 REPEATNOTCHAR:
3076 if (min > md->end_subject - eptr) return FALSE;
3077 c = *ecode++;
3078
3079 /* The code is duplicated for the caseless and caseful cases, for speed,
3080 since matching characters is likely to be quite common. First, ensure the
3081 minimum number of matches are present. If min = max, continue at the same
3082 level without recursing. Otherwise, if minimizing, keep trying the rest of
3083 the expression and advancing one matching character if failing, up to the
3084 maximum. Alternatively, if maximizing, find the maximum number of
3085 characters and work backwards. */
3086
3087 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
3088 max, eptr));
3089
3090 if (md->caseless)
3091 {
3092 c = pcre_lcc[c];
3093 for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) return FALSE;
3094 if (min == max) continue;
3095 if (minimize)
3096 {
3097 for (i = min;; i++)
3098 {
3099 if (match(eptr, ecode, offset_top, md)) return TRUE;
3100 if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
3101 return FALSE;
3102 }
3103 /* Control never gets here */
3104 }
3105 else
3106 {
3107 const uschar *pp = eptr;
3108 for (i = min; i < max; i++)
3109 {
3110 if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
3111 eptr++;
3112 }
3113 while (eptr >= pp)
3114 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3115 return FALSE;
3116 }
3117 /* Control never gets here */
3118 }
3119
3120 /* Caseful comparisons */
3121
3122 else
3123 {
3124 for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
3125 if (min == max) continue;
3126 if (minimize)
3127 {
3128 for (i = min;; i++)
3129 {
3130 if (match(eptr, ecode, offset_top, md)) return TRUE;
3131 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
3132 }
3133 /* Control never gets here */
3134 }
3135 else
3136 {
3137 const uschar *pp = eptr;
3138 for (i = min; i < max; i++)
3139 {
3140 if (eptr >= md->end_subject || c == *eptr) break;
3141 eptr++;
3142 }
3143 while (eptr >= pp)
3144 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3145 return FALSE;
3146 }
3147 }
3148 /* Control never gets here */
3149
3150 /* Match a single character type repeatedly; several different opcodes
3151 share code. This is very similar to the code for single characters, but we
3152 repeat it in the interests of efficiency. */
3153
3154 case OP_TYPEEXACT:
3155 min = max = (ecode[1] << 8) + ecode[2];
3156 minimize = TRUE;
3157 ecode += 3;
3158 goto REPEATTYPE;
3159
3160 case OP_TYPEUPTO:
3161 case OP_TYPEMINUPTO:
3162 min = 0;
3163 max = (ecode[1] << 8) + ecode[2];
3164 minimize = *ecode == OP_TYPEMINUPTO;
3165 ecode += 3;
3166 goto REPEATTYPE;
3167
3168 case OP_TYPESTAR:
3169 case OP_TYPEMINSTAR:
3170 case OP_TYPEPLUS:
3171 case OP_TYPEMINPLUS:
3172 case OP_TYPEQUERY:
3173 case OP_TYPEMINQUERY:
3174 c = *ecode++ - OP_TYPESTAR;
3175 minimize = (c & 1) != 0;
3176 min = rep_min[c]; /* Pick up values from tables; */
3177 max = rep_max[c]; /* zero for max => infinity */
3178 if (max == 0) max = INT_MAX;
3179
3180 /* Common code for all repeated single character type matches */
3181
3182 REPEATTYPE:
3183 ctype = *ecode++; /* Code for the character type */
3184
3185 /* First, ensure the minimum number of matches are present. Use inline
3186 code for maximizing the speed, and do the type test once at the start
3187 (i.e. keep it out of the loop). Also test that there are at least the
3188 minimum number of characters before we start. */
3189
3190 if (min > md->end_subject - eptr) return FALSE;
3191 if (min > 0) switch(ctype)
3192 {
3193 case OP_ANY:
3194 if (!md->dotall)
3195 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
3196 else eptr += min;
3197 break;
3198
3199 case OP_NOT_DIGIT:
3200 for (i = 1; i <= min; i++)
3201 if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
3202 break;
3203
3204 case OP_DIGIT:
3205 for (i = 1; i <= min; i++)
3206 if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
3207 break;
3208
3209 case OP_NOT_WHITESPACE:
3210 for (i = 1; i <= min; i++)
3211 if ((pcre_ctypes[*eptr++] & ctype_space) != 0) return FALSE;
3212 break;
3213
3214 case OP_WHITESPACE:
3215 for (i = 1; i <= min; i++)
3216 if ((pcre_ctypes[*eptr++] & ctype_space) == 0) return FALSE;
3217 break;
3218
3219 case OP_NOT_WORDCHAR:
3220 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
3221 return FALSE;
3222 break;
3223
3224 case OP_WORDCHAR:
3225 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
3226 return FALSE;
3227 break;
3228 }
3229
3230 /* If min = max, continue at the same level without recursing */
3231
3232 if (min == max) continue;
3233
3234 /* If minimizing, we have to test the rest of the pattern before each
3235 subsequent match, so inlining isn't much help; just use the function. */
3236
3237 if (minimize)
3238 {
3239 for (i = min;; i++)
3240 {
3241 if (match(eptr, ecode, offset_top, md)) return TRUE;
3242 if (i >= max || eptr >= md->end_subject ||
3243 !match_type(ctype, *eptr++, md->dotall))
3244 return FALSE;
3245 }
3246 /* Control never gets here */
3247 }
3248
3249 /* If maximizing it is worth using inline code for speed, doing the type
3250 test once at the start (i.e. keep it out of the loop). */
3251
3252 else
3253 {
3254 const uschar *pp = eptr;
3255 switch(ctype)
3256 {
3257 case OP_ANY:
3258 if (!md->dotall)
3259 {
3260 for (i = min; i < max; i++)
3261 {
3262 if (eptr >= md->end_subject || *eptr == '\n') break;
3263 eptr++;
3264 }
3265 }
3266 else
3267 {
3268 c = max - min;
3269 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3270 eptr += c;
3271 }
3272 break;
3273
3274 case OP_NOT_DIGIT:
3275 for (i = min; i < max; i++)
3276 {
3277 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
3278 break;
3279 eptr++;
3280 }
3281 break;
3282
3283 case OP_DIGIT:
3284 for (i = min; i < max; i++)
3285 {
3286 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
3287 break;
3288 eptr++;
3289 }
3290 break;
3291
3292 case OP_NOT_WHITESPACE:
3293 for (i = min; i < max; i++)
3294 {
3295 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
3296 break;
3297 eptr++;
3298 }
3299 break;
3300
3301 case OP_WHITESPACE:
3302 for (i = min; i < max; i++)
3303 {
3304 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
3305 break;
3306 eptr++;
3307 }
3308 break;
3309
3310 case OP_NOT_WORDCHAR:
3311 for (i = min; i < max; i++)
3312 {
3313 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
3314 break;
3315 eptr++;
3316 }
3317 break;
3318
3319 case OP_WORDCHAR:
3320 for (i = min; i < max; i++)
3321 {
3322 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
3323 break;
3324 eptr++;
3325 }
3326 break;
3327 }
3328
3329 while (eptr >= pp)
3330 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3331 return FALSE;
3332 }
3333 /* Control never gets here */
3334
3335 /* There's been some horrible disaster. */
3336
3337 default:
3338 DPRINTF(("Unknown opcode %d\n", *ecode));
3339 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
3340 return FALSE;
3341 }
3342
3343 /* Do not stick any code in here without much thought; it is assumed
3344 that "continue" in the code above comes out to here to repeat the main
3345 loop. */
3346
3347 } /* End of main loop */
3348 /* Control never reaches here */
3349 }
3350
3351
3352
3353 /*************************************************
3354 * Segregate setjmp() *
3355 *************************************************/
3356
3357 /* The -Wall option of gcc gives warnings for all local variables when setjmp()
3358 is used, even if the coding conforms to the rules of ANSI C. To avoid this, we
3359 hide it in a separate function. This is called only when PCRE_EXTRA is set,
3360 since it's needed only for the extension \X option, and with any luck, a good
3361 compiler will spot the tail recursion and compile it efficiently.
3362
3363 Arguments:
3364 eptr pointer in subject
3365 ecode position in code
3366 offset_top current top pointer
3367 md pointer to "static" info for the match
3368
3369 Returns: TRUE if matched
3370 */
3371
3372 static BOOL
3373 match_with_setjmp(const uschar *eptr, const uschar *ecode, int offset_top,
3374 match_data *match_block)
3375 {
3376 return setjmp(match_block->fail_env) == 0 &&
3377 match(eptr, ecode, offset_top, match_block);
3378 }
3379
3380
3381
3382 /*************************************************
3383 * Execute a Regular Expression *
3384 *************************************************/
3385
3386 /* This function applies a compiled re to a subject string and picks out
3387 portions of the string if it matches. Two elements in the vector are set for
3388 each substring: the offsets to the start and end of the substring.
3389
3390 Arguments:
3391 external_re points to the compiled expression
3392 external_extra points to "hints" from pcre_study() or is NULL
3393 subject points to the subject string
3394 length length of subject string (may contain binary zeros)
3395 options option bits
3396 offsets points to a vector of ints to be filled in with offsets
3397 offsetcount the number of elements in the vector
3398
3399 Returns: > 0 => success; value is the number of elements filled in
3400 = 0 => success, but offsets is not big enough
3401 -1 => failed to match
3402 < -1 => some kind of unexpected problem
3403 */
3404
3405 int
3406 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
3407 const char *subject, int length, int options, int *offsets, int offsetcount)
3408 {
3409 int resetcount, ocount;
3410 int first_char = -1;
3411 match_data match_block;
3412 const uschar *start_bits = NULL;
3413 const uschar *start_match = (const uschar *)subject;
3414 const uschar *end_subject;
3415 const real_pcre *re = (const real_pcre *)external_re;
3416 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
3417 BOOL using_temporary_offsets = FALSE;
3418 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3419 BOOL startline = (re->options & PCRE_STARTLINE) != 0;
3420
3421 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3422
3423 if (re == NULL || subject == NULL ||
3424 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3425 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
3426
3427 match_block.start_subject = (const uschar *)subject;
3428 match_block.end_subject = match_block.start_subject + length;
3429 end_subject = match_block.end_subject;
3430
3431 match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0;
3432 match_block.runtime_caseless = match_block.caseless &&
3433 (re->options & PCRE_CASELESS) == 0;
3434
3435 match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
3436 match_block.dotall = ((re->options | options) & PCRE_DOTALL) != 0;
3437 match_block.endonly = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
3438
3439 match_block.notbol = (options & PCRE_NOTBOL) != 0;
3440 match_block.noteol = (options & PCRE_NOTEOL) != 0;
3441
3442 match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
3443
3444 /* If the expression has got more back references than the offsets supplied can
3445 hold, we get a temporary bit of working store to use during the matching.
3446 Otherwise, we can use the vector supplied, rounding down its size to a multiple
3447 of 2. */
3448
3449 ocount = offsetcount & (-2);
3450 if (re->top_backref > 0 && re->top_backref >= ocount/2)
3451 {
3452 ocount = re->top_backref * 2 + 2;
3453 match_block.offset_vector = (pcre_malloc)(ocount * sizeof(int));
3454 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3455 using_temporary_offsets = TRUE;
3456 DPRINTF(("Got memory to hold back references\n"));
3457 }
3458 else match_block.offset_vector = offsets;
3459
3460 match_block.offset_end = ocount;
3461 match_block.offset_overflow = FALSE;
3462
3463 /* Compute the minimum number of offsets that we need to reset each time. Doing
3464 this makes a huge difference to execution time when there aren't many brackets
3465 in the pattern. */
3466
3467 resetcount = 2 + re->top_bracket * 2;
3468 if (resetcount > offsetcount) resetcount = ocount;
3469
3470 /* If MULTILINE is set at exec time but was not set at compile time, and the
3471 anchored flag is set, we must re-check because a setting provoked by ^ in the
3472 pattern is not right in multi-line mode. Calling is_anchored() again here does
3473 the right check, because multiline is now set. If it now yields FALSE, the
3474 expression must have had ^ starting some of its branches. Check to see if
3475 that is true for *all* branches, and if so, set the startline flag. */
3476
3477 if (match_block. multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
3478 !is_anchored(re->code, match_block.multiline))
3479 {
3480 anchored = FALSE;
3481 if (is_startline(re->code)) startline = TRUE;
3482 }
3483
3484 /* Set up the first character to match, if available. The first_char value is
3485 never set for an anchored regular expression, but the anchoring may be forced
3486 at run time, so we have to test for anchoring. The first char may be unset for
3487 an unanchored pattern, of course. If there's no first char and the pattern was
3488 studied, the may be a bitmap of possible first characters. However, we can
3489 use this only if the caseless state of the studying was correct. */
3490
3491 if (!anchored)
3492 {
3493 if ((re->options & PCRE_FIRSTSET) != 0)
3494 {
3495 first_char = re->first_char;
3496 if (match_block.caseless) first_char = pcre_lcc[first_char];
3497 }
3498 else
3499 if (!startline && extra != NULL &&
3500 (extra->options & PCRE_STUDY_MAPPED) != 0 &&
3501 ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
3502 start_bits = extra->start_bits;
3503 }
3504
3505 /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
3506
3507 do
3508 {
3509 int rc;
3510 register int *iptr = match_block.offset_vector;
3511 register int *iend = iptr + resetcount;
3512
3513 /* Reset the maximum number of extractions we might see. */
3514
3515 while (iptr < iend) *iptr++ = -1;
3516
3517 /* Advance to a unique first char if possible */
3518
3519 if (first_char >= 0)
3520 {
3521 if (match_block.caseless)
3522 while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
3523 start_match++;
3524 else
3525 while (start_match < end_subject && *start_match != first_char)
3526 start_match++;
3527 }
3528
3529 /* Or to just after \n for a multiline match if possible */
3530
3531 else if (startline)
3532 {
3533 if (start_match > match_block.start_subject)
3534 {
3535 while (start_match < end_subject && start_match[-1] != '\n')
3536 start_match++;
3537 }
3538 }
3539
3540 /* Or to a non-unique first char */
3541
3542 else if (start_bits != NULL)
3543 {
3544 while (start_match < end_subject)
3545 {
3546 register int c = *start_match;
3547 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
3548 }
3549 }
3550
3551 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3552 printf(">>>> Match against: ");
3553 pchars(start_match, end_subject - start_match, TRUE, &match_block);
3554 printf("\n");
3555 #endif
3556
3557 /* When a match occurs, substrings will be set for all internal extractions;
3558 we just need to set up the whole thing as substring 0 before returning. If
3559 there were too many extractions, set the return code to zero. In the case
3560 where we had to get some local store to hold offsets for backreferences, copy
3561 those back references that we can. In this case there need not be overflow
3562 if certain parts of the pattern were not used.
3563
3564 Before starting the match, we have to set up a longjmp() target to enable
3565 the "cut" operation to fail a match completely without backtracking. This
3566 is done in a separate function to avoid compiler warnings. We need not do
3567 it unless PCRE_EXTRA is set, since only in that case is the "cut" operation
3568 enabled. */
3569
3570 if ((re->options & PCRE_EXTRA) != 0)
3571 {
3572 if (!match_with_setjmp(start_match, re->code, 2, &match_block))
3573 continue;
3574 }
3575 else if (!match(start_match, re->code, 2, &match_block)) continue;
3576
3577 /* Copy the offset information from temporary store if necessary */
3578
3579 if (using_temporary_offsets)
3580 {
3581 if (offsetcount >= 4)
3582 {
3583 memcpy(offsets + 2, match_block.offset_vector + 2,
3584 (offsetcount - 2) * sizeof(int));
3585 DPRINTF(("Copied offsets from temporary memory\n"));
3586 }
3587 if (match_block.end_offset_top > offsetcount)
3588 match_block.offset_overflow = TRUE;
3589
3590 DPRINTF(("Freeing temporary memory\n"));
3591 (pcre_free)(match_block.offset_vector);
3592 }
3593
3594 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
3595
3596 if (match_block.offset_end < 2) rc = 0; else
3597 {
3598 offsets[0] = start_match - match_block.start_subject;
3599 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
3600 }
3601
3602 DPRINTF((">>>> returning %d\n", rc));
3603 return rc;
3604 }
3605 while (!anchored &&
3606 match_block.errorcode == PCRE_ERROR_NOMATCH &&
3607 start_match++ < end_subject);
3608
3609 if (using_temporary_offsets)
3610 {
3611 DPRINTF(("Freeing temporary memory\n"));
3612 (pcre_free)(match_block.offset_vector);
3613 }
3614
3615 DPRINTF((">>>> returning %d\n", match_block.errorcode));
3616
3617 return match_block.errorcode;
3618 }
3619
3620 /* End of pcre.c */

  ViewVC Help
Powered by ViewVC 1.1.5