/[pcre]/code/trunk/pcre.c
ViewVC logotype

Contents of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 51 - (show annotations)
Sat Feb 24 21:39:37 2007 UTC (12 years, 3 months ago) by nigel
File MIME type: text/plain
File size: 152081 byte(s)
Load pcre-3.4 into code/trunk.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1997-2000 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28
29 4. If PCRE is embedded in any software that is released under the GNU
30 General Purpose Licence (GPL), then the terms of that licence shall
31 supersede any condition above with which it is incompatible.
32 -----------------------------------------------------------------------------
33 */
34
35
36 /* Define DEBUG to get debugging output on stdout. */
37
38 /* #define DEBUG */
39
40 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
41 inline, and there are *still* stupid compilers about that don't like indented
42 pre-processor statements. I suppose it's only been 10 years... */
43
44 #ifdef DEBUG
45 #define DPRINTF(p) printf p
46 #else
47 #define DPRINTF(p) /*nothing*/
48 #endif
49
50 /* Include the internals header, which itself includes Standard C headers plus
51 the external pcre header. */
52
53 #include "internal.h"
54
55
56 /* Allow compilation as C++ source code, should anybody want to do that. */
57
58 #ifdef __cplusplus
59 #define class pcre_class
60 #endif
61
62
63 /* Number of items on the nested bracket stacks at compile time. This should
64 not be set greater than 200. */
65
66 #define BRASTACK_SIZE 200
67
68
69 /* The number of bytes in a literal character string above which we can't add
70 any more is different when UTF-8 characters may be encountered. */
71
72 #ifdef SUPPORT_UTF8
73 #define MAXLIT 250
74 #else
75 #define MAXLIT 255
76 #endif
77
78
79 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
80
81 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
82 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
83
84 /* Text forms of OP_ values and things, for debugging (not all used) */
85
86 #ifdef DEBUG
87 static const char *OP_names[] = {
88 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
89 "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
90 "Opt", "^", "$", "Any", "chars", "not",
91 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
92 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
93 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
94 "*", "*?", "+", "+?", "?", "??", "{", "{",
95 "class", "Ref", "Recurse",
96 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
97 "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
98 "Brazero", "Braminzero", "Bra"
99 };
100 #endif
101
102 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
103 are simple data values; negative values are for special things like \d and so
104 on. Zero means further processing is needed (for things like \x), or the escape
105 is invalid. */
106
107 static const short int escapes[] = {
108 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
109 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
110 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
111 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
112 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
113 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
114 '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
115 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
116 0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
117 0, 0, -ESC_z /* x - z */
118 };
119
120 /* Tables of names of POSIX character classes and their lengths. The list is
121 terminated by a zero length entry. The first three must be alpha, upper, lower,
122 as this is assumed for handling case independence. */
123
124 static const char *posix_names[] = {
125 "alpha", "lower", "upper",
126 "alnum", "ascii", "cntrl", "digit", "graph",
127 "print", "punct", "space", "word", "xdigit" };
128
129 static const uschar posix_name_lengths[] = {
130 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
131
132 /* Table of class bit maps for each POSIX class; up to three may be combined
133 to form the class. */
134
135 static const int posix_class_maps[] = {
136 cbit_lower, cbit_upper, -1, /* alpha */
137 cbit_lower, -1, -1, /* lower */
138 cbit_upper, -1, -1, /* upper */
139 cbit_digit, cbit_lower, cbit_upper, /* alnum */
140 cbit_print, cbit_cntrl, -1, /* ascii */
141 cbit_cntrl, -1, -1, /* cntrl */
142 cbit_digit, -1, -1, /* digit */
143 cbit_graph, -1, -1, /* graph */
144 cbit_print, -1, -1, /* print */
145 cbit_punct, -1, -1, /* punct */
146 cbit_space, -1, -1, /* space */
147 cbit_word, -1, -1, /* word */
148 cbit_xdigit,-1, -1 /* xdigit */
149 };
150
151
152 /* Definition to allow mutual recursion */
153
154 static BOOL
155 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
156 BOOL, int, int *, int *, compile_data *);
157
158 /* Structure for building a chain of data that actually lives on the
159 stack, for holding the values of the subject pointer at the start of each
160 subpattern, so as to detect when an empty string has been matched by a
161 subpattern - to break infinite loops. */
162
163 typedef struct eptrblock {
164 struct eptrblock *prev;
165 const uschar *saved_eptr;
166 } eptrblock;
167
168 /* Flag bits for the match() function */
169
170 #define match_condassert 0x01 /* Called to check a condition assertion */
171 #define match_isgroup 0x02 /* Set if start of bracketed group */
172
173
174
175 /*************************************************
176 * Global variables *
177 *************************************************/
178
179 /* PCRE is thread-clean and doesn't use any global variables in the normal
180 sense. However, it calls memory allocation and free functions via the two
181 indirections below, which are can be changed by the caller, but are shared
182 between all threads. */
183
184 void *(*pcre_malloc)(size_t) = malloc;
185 void (*pcre_free)(void *) = free;
186
187
188
189 /*************************************************
190 * Macros and tables for character handling *
191 *************************************************/
192
193 /* When UTF-8 encoding is being used, a character is no longer just a single
194 byte. The macros for character handling generate simple sequences when used in
195 byte-mode, and more complicated ones for UTF-8 characters. */
196
197 #ifndef SUPPORT_UTF8
198 #define GETCHARINC(c, eptr) c = *eptr++;
199 #define GETCHARLEN(c, eptr, len) c = *eptr;
200 #define BACKCHAR(eptr)
201
202 #else /* SUPPORT_UTF8 */
203
204 /* Get the next UTF-8 character, advancing the pointer */
205
206 #define GETCHARINC(c, eptr) \
207 c = *eptr++; \
208 if (md->utf8 && (c & 0xc0) == 0xc0) \
209 { \
210 int a = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
211 int s = 6 - a; /* Amount to shift next byte */ \
212 c &= utf8_table3[a]; /* Low order bits from first byte */ \
213 while (a-- > 0) \
214 { \
215 c |= (*eptr++ & 0x3f) << s; \
216 s += 6; \
217 } \
218 }
219
220 /* Get the next UTF-8 character, not advancing the pointer, setting length */
221
222 #define GETCHARLEN(c, eptr, len) \
223 c = *eptr; \
224 len = 1; \
225 if (md->utf8 && (c & 0xc0) == 0xc0) \
226 { \
227 int i; \
228 int a = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
229 int s = 6 - a; /* Amount to shift next byte */ \
230 c &= utf8_table3[a]; /* Low order bits from first byte */ \
231 for (i = 1; i <= a; i++) \
232 { \
233 c |= (eptr[i] & 0x3f) << s; \
234 s += 6; \
235 } \
236 len += a; \
237 }
238
239 /* If the pointer is not at the start of a character, move it back until
240 it is. */
241
242 #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
243
244 #endif
245
246
247
248 /*************************************************
249 * Default character tables *
250 *************************************************/
251
252 /* A default set of character tables is included in the PCRE binary. Its source
253 is built by the maketables auxiliary program, which uses the default C ctypes
254 functions, and put in the file chartables.c. These tables are used by PCRE
255 whenever the caller of pcre_compile() does not provide an alternate set of
256 tables. */
257
258 #include "chartables.c"
259
260
261
262 #ifdef SUPPORT_UTF8
263 /*************************************************
264 * Tables for UTF-8 support *
265 *************************************************/
266
267 /* These are the breakpoints for different numbers of bytes in a UTF-8
268 character. */
269
270 static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
271
272 /* These are the indicator bits and the mask for the data bits to set in the
273 first byte of a character, indexed by the number of additional bytes. */
274
275 static int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
276 static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
277
278 /* Table of the number of extra characters, indexed by the first character
279 masked with 0x3f. The highest number for a valid UTF-8 character is in fact
280 0x3d. */
281
282 static uschar utf8_table4[] = {
283 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
284 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
285 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
286 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
287
288
289 /*************************************************
290 * Convert character value to UTF-8 *
291 *************************************************/
292
293 /* This function takes an integer value in the range 0 - 0x7fffffff
294 and encodes it as a UTF-8 character in 0 to 6 bytes.
295
296 Arguments:
297 cvalue the character value
298 buffer pointer to buffer for result - at least 6 bytes long
299
300 Returns: number of characters placed in the buffer
301 */
302
303 static int
304 ord2utf8(int cvalue, uschar *buffer)
305 {
306 register int i, j;
307 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
308 if (cvalue <= utf8_table1[i]) break;
309 *buffer++ = utf8_table2[i] | (cvalue & utf8_table3[i]);
310 cvalue >>= 6 - i;
311 for (j = 0; j < i; j++)
312 {
313 *buffer++ = 0x80 | (cvalue & 0x3f);
314 cvalue >>= 6;
315 }
316 return i + 1;
317 }
318 #endif
319
320
321
322 /*************************************************
323 * Return version string *
324 *************************************************/
325
326 #define STRING(a) # a
327 #define XSTRING(s) STRING(s)
328
329 const char *
330 pcre_version(void)
331 {
332 return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
333 }
334
335
336
337
338 /*************************************************
339 * (Obsolete) Return info about compiled pattern *
340 *************************************************/
341
342 /* This is the original "info" function. It picks potentially useful data out
343 of the private structure, but its interface was too rigid. It remains for
344 backwards compatibility. The public options are passed back in an int - though
345 the re->options field has been expanded to a long int, all the public options
346 at the low end of it, and so even on 16-bit systems this will still be OK.
347 Therefore, I haven't changed the API for pcre_info().
348
349 Arguments:
350 external_re points to compiled code
351 optptr where to pass back the options
352 first_char where to pass back the first character,
353 or -1 if multiline and all branches start ^,
354 or -2 otherwise
355
356 Returns: number of capturing subpatterns
357 or negative values on error
358 */
359
360 int
361 pcre_info(const pcre *external_re, int *optptr, int *first_char)
362 {
363 const real_pcre *re = (const real_pcre *)external_re;
364 if (re == NULL) return PCRE_ERROR_NULL;
365 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
366 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
367 if (first_char != NULL)
368 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
369 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
370 return re->top_bracket;
371 }
372
373
374
375 /*************************************************
376 * Return info about compiled pattern *
377 *************************************************/
378
379 /* This is a newer "info" function which has an extensible interface so
380 that additional items can be added compatibly.
381
382 Arguments:
383 external_re points to compiled code
384 external_study points to study data, or NULL
385 what what information is required
386 where where to put the information
387
388 Returns: 0 if data returned, negative on error
389 */
390
391 int
392 pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
393 void *where)
394 {
395 const real_pcre *re = (const real_pcre *)external_re;
396 const real_pcre_extra *study = (const real_pcre_extra *)study_data;
397
398 if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
399 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
400
401 switch (what)
402 {
403 case PCRE_INFO_OPTIONS:
404 *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
405 break;
406
407 case PCRE_INFO_SIZE:
408 *((size_t *)where) = re->size;
409 break;
410
411 case PCRE_INFO_CAPTURECOUNT:
412 *((int *)where) = re->top_bracket;
413 break;
414
415 case PCRE_INFO_BACKREFMAX:
416 *((int *)where) = re->top_backref;
417 break;
418
419 case PCRE_INFO_FIRSTCHAR:
420 *((int *)where) =
421 ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
422 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
423 break;
424
425 case PCRE_INFO_FIRSTTABLE:
426 *((const uschar **)where) =
427 (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
428 study->start_bits : NULL;
429 break;
430
431 case PCRE_INFO_LASTLITERAL:
432 *((int *)where) =
433 ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
434 break;
435
436 default: return PCRE_ERROR_BADOPTION;
437 }
438
439 return 0;
440 }
441
442
443
444 #ifdef DEBUG
445 /*************************************************
446 * Debugging function to print chars *
447 *************************************************/
448
449 /* Print a sequence of chars in printable format, stopping at the end of the
450 subject if the requested.
451
452 Arguments:
453 p points to characters
454 length number to print
455 is_subject TRUE if printing from within md->start_subject
456 md pointer to matching data block, if is_subject is TRUE
457
458 Returns: nothing
459 */
460
461 static void
462 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
463 {
464 int c;
465 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
466 while (length-- > 0)
467 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
468 }
469 #endif
470
471
472
473
474 /*************************************************
475 * Handle escapes *
476 *************************************************/
477
478 /* This function is called when a \ has been encountered. It either returns a
479 positive value for a simple escape such as \n, or a negative value which
480 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
481 a positive value greater than 255 may be returned. On entry, ptr is pointing at
482 the \. On exit, it is on the final character of the escape sequence.
483
484 Arguments:
485 ptrptr points to the pattern position pointer
486 errorptr points to the pointer to the error message
487 bracount number of previous extracting brackets
488 options the options bits
489 isclass TRUE if inside a character class
490 cd pointer to char tables block
491
492 Returns: zero or positive => a data character
493 negative => a special escape sequence
494 on error, errorptr is set
495 */
496
497 static int
498 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
499 int options, BOOL isclass, compile_data *cd)
500 {
501 const uschar *ptr = *ptrptr;
502 int c, i;
503
504 /* If backslash is at the end of the pattern, it's an error. */
505
506 c = *(++ptr);
507 if (c == 0) *errorptr = ERR1;
508
509 /* Digits or letters may have special meaning; all others are literals. */
510
511 else if (c < '0' || c > 'z') {}
512
513 /* Do an initial lookup in a table. A non-zero result is something that can be
514 returned immediately. Otherwise further processing may be required. */
515
516 else if ((i = escapes[c - '0']) != 0) c = i;
517
518 /* Escapes that need further processing, or are illegal. */
519
520 else
521 {
522 const uschar *oldptr;
523 switch (c)
524 {
525 /* The handling of escape sequences consisting of a string of digits
526 starting with one that is not zero is not straightforward. By experiment,
527 the way Perl works seems to be as follows:
528
529 Outside a character class, the digits are read as a decimal number. If the
530 number is less than 10, or if there are that many previous extracting
531 left brackets, then it is a back reference. Otherwise, up to three octal
532 digits are read to form an escaped byte. Thus \123 is likely to be octal
533 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
534 value is greater than 377, the least significant 8 bits are taken. Inside a
535 character class, \ followed by a digit is always an octal number. */
536
537 case '1': case '2': case '3': case '4': case '5':
538 case '6': case '7': case '8': case '9':
539
540 if (!isclass)
541 {
542 oldptr = ptr;
543 c -= '0';
544 while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
545 c = c * 10 + *(++ptr) - '0';
546 if (c < 10 || c <= bracount)
547 {
548 c = -(ESC_REF + c);
549 break;
550 }
551 ptr = oldptr; /* Put the pointer back and fall through */
552 }
553
554 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
555 generates a binary zero byte and treats the digit as a following literal.
556 Thus we have to pull back the pointer by one. */
557
558 if ((c = *ptr) >= '8')
559 {
560 ptr--;
561 c = 0;
562 break;
563 }
564
565 /* \0 always starts an octal number, but we may drop through to here with a
566 larger first octal digit. */
567
568 case '0':
569 c -= '0';
570 while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
571 ptr[1] != '8' && ptr[1] != '9')
572 c = c * 8 + *(++ptr) - '0';
573 c &= 255; /* Take least significant 8 bits */
574 break;
575
576 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
577 which can be greater than 0xff, but only if the ddd are hex digits. */
578
579 case 'x':
580 #ifdef SUPPORT_UTF8
581 if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
582 {
583 const uschar *pt = ptr + 2;
584 register int count = 0;
585 c = 0;
586 while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
587 {
588 count++;
589 c = c * 16 + cd->lcc[*pt] -
590 (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');
591 pt++;
592 }
593 if (*pt == '}')
594 {
595 if (c < 0 || count > 8) *errorptr = ERR34;
596 ptr = pt;
597 break;
598 }
599 /* If the sequence of hex digits does not end with '}', then we don't
600 recognize this construct; fall through to the normal \x handling. */
601 }
602 #endif
603
604 /* Read just a single hex char */
605
606 c = 0;
607 while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
608 {
609 ptr++;
610 c = c * 16 + cd->lcc[*ptr] -
611 (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
612 }
613 break;
614
615 /* Other special escapes not starting with a digit are straightforward */
616
617 case 'c':
618 c = *(++ptr);
619 if (c == 0)
620 {
621 *errorptr = ERR2;
622 return 0;
623 }
624
625 /* A letter is upper-cased; then the 0x40 bit is flipped */
626
627 if (c >= 'a' && c <= 'z') c = cd->fcc[c];
628 c ^= 0x40;
629 break;
630
631 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
632 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
633 for Perl compatibility, it is a literal. This code looks a bit odd, but
634 there used to be some cases other than the default, and there may be again
635 in future, so I haven't "optimized" it. */
636
637 default:
638 if ((options & PCRE_EXTRA) != 0) switch(c)
639 {
640 default:
641 *errorptr = ERR3;
642 break;
643 }
644 break;
645 }
646 }
647
648 *ptrptr = ptr;
649 return c;
650 }
651
652
653
654 /*************************************************
655 * Check for counted repeat *
656 *************************************************/
657
658 /* This function is called when a '{' is encountered in a place where it might
659 start a quantifier. It looks ahead to see if it really is a quantifier or not.
660 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
661 where the ddds are digits.
662
663 Arguments:
664 p pointer to the first char after '{'
665 cd pointer to char tables block
666
667 Returns: TRUE or FALSE
668 */
669
670 static BOOL
671 is_counted_repeat(const uschar *p, compile_data *cd)
672 {
673 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
674 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
675 if (*p == '}') return TRUE;
676
677 if (*p++ != ',') return FALSE;
678 if (*p == '}') return TRUE;
679
680 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
681 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
682 return (*p == '}');
683 }
684
685
686
687 /*************************************************
688 * Read repeat counts *
689 *************************************************/
690
691 /* Read an item of the form {n,m} and return the values. This is called only
692 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
693 so the syntax is guaranteed to be correct, but we need to check the values.
694
695 Arguments:
696 p pointer to first char after '{'
697 minp pointer to int for min
698 maxp pointer to int for max
699 returned as -1 if no max
700 errorptr points to pointer to error message
701 cd pointer to character tables clock
702
703 Returns: pointer to '}' on success;
704 current ptr on error, with errorptr set
705 */
706
707 static const uschar *
708 read_repeat_counts(const uschar *p, int *minp, int *maxp,
709 const char **errorptr, compile_data *cd)
710 {
711 int min = 0;
712 int max = -1;
713
714 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
715
716 if (*p == '}') max = min; else
717 {
718 if (*(++p) != '}')
719 {
720 max = 0;
721 while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
722 if (max < min)
723 {
724 *errorptr = ERR4;
725 return p;
726 }
727 }
728 }
729
730 /* Do paranoid checks, then fill in the required variables, and pass back the
731 pointer to the terminating '}'. */
732
733 if (min > 65535 || max > 65535)
734 *errorptr = ERR5;
735 else
736 {
737 *minp = min;
738 *maxp = max;
739 }
740 return p;
741 }
742
743
744
745 /*************************************************
746 * Find the fixed length of a pattern *
747 *************************************************/
748
749 /* Scan a pattern and compute the fixed length of subject that will match it,
750 if the length is fixed. This is needed for dealing with backward assertions.
751
752 Arguments:
753 code points to the start of the pattern (the bracket)
754 options the compiling options
755
756 Returns: the fixed length, or -1 if there is no fixed length
757 */
758
759 static int
760 find_fixedlength(uschar *code, int options)
761 {
762 int length = -1;
763
764 register int branchlength = 0;
765 register uschar *cc = code + 3;
766
767 /* Scan along the opcodes for this branch. If we get to the end of the
768 branch, check the length against that of the other branches. */
769
770 for (;;)
771 {
772 int d;
773 register int op = *cc;
774 if (op >= OP_BRA) op = OP_BRA;
775
776 switch (op)
777 {
778 case OP_BRA:
779 case OP_ONCE:
780 case OP_COND:
781 d = find_fixedlength(cc, options);
782 if (d < 0) return -1;
783 branchlength += d;
784 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
785 cc += 3;
786 break;
787
788 /* Reached end of a branch; if it's a ket it is the end of a nested
789 call. If it's ALT it is an alternation in a nested call. If it is
790 END it's the end of the outer call. All can be handled by the same code. */
791
792 case OP_ALT:
793 case OP_KET:
794 case OP_KETRMAX:
795 case OP_KETRMIN:
796 case OP_END:
797 if (length < 0) length = branchlength;
798 else if (length != branchlength) return -1;
799 if (*cc != OP_ALT) return length;
800 cc += 3;
801 branchlength = 0;
802 break;
803
804 /* Skip over assertive subpatterns */
805
806 case OP_ASSERT:
807 case OP_ASSERT_NOT:
808 case OP_ASSERTBACK:
809 case OP_ASSERTBACK_NOT:
810 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
811 cc += 3;
812 break;
813
814 /* Skip over things that don't match chars */
815
816 case OP_REVERSE:
817 cc++;
818 /* Fall through */
819
820 case OP_CREF:
821 case OP_OPT:
822 cc++;
823 /* Fall through */
824
825 case OP_SOD:
826 case OP_EOD:
827 case OP_EODN:
828 case OP_CIRC:
829 case OP_DOLL:
830 case OP_NOT_WORD_BOUNDARY:
831 case OP_WORD_BOUNDARY:
832 cc++;
833 break;
834
835 /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
836 This requires a scan of the string, unfortunately. We assume valid UTF-8
837 strings, so all we do is reduce the length by one for byte whose bits are
838 10xxxxxx. */
839
840 case OP_CHARS:
841 branchlength += *(++cc);
842 #ifdef SUPPORT_UTF8
843 for (d = 1; d <= *cc; d++)
844 if ((cc[d] & 0xc0) == 0x80) branchlength--;
845 #endif
846 cc += *cc + 1;
847 break;
848
849 /* Handle exact repetitions */
850
851 case OP_EXACT:
852 case OP_TYPEEXACT:
853 branchlength += (cc[1] << 8) + cc[2];
854 cc += 4;
855 break;
856
857 /* Handle single-char matchers */
858
859 case OP_NOT_DIGIT:
860 case OP_DIGIT:
861 case OP_NOT_WHITESPACE:
862 case OP_WHITESPACE:
863 case OP_NOT_WORDCHAR:
864 case OP_WORDCHAR:
865 case OP_ANY:
866 branchlength++;
867 cc++;
868 break;
869
870
871 /* Check a class for variable quantification */
872
873 case OP_CLASS:
874 cc += (*cc == OP_REF)? 2 : 33;
875
876 switch (*cc)
877 {
878 case OP_CRSTAR:
879 case OP_CRMINSTAR:
880 case OP_CRQUERY:
881 case OP_CRMINQUERY:
882 return -1;
883
884 case OP_CRRANGE:
885 case OP_CRMINRANGE:
886 if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
887 branchlength += (cc[1] << 8) + cc[2];
888 cc += 5;
889 break;
890
891 default:
892 branchlength++;
893 }
894 break;
895
896 /* Anything else is variable length */
897
898 default:
899 return -1;
900 }
901 }
902 /* Control never gets here */
903 }
904
905
906
907
908 /*************************************************
909 * Check for POSIX class syntax *
910 *************************************************/
911
912 /* This function is called when the sequence "[:" or "[." or "[=" is
913 encountered in a character class. It checks whether this is followed by an
914 optional ^ and then a sequence of letters, terminated by a matching ":]" or
915 ".]" or "=]".
916
917 Argument:
918 ptr pointer to the initial [
919 endptr where to return the end pointer
920 cd pointer to compile data
921
922 Returns: TRUE or FALSE
923 */
924
925 static BOOL
926 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
927 {
928 int terminator; /* Don't combine these lines; the Solaris cc */
929 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
930 if (*(++ptr) == '^') ptr++;
931 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
932 if (*ptr == terminator && ptr[1] == ']')
933 {
934 *endptr = ptr;
935 return TRUE;
936 }
937 return FALSE;
938 }
939
940
941
942
943 /*************************************************
944 * Check POSIX class name *
945 *************************************************/
946
947 /* This function is called to check the name given in a POSIX-style class entry
948 such as [:alnum:].
949
950 Arguments:
951 ptr points to the first letter
952 len the length of the name
953
954 Returns: a value representing the name, or -1 if unknown
955 */
956
957 static int
958 check_posix_name(const uschar *ptr, int len)
959 {
960 register int yield = 0;
961 while (posix_name_lengths[yield] != 0)
962 {
963 if (len == posix_name_lengths[yield] &&
964 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
965 yield++;
966 }
967 return -1;
968 }
969
970
971
972
973 /*************************************************
974 * Compile one branch *
975 *************************************************/
976
977 /* Scan the pattern, compiling it into the code vector.
978
979 Arguments:
980 options the option bits
981 brackets points to number of brackets used
982 code points to the pointer to the current code point
983 ptrptr points to the current pattern pointer
984 errorptr points to pointer to error message
985 optchanged set to the value of the last OP_OPT item compiled
986 reqchar set to the last literal character required, else -1
987 countlits set to count of mandatory literal characters
988 cd contains pointers to tables
989
990 Returns: TRUE on success
991 FALSE, with *errorptr set on error
992 */
993
994 static BOOL
995 compile_branch(int options, int *brackets, uschar **codeptr,
996 const uschar **ptrptr, const char **errorptr, int *optchanged,
997 int *reqchar, int *countlits, compile_data *cd)
998 {
999 int repeat_type, op_type;
1000 int repeat_min, repeat_max;
1001 int bravalue, length;
1002 int greedy_default, greedy_non_default;
1003 int prevreqchar;
1004 int condcount = 0;
1005 int subcountlits = 0;
1006 register int c;
1007 register uschar *code = *codeptr;
1008 uschar *tempcode;
1009 const uschar *ptr = *ptrptr;
1010 const uschar *tempptr;
1011 uschar *previous = NULL;
1012 uschar class[32];
1013
1014 /* Set up the default and non-default settings for greediness */
1015
1016 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1017 greedy_non_default = greedy_default ^ 1;
1018
1019 /* Initialize no required char, and count of literals */
1020
1021 *reqchar = prevreqchar = -1;
1022 *countlits = 0;
1023
1024 /* Switch on next character until the end of the branch */
1025
1026 for (;; ptr++)
1027 {
1028 BOOL negate_class;
1029 int class_charcount;
1030 int class_lastchar;
1031 int newoptions;
1032 int condref;
1033 int subreqchar;
1034
1035 c = *ptr;
1036 if ((options & PCRE_EXTENDED) != 0)
1037 {
1038 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1039 if (c == '#')
1040 {
1041 /* The space before the ; is to avoid a warning on a silly compiler
1042 on the Macintosh. */
1043 while ((c = *(++ptr)) != 0 && c != '\n') ;
1044 continue;
1045 }
1046 }
1047
1048 switch(c)
1049 {
1050 /* The branch terminates at end of string, |, or ). */
1051
1052 case 0:
1053 case '|':
1054 case ')':
1055 *codeptr = code;
1056 *ptrptr = ptr;
1057 return TRUE;
1058
1059 /* Handle single-character metacharacters */
1060
1061 case '^':
1062 previous = NULL;
1063 *code++ = OP_CIRC;
1064 break;
1065
1066 case '$':
1067 previous = NULL;
1068 *code++ = OP_DOLL;
1069 break;
1070
1071 case '.':
1072 previous = code;
1073 *code++ = OP_ANY;
1074 break;
1075
1076 /* Character classes. These always build a 32-byte bitmap of the permitted
1077 characters, except in the special case where there is only one character.
1078 For negated classes, we build the map as usual, then invert it at the end.
1079 */
1080
1081 case '[':
1082 previous = code;
1083 *code++ = OP_CLASS;
1084
1085 /* If the first character is '^', set the negation flag and skip it. */
1086
1087 if ((c = *(++ptr)) == '^')
1088 {
1089 negate_class = TRUE;
1090 c = *(++ptr);
1091 }
1092 else negate_class = FALSE;
1093
1094 /* Keep a count of chars so that we can optimize the case of just a single
1095 character. */
1096
1097 class_charcount = 0;
1098 class_lastchar = -1;
1099
1100 /* Initialize the 32-char bit map to all zeros. We have to build the
1101 map in a temporary bit of store, in case the class contains only 1
1102 character, because in that case the compiled code doesn't use the
1103 bit map. */
1104
1105 memset(class, 0, 32 * sizeof(uschar));
1106
1107 /* Process characters until ] is reached. By writing this as a "do" it
1108 means that an initial ] is taken as a data character. */
1109
1110 do
1111 {
1112 if (c == 0)
1113 {
1114 *errorptr = ERR6;
1115 goto FAILED;
1116 }
1117
1118 /* Handle POSIX class names. Perl allows a negation extension of the
1119 form [:^name]. A square bracket that doesn't match the syntax is
1120 treated as a literal. We also recognize the POSIX constructions
1121 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1122 5.6 does. */
1123
1124 if (c == '[' &&
1125 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1126 check_posix_syntax(ptr, &tempptr, cd))
1127 {
1128 BOOL local_negate = FALSE;
1129 int posix_class, i;
1130 register const uschar *cbits = cd->cbits;
1131
1132 if (ptr[1] != ':')
1133 {
1134 *errorptr = ERR31;
1135 goto FAILED;
1136 }
1137
1138 ptr += 2;
1139 if (*ptr == '^')
1140 {
1141 local_negate = TRUE;
1142 ptr++;
1143 }
1144
1145 posix_class = check_posix_name(ptr, tempptr - ptr);
1146 if (posix_class < 0)
1147 {
1148 *errorptr = ERR30;
1149 goto FAILED;
1150 }
1151
1152 /* If matching is caseless, upper and lower are converted to
1153 alpha. This relies on the fact that the class table starts with
1154 alpha, lower, upper as the first 3 entries. */
1155
1156 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1157 posix_class = 0;
1158
1159 /* Or into the map we are building up to 3 of the static class
1160 tables, or their negations. */
1161
1162 posix_class *= 3;
1163 for (i = 0; i < 3; i++)
1164 {
1165 int taboffset = posix_class_maps[posix_class + i];
1166 if (taboffset < 0) break;
1167 if (local_negate)
1168 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1169 else
1170 for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1171 }
1172
1173 ptr = tempptr + 1;
1174 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1175 continue;
1176 }
1177
1178 /* Backslash may introduce a single character, or it may introduce one
1179 of the specials, which just set a flag. Escaped items are checked for
1180 validity in the pre-compiling pass. The sequence \b is a special case.
1181 Inside a class (and only there) it is treated as backspace. Elsewhere
1182 it marks a word boundary. Other escapes have preset maps ready to
1183 or into the one we are building. We assume they have more than one
1184 character in them, so set class_count bigger than one. */
1185
1186 if (c == '\\')
1187 {
1188 c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1189 if (-c == ESC_b) c = '\b';
1190 else if (c < 0)
1191 {
1192 register const uschar *cbits = cd->cbits;
1193 class_charcount = 10;
1194 switch (-c)
1195 {
1196 case ESC_d:
1197 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1198 continue;
1199
1200 case ESC_D:
1201 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1202 continue;
1203
1204 case ESC_w:
1205 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
1206 continue;
1207
1208 case ESC_W:
1209 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
1210 continue;
1211
1212 case ESC_s:
1213 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1214 continue;
1215
1216 case ESC_S:
1217 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1218 continue;
1219
1220 default:
1221 *errorptr = ERR7;
1222 goto FAILED;
1223 }
1224 }
1225
1226 /* Fall through if single character, but don't at present allow
1227 chars > 255 in UTF-8 mode. */
1228
1229 #ifdef SUPPORT_UTF8
1230 if (c > 255)
1231 {
1232 *errorptr = ERR33;
1233 goto FAILED;
1234 }
1235 #endif
1236 }
1237
1238 /* A single character may be followed by '-' to form a range. However,
1239 Perl does not permit ']' to be the end of the range. A '-' character
1240 here is treated as a literal. */
1241
1242 if (ptr[1] == '-' && ptr[2] != ']')
1243 {
1244 int d;
1245 ptr += 2;
1246 d = *ptr;
1247
1248 if (d == 0)
1249 {
1250 *errorptr = ERR6;
1251 goto FAILED;
1252 }
1253
1254 /* The second part of a range can be a single-character escape, but
1255 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1256 in such circumstances. */
1257
1258 if (d == '\\')
1259 {
1260 const uschar *oldptr = ptr;
1261 d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1262
1263 #ifdef SUPPORT_UTF8
1264 if (d > 255)
1265 {
1266 *errorptr = ERR33;
1267 goto FAILED;
1268 }
1269 #endif
1270 /* \b is backslash; any other special means the '-' was literal */
1271
1272 if (d < 0)
1273 {
1274 if (d == -ESC_b) d = '\b'; else
1275 {
1276 ptr = oldptr - 2;
1277 goto SINGLE_CHARACTER; /* A few lines below */
1278 }
1279 }
1280 }
1281
1282 if (d < c)
1283 {
1284 *errorptr = ERR8;
1285 goto FAILED;
1286 }
1287
1288 for (; c <= d; c++)
1289 {
1290 class[c/8] |= (1 << (c&7));
1291 if ((options & PCRE_CASELESS) != 0)
1292 {
1293 int uc = cd->fcc[c]; /* flip case */
1294 class[uc/8] |= (1 << (uc&7));
1295 }
1296 class_charcount++; /* in case a one-char range */
1297 class_lastchar = c;
1298 }
1299 continue; /* Go get the next char in the class */
1300 }
1301
1302 /* Handle a lone single character - we can get here for a normal
1303 non-escape char, or after \ that introduces a single character. */
1304
1305 SINGLE_CHARACTER:
1306
1307 class [c/8] |= (1 << (c&7));
1308 if ((options & PCRE_CASELESS) != 0)
1309 {
1310 c = cd->fcc[c]; /* flip case */
1311 class[c/8] |= (1 << (c&7));
1312 }
1313 class_charcount++;
1314 class_lastchar = c;
1315 }
1316
1317 /* Loop until ']' reached; the check for end of string happens inside the
1318 loop. This "while" is the end of the "do" above. */
1319
1320 while ((c = *(++ptr)) != ']');
1321
1322 /* If class_charcount is 1 and class_lastchar is not negative, we saw
1323 precisely one character. This doesn't need the whole 32-byte bit map.
1324 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1325 it's negative. */
1326
1327 if (class_charcount == 1 && class_lastchar >= 0)
1328 {
1329 if (negate_class)
1330 {
1331 code[-1] = OP_NOT;
1332 }
1333 else
1334 {
1335 code[-1] = OP_CHARS;
1336 *code++ = 1;
1337 }
1338 *code++ = class_lastchar;
1339 }
1340
1341 /* Otherwise, negate the 32-byte map if necessary, and copy it into
1342 the code vector. */
1343
1344 else
1345 {
1346 if (negate_class)
1347 for (c = 0; c < 32; c++) code[c] = ~class[c];
1348 else
1349 memcpy(code, class, 32);
1350 code += 32;
1351 }
1352 break;
1353
1354 /* Various kinds of repeat */
1355
1356 case '{':
1357 if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
1358 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
1359 if (*errorptr != NULL) goto FAILED;
1360 goto REPEAT;
1361
1362 case '*':
1363 repeat_min = 0;
1364 repeat_max = -1;
1365 goto REPEAT;
1366
1367 case '+':
1368 repeat_min = 1;
1369 repeat_max = -1;
1370 goto REPEAT;
1371
1372 case '?':
1373 repeat_min = 0;
1374 repeat_max = 1;
1375
1376 REPEAT:
1377 if (previous == NULL)
1378 {
1379 *errorptr = ERR9;
1380 goto FAILED;
1381 }
1382
1383 /* If the next character is '?' this is a minimizing repeat, by default,
1384 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1385 next character. */
1386
1387 if (ptr[1] == '?')
1388 { repeat_type = greedy_non_default; ptr++; }
1389 else repeat_type = greedy_default;
1390
1391 /* If previous was a string of characters, chop off the last one and use it
1392 as the subject of the repeat. If there was only one character, we can
1393 abolish the previous item altogether. A repeat with a zero minimum wipes
1394 out any reqchar setting, backing up to the previous value. We must also
1395 adjust the countlits value. */
1396
1397 if (*previous == OP_CHARS)
1398 {
1399 int len = previous[1];
1400
1401 if (repeat_min == 0) *reqchar = prevreqchar;
1402 *countlits += repeat_min - 1;
1403
1404 if (len == 1)
1405 {
1406 c = previous[2];
1407 code = previous;
1408 }
1409 else
1410 {
1411 c = previous[len+1];
1412 previous[1]--;
1413 code--;
1414 }
1415 op_type = 0; /* Use single-char op codes */
1416 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1417 }
1418
1419 /* If previous was a single negated character ([^a] or similar), we use
1420 one of the special opcodes, replacing it. The code is shared with single-
1421 character repeats by adding a suitable offset into repeat_type. */
1422
1423 else if ((int)*previous == OP_NOT)
1424 {
1425 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1426 c = previous[1];
1427 code = previous;
1428 goto OUTPUT_SINGLE_REPEAT;
1429 }
1430
1431 /* If previous was a character type match (\d or similar), abolish it and
1432 create a suitable repeat item. The code is shared with single-character
1433 repeats by adding a suitable offset into repeat_type. */
1434
1435 else if ((int)*previous < OP_EODN || *previous == OP_ANY)
1436 {
1437 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1438 c = *previous;
1439 code = previous;
1440
1441 OUTPUT_SINGLE_REPEAT:
1442
1443 /* If the maximum is zero then the minimum must also be zero; Perl allows
1444 this case, so we do too - by simply omitting the item altogether. */
1445
1446 if (repeat_max == 0) goto END_REPEAT;
1447
1448 /* Combine the op_type with the repeat_type */
1449
1450 repeat_type += op_type;
1451
1452 /* A minimum of zero is handled either as the special case * or ?, or as
1453 an UPTO, with the maximum given. */
1454
1455 if (repeat_min == 0)
1456 {
1457 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1458 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1459 else
1460 {
1461 *code++ = OP_UPTO + repeat_type;
1462 *code++ = repeat_max >> 8;
1463 *code++ = (repeat_max & 255);
1464 }
1465 }
1466
1467 /* The case {1,} is handled as the special case + */
1468
1469 else if (repeat_min == 1 && repeat_max == -1)
1470 *code++ = OP_PLUS + repeat_type;
1471
1472 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1473 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1474
1475 else
1476 {
1477 if (repeat_min != 1)
1478 {
1479 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1480 *code++ = repeat_min >> 8;
1481 *code++ = (repeat_min & 255);
1482 }
1483
1484 /* If the mininum is 1 and the previous item was a character string,
1485 we either have to put back the item that got cancelled if the string
1486 length was 1, or add the character back onto the end of a longer
1487 string. For a character type nothing need be done; it will just get
1488 put back naturally. Note that the final character is always going to
1489 get added below. */
1490
1491 else if (*previous == OP_CHARS)
1492 {
1493 if (code == previous) code += 2; else previous[1]++;
1494 }
1495
1496 /* For a single negated character we also have to put back the
1497 item that got cancelled. */
1498
1499 else if (*previous == OP_NOT) code++;
1500
1501 /* If the maximum is unlimited, insert an OP_STAR. */
1502
1503 if (repeat_max < 0)
1504 {
1505 *code++ = c;
1506 *code++ = OP_STAR + repeat_type;
1507 }
1508
1509 /* Else insert an UPTO if the max is greater than the min. */
1510
1511 else if (repeat_max != repeat_min)
1512 {
1513 *code++ = c;
1514 repeat_max -= repeat_min;
1515 *code++ = OP_UPTO + repeat_type;
1516 *code++ = repeat_max >> 8;
1517 *code++ = (repeat_max & 255);
1518 }
1519 }
1520
1521 /* The character or character type itself comes last in all cases. */
1522
1523 *code++ = c;
1524 }
1525
1526 /* If previous was a character class or a back reference, we put the repeat
1527 stuff after it, but just skip the item if the repeat was {0,0}. */
1528
1529 else if (*previous == OP_CLASS || *previous == OP_REF)
1530 {
1531 if (repeat_max == 0)
1532 {
1533 code = previous;
1534 goto END_REPEAT;
1535 }
1536 if (repeat_min == 0 && repeat_max == -1)
1537 *code++ = OP_CRSTAR + repeat_type;
1538 else if (repeat_min == 1 && repeat_max == -1)
1539 *code++ = OP_CRPLUS + repeat_type;
1540 else if (repeat_min == 0 && repeat_max == 1)
1541 *code++ = OP_CRQUERY + repeat_type;
1542 else
1543 {
1544 *code++ = OP_CRRANGE + repeat_type;
1545 *code++ = repeat_min >> 8;
1546 *code++ = repeat_min & 255;
1547 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1548 *code++ = repeat_max >> 8;
1549 *code++ = repeat_max & 255;
1550 }
1551 }
1552
1553 /* If previous was a bracket group, we may have to replicate it in certain
1554 cases. */
1555
1556 else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1557 (int)*previous == OP_COND)
1558 {
1559 register int i;
1560 int ketoffset = 0;
1561 int len = code - previous;
1562 uschar *bralink = NULL;
1563
1564 /* If the maximum repeat count is unlimited, find the end of the bracket
1565 by scanning through from the start, and compute the offset back to it
1566 from the current code pointer. There may be an OP_OPT setting following
1567 the final KET, so we can't find the end just by going back from the code
1568 pointer. */
1569
1570 if (repeat_max == -1)
1571 {
1572 register uschar *ket = previous;
1573 do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1574 ketoffset = code - ket;
1575 }
1576
1577 /* The case of a zero minimum is special because of the need to stick
1578 OP_BRAZERO in front of it, and because the group appears once in the
1579 data, whereas in other cases it appears the minimum number of times. For
1580 this reason, it is simplest to treat this case separately, as otherwise
1581 the code gets far too mess. There are several special subcases when the
1582 minimum is zero. */
1583
1584 if (repeat_min == 0)
1585 {
1586 /* If we set up a required char from the bracket, we must back off
1587 to the previous value and reset the countlits value too. */
1588
1589 if (subcountlits > 0)
1590 {
1591 *reqchar = prevreqchar;
1592 *countlits -= subcountlits;
1593 }
1594
1595 /* If the maximum is also zero, we just omit the group from the output
1596 altogether. */
1597
1598 if (repeat_max == 0)
1599 {
1600 code = previous;
1601 goto END_REPEAT;
1602 }
1603
1604 /* If the maximum is 1 or unlimited, we just have to stick in the
1605 BRAZERO and do no more at this point. */
1606
1607 if (repeat_max <= 1)
1608 {
1609 memmove(previous+1, previous, len);
1610 code++;
1611 *previous++ = OP_BRAZERO + repeat_type;
1612 }
1613
1614 /* If the maximum is greater than 1 and limited, we have to replicate
1615 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1616 The first one has to be handled carefully because it's the original
1617 copy, which has to be moved up. The remainder can be handled by code
1618 that is common with the non-zero minimum case below. We just have to
1619 adjust the value or repeat_max, since one less copy is required. */
1620
1621 else
1622 {
1623 int offset;
1624 memmove(previous+4, previous, len);
1625 code += 4;
1626 *previous++ = OP_BRAZERO + repeat_type;
1627 *previous++ = OP_BRA;
1628
1629 /* We chain together the bracket offset fields that have to be
1630 filled in later when the ends of the brackets are reached. */
1631
1632 offset = (bralink == NULL)? 0 : previous - bralink;
1633 bralink = previous;
1634 *previous++ = offset >> 8;
1635 *previous++ = offset & 255;
1636 }
1637
1638 repeat_max--;
1639 }
1640
1641 /* If the minimum is greater than zero, replicate the group as many
1642 times as necessary, and adjust the maximum to the number of subsequent
1643 copies that we need. */
1644
1645 else
1646 {
1647 for (i = 1; i < repeat_min; i++)
1648 {
1649 memcpy(code, previous, len);
1650 code += len;
1651 }
1652 if (repeat_max > 0) repeat_max -= repeat_min;
1653 }
1654
1655 /* This code is common to both the zero and non-zero minimum cases. If
1656 the maximum is limited, it replicates the group in a nested fashion,
1657 remembering the bracket starts on a stack. In the case of a zero minimum,
1658 the first one was set up above. In all cases the repeat_max now specifies
1659 the number of additional copies needed. */
1660
1661 if (repeat_max >= 0)
1662 {
1663 for (i = repeat_max - 1; i >= 0; i--)
1664 {
1665 *code++ = OP_BRAZERO + repeat_type;
1666
1667 /* All but the final copy start a new nesting, maintaining the
1668 chain of brackets outstanding. */
1669
1670 if (i != 0)
1671 {
1672 int offset;
1673 *code++ = OP_BRA;
1674 offset = (bralink == NULL)? 0 : code - bralink;
1675 bralink = code;
1676 *code++ = offset >> 8;
1677 *code++ = offset & 255;
1678 }
1679
1680 memcpy(code, previous, len);
1681 code += len;
1682 }
1683
1684 /* Now chain through the pending brackets, and fill in their length
1685 fields (which are holding the chain links pro tem). */
1686
1687 while (bralink != NULL)
1688 {
1689 int oldlinkoffset;
1690 int offset = code - bralink + 1;
1691 uschar *bra = code - offset;
1692 oldlinkoffset = (bra[1] << 8) + bra[2];
1693 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1694 *code++ = OP_KET;
1695 *code++ = bra[1] = offset >> 8;
1696 *code++ = bra[2] = (offset & 255);
1697 }
1698 }
1699
1700 /* If the maximum is unlimited, set a repeater in the final copy. We
1701 can't just offset backwards from the current code point, because we
1702 don't know if there's been an options resetting after the ket. The
1703 correct offset was computed above. */
1704
1705 else code[-ketoffset] = OP_KETRMAX + repeat_type;
1706 }
1707
1708 /* Else there's some kind of shambles */
1709
1710 else
1711 {
1712 *errorptr = ERR11;
1713 goto FAILED;
1714 }
1715
1716 /* In all case we no longer have a previous item. */
1717
1718 END_REPEAT:
1719 previous = NULL;
1720 break;
1721
1722
1723 /* Start of nested bracket sub-expression, or comment or lookahead or
1724 lookbehind or option setting or condition. First deal with special things
1725 that can come after a bracket; all are introduced by ?, and the appearance
1726 of any of them means that this is not a referencing group. They were
1727 checked for validity in the first pass over the string, so we don't have to
1728 check for syntax errors here. */
1729
1730 case '(':
1731 newoptions = options;
1732 condref = -1;
1733
1734 if (*(++ptr) == '?')
1735 {
1736 int set, unset;
1737 int *optset;
1738
1739 switch (*(++ptr))
1740 {
1741 case '#': /* Comment; skip to ket */
1742 ptr++;
1743 while (*ptr != ')') ptr++;
1744 continue;
1745
1746 case ':': /* Non-extracting bracket */
1747 bravalue = OP_BRA;
1748 ptr++;
1749 break;
1750
1751 case '(':
1752 bravalue = OP_COND; /* Conditional group */
1753 if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1754 {
1755 condref = *ptr - '0';
1756 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1757 if (condref == 0)
1758 {
1759 *errorptr = ERR35;
1760 goto FAILED;
1761 }
1762 ptr++;
1763 }
1764 else ptr--;
1765 break;
1766
1767 case '=': /* Positive lookahead */
1768 bravalue = OP_ASSERT;
1769 ptr++;
1770 break;
1771
1772 case '!': /* Negative lookahead */
1773 bravalue = OP_ASSERT_NOT;
1774 ptr++;
1775 break;
1776
1777 case '<': /* Lookbehinds */
1778 switch (*(++ptr))
1779 {
1780 case '=': /* Positive lookbehind */
1781 bravalue = OP_ASSERTBACK;
1782 ptr++;
1783 break;
1784
1785 case '!': /* Negative lookbehind */
1786 bravalue = OP_ASSERTBACK_NOT;
1787 ptr++;
1788 break;
1789
1790 default: /* Syntax error */
1791 *errorptr = ERR24;
1792 goto FAILED;
1793 }
1794 break;
1795
1796 case '>': /* One-time brackets */
1797 bravalue = OP_ONCE;
1798 ptr++;
1799 break;
1800
1801 case 'R': /* Pattern recursion */
1802 *code++ = OP_RECURSE;
1803 ptr++;
1804 continue;
1805
1806 default: /* Option setting */
1807 set = unset = 0;
1808 optset = &set;
1809
1810 while (*ptr != ')' && *ptr != ':')
1811 {
1812 switch (*ptr++)
1813 {
1814 case '-': optset = &unset; break;
1815
1816 case 'i': *optset |= PCRE_CASELESS; break;
1817 case 'm': *optset |= PCRE_MULTILINE; break;
1818 case 's': *optset |= PCRE_DOTALL; break;
1819 case 'x': *optset |= PCRE_EXTENDED; break;
1820 case 'U': *optset |= PCRE_UNGREEDY; break;
1821 case 'X': *optset |= PCRE_EXTRA; break;
1822
1823 default:
1824 *errorptr = ERR12;
1825 goto FAILED;
1826 }
1827 }
1828
1829 /* Set up the changed option bits, but don't change anything yet. */
1830
1831 newoptions = (options | set) & (~unset);
1832
1833 /* If the options ended with ')' this is not the start of a nested
1834 group with option changes, so the options change at this level. At top
1835 level there is nothing else to be done (the options will in fact have
1836 been set from the start of compiling as a result of the first pass) but
1837 at an inner level we must compile code to change the ims options if
1838 necessary, and pass the new setting back so that it can be put at the
1839 start of any following branches, and when this group ends, a resetting
1840 item can be compiled. */
1841
1842 if (*ptr == ')')
1843 {
1844 if ((options & PCRE_INGROUP) != 0 &&
1845 (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1846 {
1847 *code++ = OP_OPT;
1848 *code++ = *optchanged = newoptions & PCRE_IMS;
1849 }
1850 options = newoptions; /* Change options at this level */
1851 previous = NULL; /* This item can't be repeated */
1852 continue; /* It is complete */
1853 }
1854
1855 /* If the options ended with ':' we are heading into a nested group
1856 with possible change of options. Such groups are non-capturing and are
1857 not assertions of any kind. All we need to do is skip over the ':';
1858 the newoptions value is handled below. */
1859
1860 bravalue = OP_BRA;
1861 ptr++;
1862 }
1863 }
1864
1865 /* Else we have a referencing group; adjust the opcode. */
1866
1867 else
1868 {
1869 if (++(*brackets) > EXTRACT_MAX)
1870 {
1871 *errorptr = ERR13;
1872 goto FAILED;
1873 }
1874 bravalue = OP_BRA + *brackets;
1875 }
1876
1877 /* Process nested bracketed re. Assertions may not be repeated, but other
1878 kinds can be. We copy code into a non-register variable in order to be able
1879 to pass its address because some compilers complain otherwise. Pass in a
1880 new setting for the ims options if they have changed. */
1881
1882 previous = (bravalue >= OP_ONCE)? code : NULL;
1883 *code = bravalue;
1884 tempcode = code;
1885
1886 if (!compile_regex(
1887 options | PCRE_INGROUP, /* Set for all nested groups */
1888 ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1889 newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1890 brackets, /* Bracket level */
1891 &tempcode, /* Where to put code (updated) */
1892 &ptr, /* Input pointer (updated) */
1893 errorptr, /* Where to put an error message */
1894 (bravalue == OP_ASSERTBACK ||
1895 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1896 condref, /* Condition reference number */
1897 &subreqchar, /* For possible last char */
1898 &subcountlits, /* For literal count */
1899 cd)) /* Tables block */
1900 goto FAILED;
1901
1902 /* At the end of compiling, code is still pointing to the start of the
1903 group, while tempcode has been updated to point past the end of the group
1904 and any option resetting that may follow it. The pattern pointer (ptr)
1905 is on the bracket. */
1906
1907 /* If this is a conditional bracket, check that there are no more than
1908 two branches in the group. */
1909
1910 if (bravalue == OP_COND)
1911 {
1912 uschar *tc = code;
1913 condcount = 0;
1914
1915 do {
1916 condcount++;
1917 tc += (tc[1] << 8) | tc[2];
1918 }
1919 while (*tc != OP_KET);
1920
1921 if (condcount > 2)
1922 {
1923 *errorptr = ERR27;
1924 goto FAILED;
1925 }
1926 }
1927
1928 /* Handle updating of the required character. If the subpattern didn't
1929 set one, leave it as it was. Otherwise, update it for normal brackets of
1930 all kinds, forward assertions, and conditions with two branches. Don't
1931 update the literal count for forward assertions, however. If the bracket
1932 is followed by a quantifier with zero repeat, we have to back off. Hence
1933 the definition of prevreqchar and subcountlits outside the main loop so
1934 that they can be accessed for the back off. */
1935
1936 if (subreqchar > 0 &&
1937 (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1938 (bravalue == OP_COND && condcount == 2)))
1939 {
1940 prevreqchar = *reqchar;
1941 *reqchar = subreqchar;
1942 if (bravalue != OP_ASSERT) *countlits += subcountlits;
1943 }
1944
1945 /* Now update the main code pointer to the end of the group. */
1946
1947 code = tempcode;
1948
1949 /* Error if hit end of pattern */
1950
1951 if (*ptr != ')')
1952 {
1953 *errorptr = ERR14;
1954 goto FAILED;
1955 }
1956 break;
1957
1958 /* Check \ for being a real metacharacter; if not, fall through and handle
1959 it as a data character at the start of a string. Escape items are checked
1960 for validity in the pre-compiling pass. */
1961
1962 case '\\':
1963 tempptr = ptr;
1964 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1965
1966 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1967 are arranged to be the negation of the corresponding OP_values. For the
1968 back references, the values are ESC_REF plus the reference number. Only
1969 back references and those types that consume a character may be repeated.
1970 We can test for values between ESC_b and ESC_Z for the latter; this may
1971 have to change if any new ones are ever created. */
1972
1973 if (c < 0)
1974 {
1975 if (-c >= ESC_REF)
1976 {
1977 previous = code;
1978 *code++ = OP_REF;
1979 *code++ = -c - ESC_REF;
1980 }
1981 else
1982 {
1983 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1984 *code++ = -c;
1985 }
1986 continue;
1987 }
1988
1989 /* Data character: reset and fall through */
1990
1991 ptr = tempptr;
1992 c = '\\';
1993
1994 /* Handle a run of data characters until a metacharacter is encountered.
1995 The first character is guaranteed not to be whitespace or # when the
1996 extended flag is set. */
1997
1998 NORMAL_CHAR:
1999 default:
2000 previous = code;
2001 *code = OP_CHARS;
2002 code += 2;
2003 length = 0;
2004
2005 do
2006 {
2007 if ((options & PCRE_EXTENDED) != 0)
2008 {
2009 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2010 if (c == '#')
2011 {
2012 /* The space before the ; is to avoid a warning on a silly compiler
2013 on the Macintosh. */
2014 while ((c = *(++ptr)) != 0 && c != '\n') ;
2015 if (c == 0) break;
2016 continue;
2017 }
2018 }
2019
2020 /* Backslash may introduce a data char or a metacharacter. Escaped items
2021 are checked for validity in the pre-compiling pass. Stop the string
2022 before a metaitem. */
2023
2024 if (c == '\\')
2025 {
2026 tempptr = ptr;
2027 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
2028 if (c < 0) { ptr = tempptr; break; }
2029
2030 /* If a character is > 127 in UTF-8 mode, we have to turn it into
2031 two or more characters in the UTF-8 encoding. */
2032
2033 #ifdef SUPPORT_UTF8
2034 if (c > 127 && (options & PCRE_UTF8) != 0)
2035 {
2036 uschar buffer[8];
2037 int len = ord2utf8(c, buffer);
2038 for (c = 0; c < len; c++) *code++ = buffer[c];
2039 length += len;
2040 continue;
2041 }
2042 #endif
2043 }
2044
2045 /* Ordinary character or single-char escape */
2046
2047 *code++ = c;
2048 length++;
2049 }
2050
2051 /* This "while" is the end of the "do" above. */
2052
2053 while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
2054
2055 /* Update the last character and the count of literals */
2056
2057 prevreqchar = (length > 1)? code[-2] : *reqchar;
2058 *reqchar = code[-1];
2059 *countlits += length;
2060
2061 /* Compute the length and set it in the data vector, and advance to
2062 the next state. */
2063
2064 previous[1] = length;
2065 if (length < MAXLIT) ptr--;
2066 break;
2067 }
2068 } /* end of big loop */
2069
2070 /* Control never reaches here by falling through, only by a goto for all the
2071 error states. Pass back the position in the pattern so that it can be displayed
2072 to the user for diagnosing the error. */
2073
2074 FAILED:
2075 *ptrptr = ptr;
2076 return FALSE;
2077 }
2078
2079
2080
2081
2082 /*************************************************
2083 * Compile sequence of alternatives *
2084 *************************************************/
2085
2086 /* On entry, ptr is pointing past the bracket character, but on return
2087 it points to the closing bracket, or vertical bar, or end of string.
2088 The code variable is pointing at the byte into which the BRA operator has been
2089 stored. If the ims options are changed at the start (for a (?ims: group) or
2090 during any branch, we need to insert an OP_OPT item at the start of every
2091 following branch to ensure they get set correctly at run time, and also pass
2092 the new options into every subsequent branch compile.
2093
2094 Argument:
2095 options the option bits
2096 optchanged new ims options to set as if (?ims) were at the start, or -1
2097 for no change
2098 brackets -> int containing the number of extracting brackets used
2099 codeptr -> the address of the current code pointer
2100 ptrptr -> the address of the current pattern pointer
2101 errorptr -> pointer to error message
2102 lookbehind TRUE if this is a lookbehind assertion
2103 condref >= 0 for OPT_CREF setting at start of conditional group
2104 reqchar -> place to put the last required character, or a negative number
2105 countlits -> place to put the shortest literal count of any branch
2106 cd points to the data block with tables pointers
2107
2108 Returns: TRUE on success
2109 */
2110
2111 static BOOL
2112 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
2113 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
2114 int *reqchar, int *countlits, compile_data *cd)
2115 {
2116 const uschar *ptr = *ptrptr;
2117 uschar *code = *codeptr;
2118 uschar *last_branch = code;
2119 uschar *start_bracket = code;
2120 uschar *reverse_count = NULL;
2121 int oldoptions = options & PCRE_IMS;
2122 int branchreqchar, branchcountlits;
2123
2124 *reqchar = -1;
2125 *countlits = INT_MAX;
2126 code += 3;
2127
2128 /* At the start of a reference-based conditional group, insert the reference
2129 number as an OP_CREF item. */
2130
2131 if (condref >= 0)
2132 {
2133 *code++ = OP_CREF;
2134 *code++ = condref;
2135 }
2136
2137 /* Loop for each alternative branch */
2138
2139 for (;;)
2140 {
2141 int length;
2142
2143 /* Handle change of options */
2144
2145 if (optchanged >= 0)
2146 {
2147 *code++ = OP_OPT;
2148 *code++ = optchanged;
2149 options = (options & ~PCRE_IMS) | optchanged;
2150 }
2151
2152 /* Set up dummy OP_REVERSE if lookbehind assertion */
2153
2154 if (lookbehind)
2155 {
2156 *code++ = OP_REVERSE;
2157 reverse_count = code;
2158 *code++ = 0;
2159 *code++ = 0;
2160 }
2161
2162 /* Now compile the branch */
2163
2164 if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
2165 &branchreqchar, &branchcountlits, cd))
2166 {
2167 *ptrptr = ptr;
2168 return FALSE;
2169 }
2170
2171 /* Fill in the length of the last branch */
2172
2173 length = code - last_branch;
2174 last_branch[1] = length >> 8;
2175 last_branch[2] = length & 255;
2176
2177 /* Save the last required character if all branches have the same; a current
2178 value of -1 means unset, while -2 means "previous branch had no last required
2179 char". */
2180
2181 if (*reqchar != -2)
2182 {
2183 if (branchreqchar >= 0)
2184 {
2185 if (*reqchar == -1) *reqchar = branchreqchar;
2186 else if (*reqchar != branchreqchar) *reqchar = -2;
2187 }
2188 else *reqchar = -2;
2189 }
2190
2191 /* Keep the shortest literal count */
2192
2193 if (branchcountlits < *countlits) *countlits = branchcountlits;
2194 DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
2195
2196 /* If lookbehind, check that this branch matches a fixed-length string,
2197 and put the length into the OP_REVERSE item. Temporarily mark the end of
2198 the branch with OP_END. */
2199
2200 if (lookbehind)
2201 {
2202 *code = OP_END;
2203 length = find_fixedlength(last_branch, options);
2204 DPRINTF(("fixed length = %d\n", length));
2205 if (length < 0)
2206 {
2207 *errorptr = ERR25;
2208 *ptrptr = ptr;
2209 return FALSE;
2210 }
2211 reverse_count[0] = (length >> 8);
2212 reverse_count[1] = length & 255;
2213 }
2214
2215 /* Reached end of expression, either ')' or end of pattern. Insert a
2216 terminating ket and the length of the whole bracketed item, and return,
2217 leaving the pointer at the terminating char. If any of the ims options
2218 were changed inside the group, compile a resetting op-code following. */
2219
2220 if (*ptr != '|')
2221 {
2222 length = code - start_bracket;
2223 *code++ = OP_KET;
2224 *code++ = length >> 8;
2225 *code++ = length & 255;
2226 if (optchanged >= 0)
2227 {
2228 *code++ = OP_OPT;
2229 *code++ = oldoptions;
2230 }
2231 *codeptr = code;
2232 *ptrptr = ptr;
2233 return TRUE;
2234 }
2235
2236 /* Another branch follows; insert an "or" node and advance the pointer. */
2237
2238 *code = OP_ALT;
2239 last_branch = code;
2240 code += 3;
2241 ptr++;
2242 }
2243 /* Control never reaches here */
2244 }
2245
2246
2247
2248
2249 /*************************************************
2250 * Find first significant op code *
2251 *************************************************/
2252
2253 /* This is called by several functions that scan a compiled expression looking
2254 for a fixed first character, or an anchoring op code etc. It skips over things
2255 that do not influence this. For one application, a change of caseless option is
2256 important.
2257
2258 Arguments:
2259 code pointer to the start of the group
2260 options pointer to external options
2261 optbit the option bit whose changing is significant, or
2262 zero if none are
2263 optstop TRUE to return on option change, otherwise change the options
2264 value and continue
2265
2266 Returns: pointer to the first significant opcode
2267 */
2268
2269 static const uschar*
2270 first_significant_code(const uschar *code, int *options, int optbit,
2271 BOOL optstop)
2272 {
2273 for (;;)
2274 {
2275 switch ((int)*code)
2276 {
2277 case OP_OPT:
2278 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
2279 {
2280 if (optstop) return code;
2281 *options = (int)code[1];
2282 }
2283 code += 2;
2284 break;
2285
2286 case OP_CREF:
2287 code += 2;
2288 break;
2289
2290 case OP_WORD_BOUNDARY:
2291 case OP_NOT_WORD_BOUNDARY:
2292 code++;
2293 break;
2294
2295 case OP_ASSERT_NOT:
2296 case OP_ASSERTBACK:
2297 case OP_ASSERTBACK_NOT:
2298 do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
2299 code += 3;
2300 break;
2301
2302 default:
2303 return code;
2304 }
2305 }
2306 /* Control never reaches here */
2307 }
2308
2309
2310
2311
2312 /*************************************************
2313 * Check for anchored expression *
2314 *************************************************/
2315
2316 /* Try to find out if this is an anchored regular expression. Consider each
2317 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2318 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2319 it's anchored. However, if this is a multiline pattern, then only OP_SOD
2320 counts, since OP_CIRC can match in the middle.
2321
2322 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
2323 because that will try the rest of the pattern at all possible matching points,
2324 so there is no point trying them again.
2325
2326 Arguments:
2327 code points to start of expression (the bracket)
2328 options points to the options setting
2329
2330 Returns: TRUE or FALSE
2331 */
2332
2333 static BOOL
2334 is_anchored(register const uschar *code, int *options)
2335 {
2336 do {
2337 const uschar *scode = first_significant_code(code + 3, options,
2338 PCRE_MULTILINE, FALSE);
2339 register int op = *scode;
2340 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2341 { if (!is_anchored(scode, options)) return FALSE; }
2342 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
2343 (*options & PCRE_DOTALL) != 0)
2344 { if (scode[1] != OP_ANY) return FALSE; }
2345 else if (op != OP_SOD &&
2346 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
2347 return FALSE;
2348 code += (code[1] << 8) + code[2];
2349 }
2350 while (*code == OP_ALT);
2351 return TRUE;
2352 }
2353
2354
2355
2356 /*************************************************
2357 * Check for starting with ^ or .* *
2358 *************************************************/
2359
2360 /* This is called to find out if every branch starts with ^ or .* so that
2361 "first char" processing can be done to speed things up in multiline
2362 matching and for non-DOTALL patterns that start with .* (which must start at
2363 the beginning or after \n).
2364
2365 Argument: points to start of expression (the bracket)
2366 Returns: TRUE or FALSE
2367 */
2368
2369 static BOOL
2370 is_startline(const uschar *code)
2371 {
2372 do {
2373 const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
2374 register int op = *scode;
2375 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2376 { if (!is_startline(scode)) return FALSE; }
2377 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2378 { if (scode[1] != OP_ANY) return FALSE; }
2379 else if (op != OP_CIRC) return FALSE;
2380 code += (code[1] << 8) + code[2];
2381 }
2382 while (*code == OP_ALT);
2383 return TRUE;
2384 }
2385
2386
2387
2388 /*************************************************
2389 * Check for fixed first char *
2390 *************************************************/
2391
2392 /* Try to find out if there is a fixed first character. This is called for
2393 unanchored expressions, as it speeds up their processing quite considerably.
2394 Consider each alternative branch. If they all start with the same char, or with
2395 a bracket all of whose alternatives start with the same char (recurse ad lib),
2396 then we return that char, otherwise -1.
2397
2398 Arguments:
2399 code points to start of expression (the bracket)
2400 options pointer to the options (used to check casing changes)
2401
2402 Returns: -1 or the fixed first char
2403 */
2404
2405 static int
2406 find_firstchar(const uschar *code, int *options)
2407 {
2408 register int c = -1;
2409 do {
2410 int d;
2411 const uschar *scode = first_significant_code(code + 3, options,
2412 PCRE_CASELESS, TRUE);
2413 register int op = *scode;
2414
2415 if (op >= OP_BRA) op = OP_BRA;
2416
2417 switch(op)
2418 {
2419 default:
2420 return -1;
2421
2422 case OP_BRA:
2423 case OP_ASSERT:
2424 case OP_ONCE:
2425 case OP_COND:
2426 if ((d = find_firstchar(scode, options)) < 0) return -1;
2427 if (c < 0) c = d; else if (c != d) return -1;
2428 break;
2429
2430 case OP_EXACT: /* Fall through */
2431 scode++;
2432
2433 case OP_CHARS: /* Fall through */
2434 scode++;
2435
2436 case OP_PLUS:
2437 case OP_MINPLUS:
2438 if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
2439 break;
2440 }
2441
2442 code += (code[1] << 8) + code[2];
2443 }
2444 while (*code == OP_ALT);
2445 return c;
2446 }
2447
2448
2449
2450
2451
2452 /*************************************************
2453 * Compile a Regular Expression *
2454 *************************************************/
2455
2456 /* This function takes a string and returns a pointer to a block of store
2457 holding a compiled version of the expression.
2458
2459 Arguments:
2460 pattern the regular expression
2461 options various option bits
2462 errorptr pointer to pointer to error text
2463 erroroffset ptr offset in pattern where error was detected
2464 tables pointer to character tables or NULL
2465
2466 Returns: pointer to compiled data block, or NULL on error,
2467 with errorptr and erroroffset set
2468 */
2469
2470 pcre *
2471 pcre_compile(const char *pattern, int options, const char **errorptr,
2472 int *erroroffset, const unsigned char *tables)
2473 {
2474 real_pcre *re;
2475 int length = 3; /* For initial BRA plus length */
2476 int runlength;
2477 int c, reqchar, countlits;
2478 int bracount = 0;
2479 int top_backref = 0;
2480 int branch_extra = 0;
2481 int branch_newextra;
2482 unsigned int brastackptr = 0;
2483 size_t size;
2484 uschar *code;
2485 const uschar *ptr;
2486 compile_data compile_block;
2487 int brastack[BRASTACK_SIZE];
2488 uschar bralenstack[BRASTACK_SIZE];
2489
2490 #ifdef DEBUG
2491 uschar *code_base, *code_end;
2492 #endif
2493
2494 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
2495
2496 #ifndef SUPPORT_UTF8
2497 if ((options & PCRE_UTF8) != 0)
2498 {
2499 *errorptr = ERR32;
2500 return NULL;
2501 }
2502 #endif
2503
2504 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2505 can do is just return NULL. */
2506
2507 if (errorptr == NULL) return NULL;
2508 *errorptr = NULL;
2509
2510 /* However, we can give a message for this error */
2511
2512 if (erroroffset == NULL)
2513 {
2514 *errorptr = ERR16;
2515 return NULL;
2516 }
2517 *erroroffset = 0;
2518
2519 if ((options & ~PUBLIC_OPTIONS) != 0)
2520 {
2521 *errorptr = ERR17;
2522 return NULL;
2523 }
2524
2525 /* Set up pointers to the individual character tables */
2526
2527 if (tables == NULL) tables = pcre_default_tables;
2528 compile_block.lcc = tables + lcc_offset;
2529 compile_block.fcc = tables + fcc_offset;
2530 compile_block.cbits = tables + cbits_offset;
2531 compile_block.ctypes = tables + ctypes_offset;
2532
2533 /* Reflect pattern for debugging output */
2534
2535 DPRINTF(("------------------------------------------------------------------\n"));
2536 DPRINTF(("%s\n", pattern));
2537
2538 /* The first thing to do is to make a pass over the pattern to compute the
2539 amount of store required to hold the compiled code. This does not have to be
2540 perfect as long as errors are overestimates. At the same time we can detect any
2541 internal flag settings. Make an attempt to correct for any counted white space
2542 if an "extended" flag setting appears late in the pattern. We can't be so
2543 clever for #-comments. */
2544
2545 ptr = (const uschar *)(pattern - 1);
2546 while ((c = *(++ptr)) != 0)
2547 {
2548 int min, max;
2549 int class_charcount;
2550
2551 if ((options & PCRE_EXTENDED) != 0)
2552 {
2553 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2554 if (c == '#')
2555 {
2556 /* The space before the ; is to avoid a warning on a silly compiler
2557 on the Macintosh. */
2558 while ((c = *(++ptr)) != 0 && c != '\n') ;
2559 continue;
2560 }
2561 }
2562
2563 switch(c)
2564 {
2565 /* A backslashed item may be an escaped "normal" character or a
2566 character type. For a "normal" character, put the pointers and
2567 character back so that tests for whitespace etc. in the input
2568 are done correctly. */
2569
2570 case '\\':
2571 {
2572 const uschar *save_ptr = ptr;
2573 c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2574 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2575 if (c >= 0)
2576 {
2577 ptr = save_ptr;
2578 c = '\\';
2579 goto NORMAL_CHAR;
2580 }
2581 }
2582 length++;
2583
2584 /* A back reference needs an additional char, plus either one or 5
2585 bytes for a repeat. We also need to keep the value of the highest
2586 back reference. */
2587
2588 if (c <= -ESC_REF)
2589 {
2590 int refnum = -c - ESC_REF;
2591 if (refnum > top_backref) top_backref = refnum;
2592 length++; /* For single back reference */
2593 if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2594 {
2595 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2596 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2597 if ((min == 0 && (max == 1 || max == -1)) ||
2598 (min == 1 && max == -1))
2599 length++;
2600 else length += 5;
2601 if (ptr[1] == '?') ptr++;
2602 }
2603 }
2604 continue;
2605
2606 case '^':
2607 case '.':
2608 case '$':
2609 case '*': /* These repeats won't be after brackets; */
2610 case '+': /* those are handled separately */
2611 case '?':
2612 length++;
2613 continue;
2614
2615 /* This covers the cases of repeats after a single char, metachar, class,
2616 or back reference. */
2617
2618 case '{':
2619 if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2620 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2621 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2622 if ((min == 0 && (max == 1 || max == -1)) ||
2623 (min == 1 && max == -1))
2624 length++;
2625 else
2626 {
2627 length--; /* Uncount the original char or metachar */
2628 if (min == 1) length++; else if (min > 0) length += 4;
2629 if (max > 0) length += 4; else length += 2;
2630 }
2631 if (ptr[1] == '?') ptr++;
2632 continue;
2633
2634 /* An alternation contains an offset to the next branch or ket. If any ims
2635 options changed in the previous branch(es), and/or if we are in a
2636 lookbehind assertion, extra space will be needed at the start of the
2637 branch. This is handled by branch_extra. */
2638
2639 case '|':
2640 length += 3 + branch_extra;
2641 continue;
2642
2643 /* A character class uses 33 characters. Don't worry about character types
2644 that aren't allowed in classes - they'll get picked up during the compile.
2645 A character class that contains only one character uses 2 or 3 bytes,
2646 depending on whether it is negated or not. Notice this where we can. */
2647
2648 case '[':
2649 class_charcount = 0;
2650 if (*(++ptr) == '^') ptr++;
2651 do
2652 {
2653 if (*ptr == '\\')
2654 {
2655 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2656 &compile_block);
2657 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2658 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2659 }
2660 else class_charcount++;
2661 ptr++;
2662 }
2663 while (*ptr != 0 && *ptr != ']');
2664
2665 /* Repeats for negated single chars are handled by the general code */
2666
2667 if (class_charcount == 1) length += 3; else
2668 {
2669 length += 33;
2670
2671 /* A repeat needs either 1 or 5 bytes. */
2672
2673 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2674 {
2675 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2676 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2677 if ((min == 0 && (max == 1 || max == -1)) ||
2678 (min == 1 && max == -1))
2679 length++;
2680 else length += 5;
2681 if (ptr[1] == '?') ptr++;
2682 }
2683 }
2684 continue;
2685
2686 /* Brackets may be genuine groups or special things */
2687
2688 case '(':
2689 branch_newextra = 0;
2690
2691 /* Handle special forms of bracket, which all start (? */
2692
2693 if (ptr[1] == '?')
2694 {
2695 int set, unset;
2696 int *optset;
2697
2698 switch (c = ptr[2])
2699 {
2700 /* Skip over comments entirely */
2701 case '#':
2702 ptr += 3;
2703 while (*ptr != 0 && *ptr != ')') ptr++;
2704 if (*ptr == 0)
2705 {
2706 *errorptr = ERR18;
2707 goto PCRE_ERROR_RETURN;
2708 }
2709 continue;
2710
2711 /* Non-referencing groups and lookaheads just move the pointer on, and
2712 then behave like a non-special bracket, except that they don't increment
2713 the count of extracting brackets. Ditto for the "once only" bracket,
2714 which is in Perl from version 5.005. */
2715
2716 case ':':
2717 case '=':
2718 case '!':
2719 case '>':
2720 ptr += 2;
2721 break;
2722
2723 /* A recursive call to the regex is an extension, to provide the
2724 facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
2725
2726 case 'R':
2727 if (ptr[3] != ')')
2728 {
2729 *errorptr = ERR29;
2730 goto PCRE_ERROR_RETURN;
2731 }
2732 ptr += 3;
2733 length += 1;
2734 break;
2735
2736 /* Lookbehinds are in Perl from version 5.005 */
2737
2738 case '<':
2739 if (ptr[3] == '=' || ptr[3] == '!')
2740 {
2741 ptr += 3;
2742 branch_newextra = 3;
2743 length += 3; /* For the first branch */
2744 break;
2745 }
2746 *errorptr = ERR24;
2747 goto PCRE_ERROR_RETURN;
2748
2749 /* Conditionals are in Perl from version 5.005. The bracket must either
2750 be followed by a number (for bracket reference) or by an assertion
2751 group. */
2752
2753 case '(':
2754 if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2755 {
2756 ptr += 4;
2757 length += 2;
2758 while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2759 if (*ptr != ')')
2760 {
2761 *errorptr = ERR26;
2762 goto PCRE_ERROR_RETURN;
2763 }
2764 }
2765 else /* An assertion must follow */
2766 {
2767 ptr++; /* Can treat like ':' as far as spacing is concerned */
2768 if (ptr[2] != '?' ||
2769 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
2770 {
2771 ptr += 2; /* To get right offset in message */
2772 *errorptr = ERR28;
2773 goto PCRE_ERROR_RETURN;
2774 }
2775 }
2776 break;
2777
2778 /* Else loop checking valid options until ) is met. Anything else is an
2779 error. If we are without any brackets, i.e. at top level, the settings
2780 act as if specified in the options, so massage the options immediately.
2781 This is for backward compatibility with Perl 5.004. */
2782
2783 default:
2784 set = unset = 0;
2785 optset = &set;
2786 ptr += 2;
2787
2788 for (;; ptr++)
2789 {
2790 c = *ptr;
2791 switch (c)
2792 {
2793 case 'i':
2794 *optset |= PCRE_CASELESS;
2795 continue;
2796
2797 case 'm':
2798 *optset |= PCRE_MULTILINE;
2799 continue;
2800
2801 case 's':
2802 *optset |= PCRE_DOTALL;
2803 continue;
2804
2805 case 'x':
2806 *optset |= PCRE_EXTENDED;
2807 continue;
2808
2809 case 'X':
2810 *optset |= PCRE_EXTRA;
2811 continue;
2812
2813 case 'U':
2814 *optset |= PCRE_UNGREEDY;
2815 continue;
2816
2817 case '-':
2818 optset = &unset;
2819 continue;
2820
2821 /* A termination by ')' indicates an options-setting-only item;
2822 this is global at top level; otherwise nothing is done here and
2823 it is handled during the compiling process on a per-bracket-group
2824 basis. */
2825
2826 case ')':
2827 if (brastackptr == 0)
2828 {
2829 options = (options | set) & (~unset);
2830 set = unset = 0; /* To save length */
2831 }
2832 /* Fall through */
2833
2834 /* A termination by ':' indicates the start of a nested group with
2835 the given options set. This is again handled at compile time, but
2836 we must allow for compiled space if any of the ims options are
2837 set. We also have to allow for resetting space at the end of
2838 the group, which is why 4 is added to the length and not just 2.
2839 If there are several changes of options within the same group, this
2840 will lead to an over-estimate on the length, but this shouldn't
2841 matter very much. We also have to allow for resetting options at
2842 the start of any alternations, which we do by setting
2843 branch_newextra to 2. Finally, we record whether the case-dependent
2844 flag ever changes within the regex. This is used by the "required
2845 character" code. */
2846
2847 case ':':
2848 if (((set|unset) & PCRE_IMS) != 0)
2849 {
2850 length += 4;
2851 branch_newextra = 2;
2852 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2853 }
2854 goto END_OPTIONS;
2855
2856 /* Unrecognized option character */
2857
2858 default:
2859 *errorptr = ERR12;
2860 goto PCRE_ERROR_RETURN;
2861 }
2862 }
2863
2864 /* If we hit a closing bracket, that's it - this is a freestanding
2865 option-setting. We need to ensure that branch_extra is updated if
2866 necessary. The only values branch_newextra can have here are 0 or 2.
2867 If the value is 2, then branch_extra must either be 2 or 5, depending
2868 on whether this is a lookbehind group or not. */
2869
2870 END_OPTIONS:
2871 if (c == ')')
2872 {
2873 if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2874 branch_extra += branch_newextra;
2875 continue;
2876 }
2877
2878 /* If options were terminated by ':' control comes here. Fall through
2879 to handle the group below. */
2880 }
2881 }
2882
2883 /* Extracting brackets must be counted so we can process escapes in a
2884 Perlish way. */
2885
2886 else bracount++;
2887
2888 /* Non-special forms of bracket. Save length for computing whole length
2889 at end if there's a repeat that requires duplication of the group. Also
2890 save the current value of branch_extra, and start the new group with
2891 the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
2892 for a lookbehind assertion. */
2893
2894 if (brastackptr >= sizeof(brastack)/sizeof(int))
2895 {
2896 *errorptr = ERR19;
2897 goto PCRE_ERROR_RETURN;
2898 }
2899
2900 bralenstack[brastackptr] = branch_extra;
2901 branch_extra = branch_newextra;
2902
2903 brastack[brastackptr++] = length;
2904 length += 3;
2905 continue;
2906
2907 /* Handle ket. Look for subsequent max/min; for certain sets of values we
2908 have to replicate this bracket up to that many times. If brastackptr is
2909 0 this is an unmatched bracket which will generate an error, but take care
2910 not to try to access brastack[-1] when computing the length and restoring
2911 the branch_extra value. */
2912
2913 case ')':
2914 length += 3;
2915 {
2916 int minval = 1;
2917 int maxval = 1;
2918 int duplength;
2919
2920 if (brastackptr > 0)
2921 {
2922 duplength = length - brastack[--brastackptr];
2923 branch_extra = bralenstack[brastackptr];
2924 }
2925 else duplength = 0;
2926
2927 /* Leave ptr at the final char; for read_repeat_counts this happens
2928 automatically; for the others we need an increment. */
2929
2930 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2931 {
2932 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2933 &compile_block);
2934 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2935 }
2936 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2937 else if (c == '+') { maxval = -1; ptr++; }
2938 else if (c == '?') { minval = 0; ptr++; }
2939
2940 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2941 group, and if the maximum is greater than zero, we have to replicate
2942 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2943 bracket set - hence the 7. */
2944
2945 if (minval == 0)
2946 {
2947 length++;
2948 if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2949 }
2950
2951 /* When the minimum is greater than zero, 1 we have to replicate up to
2952 minval-1 times, with no additions required in the copies. Then, if
2953 there is a limited maximum we have to replicate up to maxval-1 times
2954 allowing for a BRAZERO item before each optional copy and nesting
2955 brackets for all but one of the optional copies. */
2956
2957 else
2958 {
2959 length += (minval - 1) * duplength;
2960 if (maxval > minval) /* Need this test as maxval=-1 means no limit */
2961 length += (maxval - minval) * (duplength + 7) - 6;
2962 }
2963 }
2964 continue;
2965
2966 /* Non-special character. For a run of such characters the length required
2967 is the number of characters + 2, except that the maximum run length is 255.
2968 We won't get a skipped space or a non-data escape or the start of a #
2969 comment as the first character, so the length can't be zero. */
2970
2971 NORMAL_CHAR:
2972 default:
2973 length += 2;
2974 runlength = 0;
2975 do
2976 {
2977 if ((options & PCRE_EXTENDED) != 0)
2978 {
2979 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2980 if (c == '#')
2981 {
2982 /* The space before the ; is to avoid a warning on a silly compiler
2983 on the Macintosh. */
2984 while ((c = *(++ptr)) != 0 && c != '\n') ;
2985 continue;
2986 }
2987 }
2988
2989 /* Backslash may introduce a data char or a metacharacter; stop the
2990 string before the latter. */
2991
2992 if (c == '\\')
2993 {
2994 const uschar *saveptr = ptr;
2995 c = check_escape(&ptr, errorptr, bracount, options, FALSE,
2996 &compile_block);
2997 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2998 if (c < 0) { ptr = saveptr; break; }
2999
3000 #ifdef SUPPORT_UTF8
3001 if (c > 127 && (options & PCRE_UTF8) != 0)
3002 {
3003 int i;
3004 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
3005 if (c <= utf8_table1[i]) break;
3006 runlength += i;
3007 }
3008 #endif
3009 }
3010
3011 /* Ordinary character or single-char escape */
3012
3013 runlength++;
3014 }
3015
3016 /* This "while" is the end of the "do" above. */
3017
3018 while (runlength < MAXLIT &&
3019 (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
3020
3021 ptr--;
3022 length += runlength;
3023 continue;
3024 }
3025 }
3026
3027 length += 4; /* For final KET and END */
3028
3029 if (length > 65539)
3030 {
3031 *errorptr = ERR20;
3032 return NULL;
3033 }
3034
3035 /* Compute the size of data block needed and get it, either from malloc or
3036 externally provided function. We specify "code[0]" in the offsetof() expression
3037 rather than just "code", because it has been reported that one broken compiler
3038 fails on "code" because it is also an independent variable. It should make no
3039 difference to the value of the offsetof(). */
3040
3041 size = length + offsetof(real_pcre, code[0]);
3042 re = (real_pcre *)(pcre_malloc)(size);
3043
3044 if (re == NULL)
3045 {
3046 *errorptr = ERR21;
3047 return NULL;
3048 }
3049
3050 /* Put in the magic number, and save the size, options, and table pointer */
3051
3052 re->magic_number = MAGIC_NUMBER;
3053 re->size = size;
3054 re->options = options;
3055 re->tables = tables;
3056
3057 /* Set up a starting, non-extracting bracket, then compile the expression. On
3058 error, *errorptr will be set non-NULL, so we don't need to look at the result
3059 of the function here. */
3060
3061 ptr = (const uschar *)pattern;
3062 code = re->code;
3063 *code = OP_BRA;
3064 bracount = 0;
3065 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
3066 &reqchar, &countlits, &compile_block);
3067 re->top_bracket = bracount;
3068 re->top_backref = top_backref;
3069
3070 /* If not reached end of pattern on success, there's an excess bracket. */
3071
3072 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
3073
3074 /* Fill in the terminating state and check for disastrous overflow, but
3075 if debugging, leave the test till after things are printed out. */
3076
3077 *code++ = OP_END;
3078
3079 #ifndef DEBUG
3080 if (code - re->code > length) *errorptr = ERR23;
3081 #endif
3082
3083 /* Give an error if there's back reference to a non-existent capturing
3084 subpattern. */
3085
3086 if (top_backref > re->top_bracket) *errorptr = ERR15;
3087
3088 /* Failed to compile */
3089
3090 if (*errorptr != NULL)
3091 {
3092 (pcre_free)(re);
3093 PCRE_ERROR_RETURN:
3094 *erroroffset = ptr - (const uschar *)pattern;
3095 return NULL;
3096 }
3097
3098 /* If the anchored option was not passed, set flag if we can determine that the
3099 pattern is anchored by virtue of ^ characters or \A or anything else (such as
3100 starting with .* when DOTALL is set).
3101
3102 Otherwise, see if we can determine what the first character has to be, because
3103 that speeds up unanchored matches no end. If not, see if we can set the
3104 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
3105 start with ^. and also when all branches start with .* for non-DOTALL matches.
3106 */
3107
3108 if ((options & PCRE_ANCHORED) == 0)
3109 {
3110 int temp_options = options;
3111 if (is_anchored(re->code, &temp_options))
3112 re->options |= PCRE_ANCHORED;
3113 else
3114 {
3115 int ch = find_firstchar(re->code, &temp_options);
3116 if (ch >= 0)
3117 {
3118 re->first_char = ch;
3119 re->options |= PCRE_FIRSTSET;
3120 }
3121 else if (is_startline(re->code))
3122 re->options |= PCRE_STARTLINE;
3123 }
3124 }
3125
3126 /* Save the last required character if there are at least two literal
3127 characters on all paths, or if there is no first character setting. */
3128
3129 if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
3130 {
3131 re->req_char = reqchar;
3132 re->options |= PCRE_REQCHSET;
3133 }
3134
3135 /* Print out the compiled data for debugging */
3136
3137 #ifdef DEBUG
3138
3139 printf("Length = %d top_bracket = %d top_backref = %d\n",
3140 length, re->top_bracket, re->top_backref);
3141
3142 if (re->options != 0)
3143 {
3144 printf("%s%s%s%s%s%s%s%s%s\n",
3145 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
3146 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
3147 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
3148 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
3149 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
3150 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
3151 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
3152 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
3153 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
3154 }
3155
3156 if ((re->options & PCRE_FIRSTSET) != 0)
3157 {
3158 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
3159 else printf("First char = \\x%02x\n", re->first_char);
3160 }
3161
3162 if ((re->options & PCRE_REQCHSET) != 0)
3163 {
3164 if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
3165 else printf("Req char = \\x%02x\n", re->req_char);
3166 }
3167
3168 code_end = code;
3169 code_base = code = re->code;
3170
3171 while (code < code_end)
3172 {
3173 int charlength;
3174
3175 printf("%3d ", code - code_base);
3176
3177 if (*code >= OP_BRA)
3178 {
3179 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
3180 code += 2;
3181 }
3182
3183 else switch(*code)
3184 {
3185 case OP_OPT:
3186 printf(" %.2x %s", code[1], OP_names[*code]);
3187 code++;
3188 break;
3189
3190 case OP_COND:
3191 printf("%3d Cond", (code[1] << 8) + code[2]);
3192 code += 2;
3193 break;
3194
3195 case OP_CREF:
3196 printf(" %.2d %s", code[1], OP_names[*code]);
3197 code++;
3198 break;
3199
3200 case OP_CHARS:
3201 charlength = *(++code);
3202 printf("%3d ", charlength);
3203 while (charlength-- > 0)
3204 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
3205 break;
3206
3207 case OP_KETRMAX:
3208 case OP_KETRMIN:
3209 case OP_ALT:
3210 case OP_KET:
3211 case OP_ASSERT:
3212 case OP_ASSERT_NOT:
3213 case OP_ASSERTBACK:
3214 case OP_ASSERTBACK_NOT:
3215 case OP_ONCE:
3216 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
3217 code += 2;
3218 break;
3219
3220 case OP_REVERSE:
3221 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
3222 code += 2;
3223 break;
3224
3225 case OP_STAR:
3226 case OP_MINSTAR:
3227 case OP_PLUS:
3228 case OP_MINPLUS:
3229 case OP_QUERY:
3230 case OP_MINQUERY:
3231 case OP_TYPESTAR:
3232 case OP_TYPEMINSTAR:
3233 case OP_TYPEPLUS:
3234 case OP_TYPEMINPLUS:
3235 case OP_TYPEQUERY:
3236 case OP_TYPEMINQUERY:
3237 if (*code >= OP_TYPESTAR)
3238 printf(" %s", OP_names[code[1]]);
3239 else if (isprint(c = code[1])) printf(" %c", c);
3240 else printf(" \\x%02x", c);
3241 printf("%s", OP_names[*code++]);
3242 break;
3243
3244 case OP_EXACT:
3245 case OP_UPTO:
3246 case OP_MINUPTO:
3247 if (isprint(c = code[3])) printf(" %c{", c);
3248 else printf(" \\x%02x{", c);
3249 if (*code != OP_EXACT) printf("0,");
3250 printf("%d}", (code[1] << 8) + code[2]);
3251 if (*code == OP_MINUPTO) printf("?");
3252 code += 3;
3253 break;
3254
3255 case OP_TYPEEXACT:
3256 case OP_TYPEUPTO:
3257 case OP_TYPEMINUPTO:
3258 printf(" %s{", OP_names[code[3]]);
3259 if (*code != OP_TYPEEXACT) printf(",");
3260 printf("%d}", (code[1] << 8) + code[2]);
3261 if (*code == OP_TYPEMINUPTO) printf("?");
3262 code += 3;
3263 break;
3264
3265 case OP_NOT:
3266 if (isprint(c = *(++code))) printf(" [^%c]", c);
3267 else printf(" [^\\x%02x]", c);
3268 break;
3269
3270 case OP_NOTSTAR:
3271 case OP_NOTMINSTAR:
3272 case OP_NOTPLUS:
3273 case OP_NOTMINPLUS:
3274 case OP_NOTQUERY:
3275 case OP_NOTMINQUERY:
3276 if (isprint(c = code[1])) printf(" [^%c]", c);
3277 else printf(" [^\\x%02x]", c);
3278 printf("%s", OP_names[*code++]);
3279 break;
3280
3281 case OP_NOTEXACT:
3282 case OP_NOTUPTO:
3283 case OP_NOTMINUPTO:
3284 if (isprint(c = code[3])) printf(" [^%c]{", c);
3285 else printf(" [^\\x%02x]{", c);
3286 if (*code != OP_NOTEXACT) printf(",");
3287 printf("%d}", (code[1] << 8) + code[2]);
3288 if (*code == OP_NOTMINUPTO) printf("?");
3289 code += 3;
3290 break;
3291
3292 case OP_REF:
3293 printf(" \\%d", *(++code));
3294 code ++;
3295 goto CLASS_REF_REPEAT;
3296
3297 case OP_CLASS:
3298 {
3299 int i, min, max;
3300 code++;
3301 printf(" [");
3302
3303 for (i = 0; i < 256; i++)
3304 {
3305 if ((code[i/8] & (1 << (i&7))) != 0)
3306 {
3307 int j;
3308 for (j = i+1; j < 256; j++)
3309 if ((code[j/8] & (1 << (j&7))) == 0) break;
3310 if (i == '-' || i == ']') printf("\\");
3311 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
3312 if (--j > i)
3313 {
3314 printf("-");
3315 if (j == '-' || j == ']') printf("\\");
3316 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
3317 }
3318 i = j;
3319 }
3320 }
3321 printf("]");
3322 code += 32;
3323
3324 CLASS_REF_REPEAT:
3325
3326 switch(*code)
3327 {
3328 case OP_CRSTAR:
3329 case OP_CRMINSTAR:
3330 case OP_CRPLUS:
3331 case OP_CRMINPLUS:
3332 case OP_CRQUERY:
3333 case OP_CRMINQUERY:
3334 printf("%s", OP_names[*code]);
3335 break;
3336
3337 case OP_CRRANGE:
3338 case OP_CRMINRANGE:
3339 min = (code[1] << 8) + code[2];
3340 max = (code[3] << 8) + code[4];
3341 if (max == 0) printf("{%d,}", min);
3342 else printf("{%d,%d}", min, max);
3343 if (*code == OP_CRMINRANGE) printf("?");
3344 code += 4;
3345 break;
3346
3347 default:
3348 code--;
3349 }
3350 }
3351 break;
3352
3353 /* Anything else is just a one-node item */
3354
3355 default:
3356 printf(" %s", OP_names[*code]);
3357 break;
3358 }
3359
3360 code++;
3361 printf("\n");
3362 }
3363 printf("------------------------------------------------------------------\n");
3364
3365 /* This check is done here in the debugging case so that the code that
3366 was compiled can be seen. */
3367
3368 if (code - re->code > length)
3369 {
3370 *errorptr = ERR23;
3371 (pcre_free)(re);
3372 *erroroffset = ptr - (uschar *)pattern;
3373 return NULL;
3374 }
3375 #endif
3376
3377 return (pcre *)re;
3378 }
3379
3380
3381
3382 /*************************************************
3383 * Match a back-reference *
3384 *************************************************/
3385
3386 /* If a back reference hasn't been set, the length that is passed is greater
3387 than the number of characters left in the string, so the match fails.
3388
3389 Arguments:
3390 offset index into the offset vector
3391 eptr points into the subject
3392 length length to be matched
3393 md points to match data block
3394 ims the ims flags
3395
3396 Returns: TRUE if matched
3397 */
3398
3399 static BOOL
3400 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
3401 unsigned long int ims)
3402 {
3403 const uschar *p = md->start_subject + md->offset_vector[offset];
3404
3405 #ifdef DEBUG
3406 if (eptr >= md->end_subject)
3407 printf("matching subject <null>");
3408 else
3409 {
3410 printf("matching subject ");
3411 pchars(eptr, length, TRUE, md);
3412 }
3413 printf(" against backref ");
3414 pchars(p, length, FALSE, md);
3415 printf("\n");
3416 #endif
3417
3418 /* Always fail if not enough characters left */
3419
3420 if (length > md->end_subject - eptr) return FALSE;
3421
3422 /* Separate the caselesss case for speed */
3423
3424 if ((ims & PCRE_CASELESS) != 0)
3425 {
3426 while (length-- > 0)
3427 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
3428 }
3429 else
3430 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3431
3432 return TRUE;
3433 }
3434
3435
3436
3437 /*************************************************
3438 * Match from current position *
3439 *************************************************/
3440
3441 /* On entry ecode points to the first opcode, and eptr to the first character
3442 in the subject string, while eptrb holds the value of eptr at the start of the
3443 last bracketed group - used for breaking infinite loops matching zero-length
3444 strings.
3445
3446 Arguments:
3447 eptr pointer in subject
3448 ecode position in code
3449 offset_top current top pointer
3450 md pointer to "static" info for the match
3451 ims current /i, /m, and /s options
3452 eptrb pointer to chain of blocks containing eptr at start of
3453 brackets - for testing for empty matches
3454 flags can contain
3455 match_condassert - this is an assertion condition
3456 match_isgroup - this is the start of a bracketed group
3457
3458 Returns: TRUE if matched
3459 */
3460
3461 static BOOL
3462 match(register const uschar *eptr, register const uschar *ecode,
3463 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
3464 int flags)
3465 {
3466 unsigned long int original_ims = ims; /* Save for resetting on ')' */
3467 eptrblock newptrb;
3468
3469 /* At the start of a bracketed group, add the current subject pointer to the
3470 stack of such pointers, to be re-instated at the end of the group when we hit
3471 the closing ket. When match() is called in other circumstances, we don't add to
3472 the stack. */
3473
3474 if ((flags & match_isgroup) != 0)
3475 {
3476 newptrb.prev = eptrb;
3477 newptrb.saved_eptr = eptr;
3478 eptrb = &newptrb;
3479 }
3480
3481 /* Now start processing the operations. */
3482
3483 for (;;)
3484 {
3485 int op = (int)*ecode;
3486 int min, max, ctype;
3487 register int i;
3488 register int c;
3489 BOOL minimize = FALSE;
3490
3491 /* Opening capturing bracket. If there is space in the offset vector, save
3492 the current subject position in the working slot at the top of the vector. We
3493 mustn't change the current values of the data slot, because they may be set
3494 from a previous iteration of this group, and be referred to by a reference
3495 inside the group.
3496
3497 If the bracket fails to match, we need to restore this value and also the
3498 values of the final offsets, in case they were set by a previous iteration of
3499 the same bracket.
3500
3501 If there isn't enough space in the offset vector, treat this as if it were a
3502 non-capturing bracket. Don't worry about setting the flag for the error case
3503 here; that is handled in the code for KET. */
3504
3505 if (op > OP_BRA)
3506 {
3507 int number = op - OP_BRA;
3508 int offset = number << 1;
3509
3510 #ifdef DEBUG
3511 printf("start bracket %d subject=", number);
3512 pchars(eptr, 16, TRUE, md);
3513 printf("\n");
3514 #endif
3515
3516 if (offset < md->offset_max)
3517 {
3518 int save_offset1 = md->offset_vector[offset];
3519 int save_offset2 = md->offset_vector[offset+1];
3520 int save_offset3 = md->offset_vector[md->offset_end - number];
3521
3522 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
3523 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
3524
3525 do
3526 {
3527 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3528 return TRUE;
3529 ecode += (ecode[1] << 8) + ecode[2];
3530 }
3531 while (*ecode == OP_ALT);
3532
3533 DPRINTF(("bracket %d failed\n", number));
3534
3535 md->offset_vector[offset] = save_offset1;
3536 md->offset_vector[offset+1] = save_offset2;
3537 md->offset_vector[md->offset_end - number] = save_offset3;
3538 return FALSE;
3539 }
3540
3541 /* Insufficient room for saving captured contents */
3542
3543 else op = OP_BRA;
3544 }
3545
3546 /* Other types of node can be handled by a switch */
3547
3548 switch(op)
3549 {
3550 case OP_BRA: /* Non-capturing bracket: optimized */
3551 DPRINTF(("start bracket 0\n"));
3552 do
3553 {
3554 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3555 return TRUE;
3556 ecode += (ecode[1] << 8) + ecode[2];
3557 }
3558 while (*ecode == OP_ALT);
3559 DPRINTF(("bracket 0 failed\n"));
3560 return FALSE;
3561
3562 /* Conditional group: compilation checked that there are no more than
3563 two branches. If the condition is false, skipping the first branch takes us
3564 past the end if there is only one branch, but that's OK because that is
3565 exactly what going to the ket would do. */
3566
3567 case OP_COND:
3568 if (ecode[3] == OP_CREF) /* Condition is extraction test */
3569 {
3570 int offset = ecode[4] << 1; /* Doubled reference number */
3571 return match(eptr,
3572 ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3573 5 : 3 + (ecode[1] << 8) + ecode[2]),
3574 offset_top, md, ims, eptrb, match_isgroup);
3575 }
3576
3577 /* The condition is an assertion. Call match() to evaluate it - setting
3578 the final argument TRUE causes it to stop at the end of an assertion. */
3579
3580 else
3581 {
3582 if (match(eptr, ecode+3, offset_top, md, ims, NULL,
3583 match_condassert | match_isgroup))
3584 {
3585 ecode += 3 + (ecode[4] << 8) + ecode[5];
3586 while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3587 }
3588 else ecode += (ecode[1] << 8) + ecode[2];
3589 return match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup);
3590 }
3591 /* Control never reaches here */
3592
3593 /* Skip over conditional reference data if encountered (should not be) */
3594
3595 case OP_CREF:
3596 ecode += 2;
3597 break;
3598
3599 /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3600 an empty string - recursion will then try other alternatives, if any. */
3601
3602 case OP_END:
3603 if (md->notempty && eptr == md->start_match) return FALSE;
3604 md->end_match_ptr = eptr; /* Record where we ended */
3605 md->end_offset_top = offset_top; /* and how many extracts were taken */
3606 return TRUE;
3607
3608 /* Change option settings */
3609
3610 case OP_OPT:
3611 ims = ecode[1];
3612 ecode += 2;
3613 DPRINTF(("ims set to %02lx\n", ims));
3614 break;
3615
3616 /* Assertion brackets. Check the alternative branches in turn - the
3617 matching won't pass the KET for an assertion. If any one branch matches,
3618 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
3619 start of each branch to move the current point backwards, so the code at
3620 this level is identical to the lookahead case. */
3621
3622 case OP_ASSERT:
3623 case OP_ASSERTBACK:
3624 do
3625 {
3626 if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup)) break;
3627 ecode += (ecode[1] << 8) + ecode[2];
3628 }
3629 while (*ecode == OP_ALT);
3630 if (*ecode == OP_KET) return FALSE;
3631
3632 /* If checking an assertion for a condition, return TRUE. */
3633
3634 if ((flags & match_condassert) != 0) return TRUE;
3635
3636 /* Continue from after the assertion, updating the offsets high water
3637 mark, since extracts may have been taken during the assertion. */
3638
3639 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3640 ecode += 3;
3641 offset_top = md->end_offset_top;
3642 continue;
3643
3644 /* Negative assertion: all branches must fail to match */
3645
3646 case OP_ASSERT_NOT:
3647 case OP_ASSERTBACK_NOT:
3648 do
3649 {
3650 if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup))
3651 return FALSE;
3652 ecode += (ecode[1] << 8) + ecode[2];
3653 }
3654 while (*ecode == OP_ALT);
3655
3656 if ((flags & match_condassert) != 0) return TRUE;
3657
3658 ecode += 3;
3659 continue;
3660
3661 /* Move the subject pointer back. This occurs only at the start of
3662 each branch of a lookbehind assertion. If we are too close to the start to
3663 move back, this match function fails. When working with UTF-8 we move
3664 back a number of characters, not bytes. */
3665
3666 case OP_REVERSE:
3667 #ifdef SUPPORT_UTF8
3668 c = (ecode[1] << 8) + ecode[2];
3669 for (i = 0; i < c; i++)
3670 {
3671 eptr--;
3672 BACKCHAR(eptr)
3673 }
3674 #else
3675 eptr -= (ecode[1] << 8) + ecode[2];
3676 #endif
3677
3678 if (eptr < md->start_subject) return FALSE;
3679 ecode += 3;
3680 break;
3681
3682 /* Recursion matches the current regex, nested. If there are any capturing
3683 brackets started but not finished, we have to save their starting points
3684 and reinstate them after the recursion. However, we don't know how many
3685 such there are (offset_top records the completed total) so we just have
3686 to save all the potential data. There may be up to 99 such values, which
3687 is a bit large to put on the stack, but using malloc for small numbers
3688 seems expensive. As a compromise, the stack is used when there are fewer
3689 than 16 values to store; otherwise malloc is used. A problem is what to do
3690 if the malloc fails ... there is no way of returning to the top level with
3691 an error. Save the top 15 values on the stack, and accept that the rest
3692 may be wrong. */
3693
3694 case OP_RECURSE:
3695 {
3696 BOOL rc;
3697 int *save;
3698 int stacksave[15];
3699
3700 c = md->offset_max;
3701
3702 if (c < 16) save = stacksave; else
3703 {
3704 save = (int *)(pcre_malloc)((c+1) * sizeof(int));
3705 if (save == NULL)
3706 {
3707 save = stacksave;
3708 c = 15;
3709 }
3710 }
3711
3712 for (i = 1; i <= c; i++)
3713 save[i] = md->offset_vector[md->offset_end - i];
3714 rc = match(eptr, md->start_pattern, offset_top, md, ims, eptrb,
3715 match_isgroup);
3716 for (i = 1; i <= c; i++)
3717 md->offset_vector[md->offset_end - i] = save[i];
3718 if (save != stacksave) (pcre_free)(save);
3719 if (!rc) return FALSE;
3720
3721 /* In case the recursion has set more capturing values, save the final
3722 number, then move along the subject till after the recursive match,
3723 and advance one byte in the pattern code. */
3724
3725 offset_top = md->end_offset_top;
3726 eptr = md->end_match_ptr;
3727 ecode++;
3728 }
3729 break;
3730
3731 /* "Once" brackets are like assertion brackets except that after a match,
3732 the point in the subject string is not moved back. Thus there can never be
3733 a move back into the brackets. Check the alternative branches in turn - the
3734 matching won't pass the KET for this kind of subpattern. If any one branch
3735 matches, we carry on as at the end of a normal bracket, leaving the subject
3736 pointer. */
3737
3738 case OP_ONCE:
3739 {
3740 const uschar *prev = ecode;
3741 const uschar *saved_eptr = eptr;
3742
3743 do
3744 {
3745 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3746 break;
3747 ecode += (ecode[1] << 8) + ecode[2];
3748 }
3749 while (*ecode == OP_ALT);
3750
3751 /* If hit the end of the group (which could be repeated), fail */
3752
3753 if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
3754
3755 /* Continue as from after the assertion, updating the offsets high water
3756 mark, since extracts may have been taken. */
3757
3758 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3759
3760 offset_top = md->end_offset_top;
3761 eptr = md->end_match_ptr;
3762
3763 /* For a non-repeating ket, just continue at this level. This also
3764 happens for a repeating ket if no characters were matched in the group.
3765 This is the forcible breaking of infinite loops as implemented in Perl
3766 5.005. If there is an options reset, it will get obeyed in the normal
3767 course of events. */
3768
3769 if (*ecode == OP_KET || eptr == saved_eptr)
3770 {
3771 ecode += 3;
3772 break;
3773 }
3774
3775 /* The repeating kets try the rest of the pattern or restart from the
3776 preceding bracket, in the appropriate order. We need to reset any options
3777 that changed within the bracket before re-running it, so check the next
3778 opcode. */
3779
3780 if (ecode[3] == OP_OPT)
3781 {
3782 ims = (ims & ~PCRE_IMS) | ecode[4];
3783 DPRINTF(("ims set to %02lx at group repeat\n", ims));
3784 }
3785
3786 if (*ecode == OP_KETRMIN)
3787 {
3788 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3789 match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3790 return TRUE;
3791 }
3792 else /* OP_KETRMAX */
3793 {
3794 if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3795 match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3796 }
3797 }
3798 return FALSE;
3799
3800 /* An alternation is the end of a branch; scan along to find the end of the
3801 bracketed group and go to there. */
3802
3803 case OP_ALT:
3804 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3805 break;
3806
3807 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3808 that it may occur zero times. It may repeat infinitely, or not at all -
3809 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3810 repeat limits are compiled as a number of copies, with the optional ones
3811 preceded by BRAZERO or BRAMINZERO. */
3812
3813 case OP_BRAZERO:
3814 {
3815 const uschar *next = ecode+1;
3816 if (match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
3817 return TRUE;
3818 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3819 ecode = next + 3;
3820 }
3821 break;
3822
3823 case OP_BRAMINZERO:
3824 {
3825 const uschar *next = ecode+1;
3826 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3827 if (match(eptr, next+3, offset_top, md, ims, eptrb, match_isgroup))
3828 return TRUE;
3829 ecode++;
3830 }
3831 break;
3832
3833 /* End of a group, repeated or non-repeating. If we are at the end of
3834 an assertion "group", stop matching and return TRUE, but record the
3835 current high water mark for use by positive assertions. Do this also
3836 for the "once" (not-backup up) groups. */
3837
3838 case OP_KET:
3839 case OP_KETRMIN:
3840 case OP_KETRMAX:
3841 {
3842 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3843 const uschar *saved_eptr = eptrb->saved_eptr;
3844
3845 eptrb = eptrb->prev; /* Back up the stack of bracket start pointers */
3846
3847 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3848 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3849 *prev == OP_ONCE)
3850 {
3851 md->end_match_ptr = eptr; /* For ONCE */
3852 md->end_offset_top = offset_top;
3853 return TRUE;
3854 }
3855
3856 /* In all other cases except a conditional group we have to check the
3857 group number back at the start and if necessary complete handling an
3858 extraction by setting the offsets and bumping the high water mark. */
3859
3860 if (*prev != OP_COND)
3861 {
3862 int number = *prev - OP_BRA;
3863 int offset = number << 1;
3864
3865 #ifdef DEBUG
3866 printf("end bracket %d", number);
3867 printf("\n");
3868 #endif
3869
3870 if (number > 0)
3871 {
3872 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3873 {
3874 md->offset_vector[offset] =
3875 md->offset_vector[md->offset_end - number];
3876 md->offset_vector[offset+1] = eptr - md->start_subject;
3877 if (offset_top <= offset) offset_top = offset + 2;
3878 }
3879 }
3880 }
3881
3882 /* Reset the value of the ims flags, in case they got changed during
3883 the group. */
3884
3885 ims = original_ims;
3886 DPRINTF(("ims reset to %02lx\n", ims));
3887
3888 /* For a non-repeating ket, just continue at this level. This also
3889 happens for a repeating ket if no characters were matched in the group.
3890 This is the forcible breaking of infinite loops as implemented in Perl
3891 5.005. If there is an options reset, it will get obeyed in the normal
3892 course of events. */
3893
3894 if (*ecode == OP_KET || eptr == saved_eptr)
3895 {
3896 ecode += 3;
3897 break;
3898 }
3899
3900 /* The repeating kets try the rest of the pattern or restart from the
3901 preceding bracket, in the appropriate order. */
3902
3903 if (*ecode == OP_KETRMIN)
3904 {
3905 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3906 match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3907 return TRUE;
3908 }
3909 else /* OP_KETRMAX */
3910 {
3911 if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3912 match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3913 }
3914 }
3915 return FALSE;
3916
3917 /* Start of subject unless notbol, or after internal newline if multiline */
3918
3919 case OP_CIRC:
3920 if (md->notbol && eptr == md->start_subject) return FALSE;
3921 if ((ims & PCRE_MULTILINE) != 0)
3922 {
3923 if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
3924 ecode++;
3925 break;
3926 }
3927 /* ... else fall through */
3928
3929 /* Start of subject assertion */
3930
3931 case OP_SOD:
3932 if (eptr != md->start_subject) return FALSE;
3933 ecode++;
3934 break;
3935
3936 /* Assert before internal newline if multiline, or before a terminating
3937 newline unless endonly is set, else end of subject unless noteol is set. */
3938
3939 case OP_DOLL:
3940 if ((ims & PCRE_MULTILINE) != 0)
3941 {
3942 if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }
3943 else { if (md->noteol) return FALSE; }
3944 ecode++;
3945 break;
3946 }
3947 else
3948 {
3949 if (md->noteol) return FALSE;
3950 if (!md->endonly)
3951 {
3952 if (eptr < md->end_subject - 1 ||
3953 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3954
3955 ecode++;
3956 break;
3957 }
3958 }
3959 /* ... else fall through */
3960
3961 /* End of subject assertion (\z) */
3962
3963 case OP_EOD:
3964 if (eptr < md->end_subject) return FALSE;
3965 ecode++;
3966 break;
3967
3968 /* End of subject or ending \n assertion (\Z) */
3969
3970 case OP_EODN:
3971 if (eptr < md->end_subject - 1 ||
3972 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3973 ecode++;
3974 break;
3975
3976 /* Word boundary assertions */
3977
3978 case OP_NOT_WORD_BOUNDARY:
3979 case OP_WORD_BOUNDARY:
3980 {
3981 BOOL prev_is_word = (eptr != md->start_subject) &&
3982 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
3983 BOOL cur_is_word = (eptr < md->end_subject) &&
3984 ((md->ctypes[*eptr] & ctype_word) != 0);
3985 if ((*ecode++ == OP_WORD_BOUNDARY)?
3986 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3987 return FALSE;
3988 }
3989 break;
3990
3991 /* Match a single character type; inline for speed */
3992
3993 case OP_ANY:
3994 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
3995 return FALSE;
3996 if (eptr++ >= md->end_subject) return FALSE;
3997 #ifdef SUPPORT_UTF8
3998 if (md->utf8)
3999 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4000 #endif
4001 ecode++;
4002 break;
4003
4004 case OP_NOT_DIGIT:
4005 if (eptr >= md->end_subject ||
4006 (md->ctypes[*eptr++] & ctype_digit) != 0)
4007 return FALSE;
4008 ecode++;
4009 break;
4010
4011 case OP_DIGIT:
4012 if (eptr >= md->end_subject ||
4013 (md->ctypes[*eptr++] & ctype_digit) == 0)
4014 return FALSE;
4015 ecode++;
4016 break;
4017
4018 case OP_NOT_WHITESPACE:
4019 if (eptr >= md->end_subject ||
4020 (md->ctypes[*eptr++] & ctype_space) != 0)
4021 return FALSE;
4022 ecode++;
4023 break;
4024
4025 case OP_WHITESPACE:
4026 if (eptr >= md->end_subject ||
4027 (md->ctypes[*eptr++] & ctype_space) == 0)
4028 return FALSE;
4029 ecode++;
4030 break;
4031
4032 case OP_NOT_WORDCHAR:
4033 if (eptr >= md->end_subject ||
4034 (md->ctypes[*eptr++] & ctype_word) != 0)
4035 return FALSE;
4036 ecode++;
4037 break;
4038
4039 case OP_WORDCHAR:
4040 if (eptr >= md->end_subject ||
4041 (md->ctypes[*eptr++] & ctype_word) == 0)
4042 return FALSE;
4043 ecode++;
4044 break;
4045
4046 /* Match a back reference, possibly repeatedly. Look past the end of the
4047 item to see if there is repeat information following. The code is similar
4048 to that for character classes, but repeated for efficiency. Then obey
4049 similar code to character type repeats - written out again for speed.
4050 However, if the referenced string is the empty string, always treat
4051 it as matched, any number of times (otherwise there could be infinite
4052 loops). */
4053
4054 case OP_REF:
4055 {
4056 int length;
4057 int offset = ecode[1] << 1; /* Doubled reference number */
4058 ecode += 2; /* Advance past the item */
4059
4060 /* If the reference is unset, set the length to be longer than the amount
4061 of subject left; this ensures that every attempt at a match fails. We
4062 can't just fail here, because of the possibility of quantifiers with zero
4063 minima. */
4064
4065 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
4066 md->end_subject - eptr + 1 :
4067 md->offset_vector[offset+1] - md->offset_vector[offset];
4068
4069 /* Set up for repetition, or handle the non-repeated case */
4070
4071 switch (*ecode)
4072 {
4073 case OP_CRSTAR:
4074 case OP_CRMINSTAR:
4075 case OP_CRPLUS:
4076 case OP_CRMINPLUS:
4077 case OP_CRQUERY:
4078 case OP_CRMINQUERY:
4079 c = *ecode++ - OP_CRSTAR;
4080 minimize = (c & 1) != 0;
4081 min = rep_min[c]; /* Pick up values from tables; */
4082 max = rep_max[c]; /* zero for max => infinity */
4083 if (max == 0) max = INT_MAX;
4084 break;
4085
4086 case OP_CRRANGE:
4087 case OP_CRMINRANGE:
4088 minimize = (*ecode == OP_CRMINRANGE);
4089 min = (ecode[1] << 8) + ecode[2];
4090 max = (ecode[3] << 8) + ecode[4];
4091 if (max == 0) max = INT_MAX;
4092 ecode += 5;
4093 break;
4094
4095 default: /* No repeat follows */
4096 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
4097 eptr += length;
4098 continue; /* With the main loop */
4099 }
4100
4101 /* If the length of the reference is zero, just continue with the
4102 main loop. */
4103
4104 if (length == 0) continue;
4105
4106 /* First, ensure the minimum number of matches are present. We get back
4107 the length of the reference string explicitly rather than passing the
4108 address of eptr, so that eptr can be a register variable. */
4109
4110 for (i = 1; i <= min; i++)
4111 {
4112 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
4113 eptr += length;
4114 }
4115
4116 /* If min = max, continue at the same level without recursion.
4117 They are not both allowed to be zero. */
4118
4119 if (min == max) continue;
4120
4121 /* If minimizing, keep trying and advancing the pointer */
4122
4123 if (minimize)
4124 {
4125 for (i = min;; i++)
4126 {
4127 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4128 return TRUE;
4129 if (i >= max || !match_ref(offset, eptr, length, md, ims))
4130 return FALSE;
4131 eptr += length;
4132 }
4133 /* Control never gets here */
4134 }
4135
4136 /* If maximizing, find the longest string and work backwards */
4137
4138 else
4139 {
4140 const uschar *pp = eptr;
4141 for (i = min; i < max; i++)
4142 {
4143 if (!match_ref(offset, eptr, length, md, ims)) break;
4144 eptr += length;
4145 }
4146 while (eptr >= pp)
4147 {
4148 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4149 return TRUE;
4150 eptr -= length;
4151 }
4152 return FALSE;
4153 }
4154 }
4155 /* Control never gets here */
4156
4157
4158
4159 /* Match a character class, possibly repeatedly. Look past the end of the
4160 item to see if there is repeat information following. Then obey similar
4161 code to character type repeats - written out again for speed. */
4162
4163 case OP_CLASS:
4164 {
4165 const uschar *data = ecode + 1; /* Save for matching */
4166 ecode += 33; /* Advance past the item */
4167
4168 switch (*ecode)
4169 {
4170 case OP_CRSTAR:
4171 case OP_CRMINSTAR:
4172 case OP_CRPLUS:
4173 case OP_CRMINPLUS:
4174 case OP_CRQUERY:
4175 case OP_CRMINQUERY:
4176 c = *ecode++ - OP_CRSTAR;
4177 minimize = (c & 1) != 0;
4178 min = rep_min[c]; /* Pick up values from tables; */
4179 max = rep_max[c]; /* zero for max => infinity */
4180 if (max == 0) max = INT_MAX;
4181 break;
4182
4183 case OP_CRRANGE:
4184 case OP_CRMINRANGE:
4185 minimize = (*ecode == OP_CRMINRANGE);
4186 min = (ecode[1] << 8) + ecode[2];
4187 max = (ecode[3] << 8) + ecode[4];
4188 if (max == 0) max = INT_MAX;
4189 ecode += 5;
4190 break;
4191
4192 default: /* No repeat follows */
4193 min = max = 1;
4194 break;
4195 }
4196
4197 /* First, ensure the minimum number of matches are present. */
4198
4199 for (i = 1; i <= min; i++)
4200 {
4201 if (eptr >= md->end_subject) return FALSE;
4202 GETCHARINC(c, eptr) /* Get character; increment eptr */
4203
4204 #ifdef SUPPORT_UTF8
4205 /* We do not yet support class members > 255 */
4206 if (c > 255) return FALSE;
4207 #endif
4208
4209 if ((data[c/8] & (1 << (c&7))) != 0) continue;
4210 return FALSE;
4211 }
4212
4213 /* If max == min we can continue with the main loop without the
4214 need to recurse. */
4215
4216 if (min == max) continue;
4217
4218 /* If minimizing, keep testing the rest of the expression and advancing
4219 the pointer while it matches the class. */
4220
4221 if (minimize)
4222 {
4223 for (i = min;; i++)
4224 {
4225 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4226 return TRUE;
4227 if (i >= max || eptr >= md->end_subject) return FALSE;
4228 GETCHARINC(c, eptr) /* Get character; increment eptr */
4229
4230 #ifdef SUPPORT_UTF8
4231 /* We do not yet support class members > 255 */
4232 if (c > 255) return FALSE;
4233 #endif
4234 if ((data[c/8] & (1 << (c&7))) != 0) continue;
4235 return FALSE;
4236 }
4237 /* Control never gets here */
4238 }
4239
4240 /* If maximizing, find the longest possible run, then work backwards. */
4241
4242 else
4243 {
4244 const uschar *pp = eptr;
4245 int len = 1;
4246 for (i = min; i < max; i++)
4247 {
4248 if (eptr >= md->end_subject) break;
4249 GETCHARLEN(c, eptr, len) /* Get character, set length if UTF-8 */
4250
4251 #ifdef SUPPORT_UTF8
4252 /* We do not yet support class members > 255 */
4253 if (c > 255) break;
4254 #endif
4255 if ((data[c/8] & (1 << (c&7))) == 0) break;
4256 eptr += len;
4257 }
4258
4259 while (eptr >= pp)
4260 {
4261 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4262 return TRUE;
4263
4264 #ifdef SUPPORT_UTF8
4265 BACKCHAR(eptr)
4266 #endif
4267 }
4268 return FALSE;
4269 }
4270 }
4271 /* Control never gets here */
4272
4273 /* Match a run of characters */
4274
4275 case OP_CHARS:
4276 {
4277 register int length = ecode[1];
4278 ecode += 2;
4279
4280 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4281 if (eptr >= md->end_subject)
4282 printf("matching subject <null> against pattern ");
4283 else
4284 {
4285 printf("matching subject ");
4286 pchars(eptr, length, TRUE, md);
4287 printf(" against pattern ");
4288 }
4289 pchars(ecode, length, FALSE, md);
4290 printf("\n");
4291 #endif
4292
4293 if (length > md->end_subject - eptr) return FALSE;
4294 if ((ims & PCRE_CASELESS) != 0)
4295 {
4296 while (length-- > 0)
4297 if (md->lcc[*ecode++] != md->lcc[*eptr++])
4298 return FALSE;
4299 }
4300 else
4301 {
4302 while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
4303 }
4304 }
4305 break;
4306
4307 /* Match a single character repeatedly; different opcodes share code. */
4308
4309 case OP_EXACT:
4310 min = max = (ecode[1] << 8) + ecode[2];
4311 ecode += 3;
4312 goto REPEATCHAR;
4313
4314 case OP_UPTO:
4315 case OP_MINUPTO:
4316 min = 0;
4317 max = (ecode[1] << 8) + ecode[2];
4318 minimize = *ecode == OP_MINUPTO;
4319 ecode += 3;
4320 goto REPEATCHAR;
4321
4322 case OP_STAR:
4323 case OP_MINSTAR:
4324 case OP_PLUS:
4325 case OP_MINPLUS:
4326 case OP_QUERY:
4327 case OP_MINQUERY:
4328 c = *ecode++ - OP_STAR;
4329 minimize = (c & 1) != 0;
4330 min = rep_min[c]; /* Pick up values from tables; */
4331 max = rep_max[c]; /* zero for max => infinity */
4332 if (max == 0) max = INT_MAX;
4333
4334 /* Common code for all repeated single-character matches. We can give
4335 up quickly if there are fewer than the minimum number of characters left in
4336 the subject. */
4337
4338 REPEATCHAR:
4339 if (min > md->end_subject - eptr) return FALSE;
4340 c = *ecode++;
4341
4342 /* The code is duplicated for the caseless and caseful cases, for speed,
4343 since matching characters is likely to be quite common. First, ensure the
4344 minimum number of matches are present. If min = max, continue at the same
4345 level without recursing. Otherwise, if minimizing, keep trying the rest of
4346 the expression and advancing one matching character if failing, up to the
4347 maximum. Alternatively, if maximizing, find the maximum number of
4348 characters and work backwards. */
4349
4350 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
4351 max, eptr));
4352
4353 if ((ims & PCRE_CASELESS) != 0)
4354 {
4355 c = md->lcc[c];
4356 for (i = 1; i <= min; i++)
4357 if (c != md->lcc[*eptr++]) return FALSE;
4358 if (min == max) continue;
4359 if (minimize)
4360 {
4361 for (i = min;; i++)
4362 {
4363 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4364 return TRUE;
4365 if (i >= max || eptr >= md->end_subject ||
4366 c != md->lcc[*eptr++])
4367 return FALSE;
4368 }
4369 /* Control never gets here */
4370 }
4371 else
4372 {
4373 const uschar *pp = eptr;
4374 for (i = min; i < max; i++)
4375 {
4376 if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
4377 eptr++;
4378 }
4379 while (eptr >= pp)
4380 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4381 return TRUE;
4382 return FALSE;
4383 }
4384 /* Control never gets here */
4385 }
4386
4387 /* Caseful comparisons */
4388
4389 else
4390 {
4391 for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
4392 if (min == max) continue;
4393 if (minimize)
4394 {
4395 for (i = min;; i++)
4396 {
4397 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4398 return TRUE;
4399 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
4400 }
4401 /* Control never gets here */
4402 }
4403 else
4404 {
4405 const uschar *pp = eptr;
4406 for (i = min; i < max; i++)
4407 {
4408 if (eptr >= md->end_subject || c != *eptr) break;
4409 eptr++;
4410 }
4411 while (eptr >= pp)
4412 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4413 return TRUE;
4414 return FALSE;
4415 }
4416 }
4417 /* Control never gets here */
4418
4419 /* Match a negated single character */
4420
4421 case OP_NOT:
4422 if (eptr >= md->end_subject) return FALSE;
4423 ecode++;
4424 if ((ims & PCRE_CASELESS) != 0)
4425 {
4426 if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
4427 }
4428 else
4429 {
4430 if (*ecode++ == *eptr++) return FALSE;
4431 }
4432 break;
4433
4434 /* Match a negated single character repeatedly. This is almost a repeat of
4435 the code for a repeated single character, but I haven't found a nice way of
4436 commoning these up that doesn't require a test of the positive/negative
4437 option for each character match. Maybe that wouldn't add very much to the
4438 time taken, but character matching *is* what this is all about... */
4439
4440 case OP_NOTEXACT:
4441 min = max = (ecode[1] << 8) + ecode[2];
4442 ecode += 3;
4443 goto REPEATNOTCHAR;
4444
4445 case OP_NOTUPTO:
4446 case OP_NOTMINUPTO:
4447 min = 0;
4448 max = (ecode[1] << 8) + ecode[2];
4449 minimize = *ecode == OP_NOTMINUPTO;
4450 ecode += 3;
4451 goto REPEATNOTCHAR;
4452
4453 case OP_NOTSTAR:
4454 case OP_NOTMINSTAR:
4455 case OP_NOTPLUS:
4456 case OP_NOTMINPLUS:
4457 case OP_NOTQUERY:
4458 case OP_NOTMINQUERY:
4459 c = *ecode++ - OP_NOTSTAR;
4460 minimize = (c & 1) != 0;
4461 min = rep_min[c]; /* Pick up values from tables; */
4462 max = rep_max[c]; /* zero for max => infinity */
4463 if (max == 0) max = INT_MAX;
4464
4465 /* Common code for all repeated single-character matches. We can give
4466 up quickly if there are fewer than the minimum number of characters left in
4467 the subject. */
4468
4469 REPEATNOTCHAR:
4470 if (min > md->end_subject - eptr) return FALSE;
4471 c = *ecode++;
4472
4473 /* The code is duplicated for the caseless and caseful cases, for speed,
4474 since matching characters is likely to be quite common. First, ensure the
4475 minimum number of matches are present. If min = max, continue at the same
4476 level without recursing. Otherwise, if minimizing, keep trying the rest of
4477 the expression and advancing one matching character if failing, up to the
4478 maximum. Alternatively, if maximizing, find the maximum number of
4479 characters and work backwards. */
4480
4481 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4482 max, eptr));
4483
4484 if ((ims & PCRE_CASELESS) != 0)
4485 {
4486 c = md->lcc[c];
4487 for (i = 1; i <= min; i++)
4488 if (c == md->lcc[*eptr++]) return FALSE;
4489 if (min == max) continue;
4490 if (minimize)
4491 {
4492 for (i = min;; i++)
4493 {
4494 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4495 return TRUE;
4496 if (i >= max || eptr >= md->end_subject ||
4497 c == md->lcc[*eptr++])
4498 return FALSE;
4499 }
4500 /* Control never gets here */
4501 }
4502 else
4503 {
4504 const uschar *pp = eptr;
4505 for (i = min; i < max; i++)
4506 {
4507 if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
4508 eptr++;
4509 }
4510 while (eptr >= pp)
4511 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4512 return TRUE;
4513 return FALSE;
4514 }
4515 /* Control never gets here */
4516 }
4517
4518 /* Caseful comparisons */
4519
4520 else
4521 {
4522 for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
4523 if (min == max) continue;
4524 if (minimize)
4525 {
4526 for (i = min;; i++)
4527 {
4528 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4529 return TRUE;
4530 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
4531 }
4532 /* Control never gets here */
4533 }
4534 else
4535 {
4536 const uschar *pp = eptr;
4537 for (i = min; i < max; i++)
4538 {
4539 if (eptr >= md->end_subject || c == *eptr) break;
4540 eptr++;
4541 }
4542 while (eptr >= pp)
4543 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4544 return TRUE;
4545 return FALSE;
4546 }
4547 }
4548 /* Control never gets here */
4549
4550 /* Match a single character type repeatedly; several different opcodes
4551 share code. This is very similar to the code for single characters, but we
4552 repeat it in the interests of efficiency. */
4553
4554 case OP_TYPEEXACT:
4555 min = max = (ecode[1] << 8) + ecode[2];
4556 minimize = TRUE;
4557 ecode += 3;
4558 goto REPEATTYPE;
4559
4560 case OP_TYPEUPTO:
4561 case OP_TYPEMINUPTO:
4562 min = 0;
4563 max = (ecode[1] << 8) + ecode[2];
4564 minimize = *ecode == OP_TYPEMINUPTO;
4565 ecode += 3;
4566 goto REPEATTYPE;
4567
4568 case OP_TYPESTAR:
4569 case OP_TYPEMINSTAR:
4570 case OP_TYPEPLUS:
4571 case OP_TYPEMINPLUS:
4572 case OP_TYPEQUERY:
4573 case OP_TYPEMINQUERY:
4574 c = *ecode++ - OP_TYPESTAR;
4575 minimize = (c & 1) != 0;
4576 min = rep_min[c]; /* Pick up values from tables; */
4577 max = rep_max[c]; /* zero for max => infinity */
4578 if (max == 0) max = INT_MAX;
4579
4580 /* Common code for all repeated single character type matches */
4581
4582 REPEATTYPE:
4583 ctype = *ecode++; /* Code for the character type */
4584
4585 /* First, ensure the minimum number of matches are present. Use inline
4586 code for maximizing the speed, and do the type test once at the start
4587 (i.e. keep it out of the loop). Also we can test that there are at least
4588 the minimum number of bytes before we start, except when doing '.' in
4589 UTF8 mode. Leave the test in in all cases; in the special case we have
4590 to test after each character. */
4591
4592 if (min > md->end_subject - eptr) return FALSE;
4593 if (min > 0) switch(ctype)
4594 {
4595 case OP_ANY:
4596 #ifdef SUPPORT_UTF8
4597 if (md->utf8)
4598 {
4599 for (i = 1; i <= min; i++)
4600 {
4601 if (eptr >= md->end_subject ||
4602 (*eptr++ == '\n' && (ims & PCRE_DOTALL) == 0))
4603 return FALSE;
4604 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4605 }
4606 break;
4607 }
4608 #endif
4609 /* Non-UTF8 can be faster */
4610 if ((ims & PCRE_DOTALL) == 0)
4611 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
4612 else eptr += min;
4613 break;
4614
4615 case OP_NOT_DIGIT:
4616 for (i = 1; i <= min; i++)
4617 if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
4618 break;
4619
4620 case OP_DIGIT:
4621 for (i = 1; i <= min; i++)
4622 if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
4623 break;
4624
4625 case OP_NOT_WHITESPACE:
4626 for (i = 1; i <= min; i++)
4627 if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
4628 break;
4629
4630 case OP_WHITESPACE:
4631 for (i = 1; i <= min; i++)
4632 if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
4633 break;
4634
4635 case OP_NOT_WORDCHAR:
4636 for (i = 1; i <= min; i++)
4637 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4638 return FALSE;
4639 break;
4640
4641 case OP_WORDCHAR:
4642 for (i = 1; i <= min; i++)
4643 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4644 return FALSE;
4645 break;
4646 }
4647
4648 /* If min = max, continue at the same level without recursing */
4649
4650 if (min == max) continue;
4651
4652 /* If minimizing, we have to test the rest of the pattern before each
4653 subsequent match. */
4654
4655 if (minimize)
4656 {
4657 for (i = min;; i++)
4658 {
4659 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) return TRUE;
4660 if (i >= max || eptr >= md->end_subject) return FALSE;
4661
4662 c = *eptr++;
4663 switch(ctype)
4664 {
4665 case OP_ANY:
4666 if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;
4667 #ifdef SUPPORT_UTF8
4668 if (md->utf8)
4669 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4670 #endif
4671 break;
4672
4673 case OP_NOT_DIGIT:
4674 if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
4675 break;
4676
4677 case OP_DIGIT:
4678 if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
4679 break;
4680
4681 case OP_NOT_WHITESPACE:
4682 if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
4683 break;
4684
4685 case OP_WHITESPACE:
4686 if ((md->ctypes[c] & ctype_space) == 0) return FALSE;
4687 break;
4688
4689 case OP_NOT_WORDCHAR:
4690 if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
4691 break;
4692
4693 case OP_WORDCHAR:
4694 if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
4695 break;
4696 }
4697 }
4698 /* Control never gets here */
4699 }
4700
4701 /* If maximizing it is worth using inline code for speed, doing the type
4702 test once at the start (i.e. keep it out of the loop). */
4703
4704 else
4705 {
4706 const uschar *pp = eptr;
4707 switch(ctype)
4708 {
4709 case OP_ANY:
4710
4711 /* Special code is required for UTF8, but when the maximum is unlimited
4712 we don't need it. */
4713
4714 #ifdef SUPPORT_UTF8
4715 if (md->utf8 && max < INT_MAX)
4716 {
4717 if ((ims & PCRE_DOTALL) == 0)
4718 {
4719 for (i = min; i < max; i++)
4720 {
4721 if (eptr >= md->end_subject || *eptr++ == '\n') break;
4722 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4723 }
4724 }
4725 else
4726 {
4727 for (i = min; i < max; i++)
4728 {
4729 eptr++;
4730 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4731 }
4732 }
4733 break;
4734 }
4735 #endif
4736 /* Non-UTF8 can be faster */
4737 if ((ims & PCRE_DOTALL) == 0)
4738 {
4739 for (i = min; i < max; i++)
4740 {
4741 if (eptr >= md->end_subject || *eptr == '\n') break;
4742 eptr++;
4743 }
4744 }
4745 else
4746 {
4747 c = max - min;
4748 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4749 eptr += c;
4750 }
4751 break;
4752
4753 case OP_NOT_DIGIT:
4754 for (i = min; i < max; i++)
4755 {
4756 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4757 break;
4758 eptr++;
4759 }
4760 break;
4761
4762 case OP_DIGIT:
4763 for (i = min; i < max; i++)
4764 {
4765 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4766 break;
4767 eptr++;
4768 }
4769 break;
4770
4771 case OP_NOT_WHITESPACE:
4772 for (i = min; i < max; i++)
4773 {
4774 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4775 break;
4776 eptr++;
4777 }
4778 break;
4779
4780 case OP_WHITESPACE:
4781 for (i = min; i < max; i++)
4782 {
4783 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4784 break;
4785 eptr++;
4786 }
4787 break;
4788
4789 case OP_NOT_WORDCHAR:
4790 for (i = min; i < max; i++)
4791 {
4792 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4793 break;
4794 eptr++;
4795 }
4796 break;
4797
4798 case OP_WORDCHAR:
4799 for (i = min; i < max; i++)
4800 {
4801 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4802 break;
4803 eptr++;
4804 }
4805 break;
4806 }
4807
4808 while (eptr >= pp)
4809 {
4810 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4811 return TRUE;
4812 #ifdef SUPPORT_UTF8
4813 if (md->utf8)
4814 while (eptr > pp && (*eptr & 0xc0) == 0x80) eptr--;
4815 #endif
4816 }
4817 return FALSE;
4818 }
4819 /* Control never gets here */
4820
4821 /* There's been some horrible disaster. */
4822
4823 default:
4824 DPRINTF(("Unknown opcode %d\n", *ecode));
4825 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4826 return FALSE;
4827 }
4828
4829 /* Do not stick any code in here without much thought; it is assumed
4830 that "continue" in the code above comes out to here to repeat the main
4831 loop. */
4832
4833 } /* End of main loop */
4834 /* Control never reaches here */
4835 }
4836
4837
4838
4839
4840 /*************************************************
4841 * Execute a Regular Expression *
4842 *************************************************/
4843
4844 /* This function applies a compiled re to a subject string and picks out
4845 portions of the string if it matches. Two elements in the vector are set for
4846 each substring: the offsets to the start and end of the substring.
4847
4848 Arguments:
4849 external_re points to the compiled expression
4850 external_extra points to "hints" from pcre_study() or is NULL
4851 subject points to the subject string
4852 length length of subject string (may contain binary zeros)
4853 start_offset where to start in the subject string
4854 options option bits
4855 offsets points to a vector of ints to be filled in with offsets
4856 offsetcount the number of elements in the vector
4857
4858 Returns: > 0 => success; value is the number of elements filled in
4859 = 0 => success, but offsets is not big enough
4860 -1 => failed to match
4861 < -1 => some kind of unexpected problem
4862 */
4863
4864 int
4865 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4866 const char *subject, int length, int start_offset, int options, int *offsets,
4867 int offsetcount)
4868 {
4869 int resetcount, ocount;
4870 int first_char = -1;
4871 int req_char = -1;
4872 int req_char2 = -1;
4873 unsigned long int ims = 0;
4874 match_data match_block;
4875 const uschar *start_bits = NULL;
4876 const uschar *start_match = (const uschar *)subject + start_offset;
4877 const uschar *end_subject;
4878 const uschar *req_char_ptr = start_match - 1;
4879 const real_pcre *re = (const real_pcre *)external_re;
4880 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4881 BOOL using_temporary_offsets = FALSE;
4882 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4883 BOOL startline = (re->options & PCRE_STARTLINE) != 0;
4884
4885 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4886
4887 if (re == NULL || subject == NULL ||
4888 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4889 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4890
4891 match_block.start_pattern = re->code;
4892 match_block.start_subject = (const uschar *)subject;
4893 match_block.end_subject = match_block.start_subject + length;
4894 end_subject = match_block.end_subject;
4895
4896 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4897 match_block.utf8 = (re->options & PCRE_UTF8) != 0;
4898
4899 match_block.notbol = (options & PCRE_NOTBOL) != 0;
4900 match_block.noteol = (options & PCRE_NOTEOL) != 0;
4901 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
4902
4903 match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4904
4905 match_block.lcc = re->tables + lcc_offset;
4906 match_block.ctypes = re->tables + ctypes_offset;
4907
4908 /* The ims options can vary during the matching as a result of the presence
4909 of (?ims) items in the pattern. They are kept in a local variable so that
4910 restoring at the exit of a group is easy. */
4911
4912 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4913
4914 /* If the expression has got more back references than the offsets supplied can
4915 hold, we get a temporary bit of working store to use during the matching.
4916 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4917 of 3. */
4918
4919 ocount = offsetcount - (offsetcount % 3);
4920
4921 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4922 {
4923 ocount = re->top_backref * 3 + 3;
4924 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4925 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4926 using_temporary_offsets = TRUE;
4927 DPRINTF(("Got memory to hold back references\n"));
4928 }
4929 else match_block.offset_vector = offsets;
4930
4931 match_block.offset_end = ocount;
4932 match_block.offset_max = (2*ocount)/3;
4933 match_block.offset_overflow = FALSE;
4934
4935 /* Compute the minimum number of offsets that we need to reset each time. Doing
4936 this makes a huge difference to execution time when there aren't many brackets
4937 in the pattern. */
4938
4939 resetcount = 2 + re->top_bracket * 2;
4940 if (resetcount > offsetcount) resetcount = ocount;
4941
4942 /* Reset the working variable associated with each extraction. These should
4943 never be used unless previously set, but they get saved and restored, and so we
4944 initialize them to avoid reading uninitialized locations. */
4945
4946 if (match_block.offset_vector != NULL)
4947 {
4948 register int *iptr = match_block.offset_vector + ocount;
4949 register int *iend = iptr - resetcount/2 + 1;
4950 while (--iptr >= iend) *iptr = -1;
4951 }
4952
4953 /* Set up the first character to match, if available. The first_char value is
4954 never set for an anchored regular expression, but the anchoring may be forced
4955 at run time, so we have to test for anchoring. The first char may be unset for
4956 an unanchored pattern, of course. If there's no first char and the pattern was
4957 studied, there may be a bitmap of possible first characters. */
4958
4959 if (!anchored)
4960 {
4961 if ((re->options & PCRE_FIRSTSET) != 0)
4962 {
4963 first_char = re->first_char;
4964 if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
4965 }
4966 else
4967 if (!startline && extra != NULL &&
4968 (extra->options & PCRE_STUDY_MAPPED) != 0)
4969 start_bits = extra->start_bits;
4970 }
4971
4972 /* For anchored or unanchored matches, there may be a "last known required
4973 character" set. If the PCRE_CASELESS is set, implying that the match starts
4974 caselessly, or if there are any changes of this flag within the regex, set up
4975 both cases of the character. Otherwise set the two values the same, which will
4976 avoid duplicate testing (which takes significant time). This covers the vast
4977 majority of cases. It will be suboptimal when the case flag changes in a regex
4978 and the required character in fact is caseful. */
4979
4980 if ((re->options & PCRE_REQCHSET) != 0)
4981 {
4982 req_char = re->req_char;
4983 req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
4984 (re->tables + fcc_offset)[req_char] : req_char;
4985 }
4986
4987 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
4988 the loop runs just once. */
4989
4990 do
4991 {
4992 int rc;
4993 register int *iptr = match_block.offset_vector;
4994 register int *iend = iptr + resetcount;
4995
4996 /* Reset the maximum number of extractions we might see. */
4997
4998 while (iptr < iend) *iptr++ = -1;
4999
5000 /* Advance to a unique first char if possible */
5001
5002 if (first_char >= 0)
5003 {
5004 if ((ims & PCRE_CASELESS) != 0)
5005 while (start_match < end_subject &&
5006 match_block.lcc[*start_match] != first_char)
5007 start_match++;
5008 else
5009 while (start_match < end_subject && *start_match != first_char)
5010 start_match++;
5011 }
5012
5013 /* Or to just after \n for a multiline match if possible */
5014
5015 else if (startline)
5016 {
5017 if (start_match > match_block.start_subject + start_offset)
5018 {
5019 while (start_match < end_subject && start_match[-1] != '\n')
5020 start_match++;
5021 }
5022 }
5023
5024 /* Or to a non-unique first char after study */
5025
5026 else if (start_bits != NULL)
5027 {
5028 while (start_match < end_subject)
5029 {
5030 register int c = *start_match;
5031 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
5032 }
5033 }
5034
5035 #ifdef DEBUG /* Sigh. Some compilers never learn. */
5036 printf(">>>> Match against: ");
5037 pchars(start_match, end_subject - start_match, TRUE, &match_block);
5038 printf("\n");
5039 #endif
5040
5041 /* If req_char is set, we know that that character must appear in the subject
5042 for the match to succeed. If the first character is set, req_char must be
5043 later in the subject; otherwise the test starts at the match point. This
5044 optimization can save a huge amount of backtracking in patterns with nested
5045 unlimited repeats that aren't going to match. We don't know what the state of
5046 case matching may be when this character is hit, so test for it in both its
5047 cases if necessary. However, the different cased versions will not be set up
5048 unless PCRE_CASELESS was given or the casing state changes within the regex.
5049 Writing separate code makes it go faster, as does using an autoincrement and
5050 backing off on a match. */
5051
5052 if (req_char >= 0)
5053 {
5054 register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
5055
5056 /* We don't need to repeat the search if we haven't yet reached the
5057 place we found it at last time. */
5058
5059 if (p > req_char_ptr)
5060 {
5061 /* Do a single test if no case difference is set up */
5062
5063 if (req_char == req_char2)
5064 {
5065 while (p < end_subject)
5066 {
5067 if (*p++ == req_char) { p--; break; }
5068 }
5069 }
5070
5071 /* Otherwise test for either case */
5072
5073 else
5074 {
5075 while (p < end_subject)
5076 {
5077 register int pp = *p++;
5078 if (pp == req_char || pp == req_char2) { p--; break; }
5079 }
5080 }
5081
5082 /* If we can't find the required character, break the matching loop */
5083
5084 if (p >= end_subject) break;
5085
5086 /* If we have found the required character, save the point where we
5087 found it, so that we don't search again next time round the loop if
5088 the start hasn't passed this character yet. */
5089
5090 req_char_ptr = p;
5091 }
5092 }
5093
5094 /* When a match occurs, substrings will be set for all internal extractions;
5095 we just need to set up the whole thing as substring 0 before returning. If
5096 there were too many extractions, set the return code to zero. In the case
5097 where we had to get some local store to hold offsets for backreferences, copy
5098 those back references that we can. In this case there need not be overflow
5099 if certain parts of the pattern were not used. */
5100
5101 match_block.start_match = start_match;
5102 if (!match(start_match, re->code, 2, &match_block, ims, NULL, match_isgroup))
5103 continue;
5104
5105 /* Copy the offset information from temporary store if necessary */
5106
5107 if (using_temporary_offsets)
5108 {
5109 if (offsetcount >= 4)
5110 {
5111 memcpy(offsets + 2, match_block.offset_vector + 2,
5112 (offsetcount - 2) * sizeof(int));
5113 DPRINTF(("Copied offsets from temporary memory\n"));
5114 }
5115 if (match_block.end_offset_top > offsetcount)
5116 match_block.offset_overflow = TRUE;
5117
5118 DPRINTF(("Freeing temporary memory\n"));
5119 (pcre_free)(match_block.offset_vector);
5120 }
5121
5122 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
5123
5124 if (match_block.offset_end < 2) rc = 0; else
5125 {
5126 offsets[0] = start_match - match_block.start_subject;
5127 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
5128 }
5129
5130 DPRINTF((">>>> returning %d\n", rc));
5131 return rc;
5132 }
5133
5134 /* This "while" is the end of the "do" above */
5135
5136 while (!anchored &&
5137 match_block.errorcode == PCRE_ERROR_NOMATCH &&
5138 start_match++ < end_subject);
5139
5140 if (using_temporary_offsets)
5141 {
5142 DPRINTF(("Freeing temporary memory\n"));
5143 (pcre_free)(match_block.offset_vector);
5144 }
5145
5146 DPRINTF((">>>> returning %d\n", match_block.errorcode));
5147
5148 return match_block.errorcode;
5149 }
5150
5151 /* End of pcre.c */

  ViewVC Help
Powered by ViewVC 1.1.5