/[pcre]/code/trunk/pcre.c
ViewVC logotype

Contents of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 65 - (show annotations)
Sat Feb 24 21:40:08 2007 UTC (12 years, 6 months ago) by nigel
File MIME type: text/plain
File size: 228134 byte(s)
Error occurred while calculating annotation data.
Load pcre-4.1 into code/trunk.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1997-2003 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28
29 4. If PCRE is embedded in any software that is released under the GNU
30 General Purpose Licence (GPL), then the terms of that licence shall
31 supersede any condition above with which it is incompatible.
32 -----------------------------------------------------------------------------
33 */
34
35 /* Define DEBUG to get debugging output on stdout. */
36
37 /* #define DEBUG */
38
39 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
40 inline, and there are *still* stupid compilers about that don't like indented
41 pre-processor statements. I suppose it's only been 10 years... */
42
43 #ifdef DEBUG
44 #define DPRINTF(p) printf p
45 #else
46 #define DPRINTF(p) /*nothing*/
47 #endif
48
49 /* Include the internals header, which itself includes Standard C headers plus
50 the external pcre header. */
51
52 #include "internal.h"
53
54
55 /* Allow compilation as C++ source code, should anybody want to do that. */
56
57 #ifdef __cplusplus
58 #define class pcre_class
59 #endif
60
61
62 /* Maximum number of items on the nested bracket stacks at compile time. This
63 applies to the nesting of all kinds of parentheses. It does not limit
64 un-nested, non-capturing parentheses. This number can be made bigger if
65 necessary - it is used to dimension one int and one unsigned char vector at
66 compile time. */
67
68 #define BRASTACK_SIZE 200
69
70
71 /* Maximum number of ints of offset to save on the stack for recursive calls.
72 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
73 because the offset vector is always a multiple of 3 long. */
74
75 #define REC_STACK_SAVE_MAX 30
76
77
78 /* The number of bytes in a literal character string above which we can't add
79 any more is set at 250 in order to allow for UTF-8 characters. (In theory it
80 could be 255 when UTF-8 support is excluded, but that means that some of the
81 test output would be different, which just complicates things.) */
82
83 #define MAXLIT 250
84
85
86 /* The maximum remaining length of subject we are prepared to search for a
87 req_byte match. */
88
89 #define REQ_BYTE_MAX 1000
90
91
92 /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
93 the definition is next to the definition of the opcodes in internal.h. */
94
95 static uschar OP_lengths[] = { OP_LENGTHS };
96
97 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
98
99 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
100 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
101
102 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
103 are simple data values; negative values are for special things like \d and so
104 on. Zero means further processing is needed (for things like \x), or the escape
105 is invalid. */
106
107 static const short int escapes[] = {
108 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
109 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
110 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
111 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
112 0, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
113 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
114 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
115 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
116 0, 0, ESC_r, -ESC_s, ESC_t, 0, 0, -ESC_w, /* p - w */
117 0, 0, -ESC_z /* x - z */
118 };
119
/* Tables of names of POSIX character classes and their lengths. The list is
terminated by a zero length entry. The first three must be alpha, lower, upper,
as this is assumed for handling case independence. */
123
124 static const char *posix_names[] = {
125 "alpha", "lower", "upper",
126 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
127 "print", "punct", "space", "word", "xdigit" };
128
129 static const uschar posix_name_lengths[] = {
130 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
131
132 /* Table of class bit maps for each POSIX class; up to three may be combined
133 to form the class. The table for [:blank:] is dynamically modified to remove
134 the vertical space characters. */
135
136 static const int posix_class_maps[] = {
137 cbit_lower, cbit_upper, -1, /* alpha */
138 cbit_lower, -1, -1, /* lower */
139 cbit_upper, -1, -1, /* upper */
140 cbit_digit, cbit_lower, cbit_upper, /* alnum */
141 cbit_print, cbit_cntrl, -1, /* ascii */
142 cbit_space, -1, -1, /* blank - a GNU extension */
143 cbit_cntrl, -1, -1, /* cntrl */
144 cbit_digit, -1, -1, /* digit */
145 cbit_graph, -1, -1, /* graph */
146 cbit_print, -1, -1, /* print */
147 cbit_punct, -1, -1, /* punct */
148 cbit_space, -1, -1, /* space */
149 cbit_word, -1, -1, /* word - a Perl extension */
150 cbit_xdigit,-1, -1 /* xdigit */
151 };
152
153
154 /* Definition to allow mutual recursion */
155
156 static BOOL
157 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
158 BOOL, int, int *, int *, branch_chain *, compile_data *);
159
160 /* Structure for building a chain of data that actually lives on the
161 stack, for holding the values of the subject pointer at the start of each
162 subpattern, so as to detect when an empty string has been matched by a
163 subpattern - to break infinite loops. */
164
/* One link in a chain of saved subject pointers. Each block points back to
the one before it, so the chain can be walked from the newest entry. */

typedef struct eptrblock {
  struct eptrblock *prev;       /* previous block in the chain */
  const uschar *saved_eptr;     /* subject pointer saved for this subpattern */
} eptrblock;
169
170 /* Flag bits for the match() function */
171
172 #define match_condassert 0x01 /* Called to check a condition assertion */
173 #define match_isgroup 0x02 /* Set if start of bracketed group */
174
175 /* Non-error returns from the match() function. Error returns are externally
176 defined PCRE_ERROR_xxx codes, which are all negative. */
177
178 #define MATCH_MATCH 1
179 #define MATCH_NOMATCH 0
180
181
182
183 /*************************************************
184 * Global variables *
185 *************************************************/
186
187 /* PCRE is thread-clean and doesn't use any global variables in the normal
188 sense. However, it calls memory allocation and free functions via the two
189 indirections below, and it can optionally do callouts. These values can be
190 changed by the caller, but are shared between all threads. However, when
191 compiling for Virtual Pascal, things are done differently (see pcre.in). */
192
193 #ifndef VPCOMPAT
194 void *(*pcre_malloc)(size_t) = malloc;
195 void (*pcre_free)(void *) = free;
196 int (*pcre_callout)(pcre_callout_block *) = NULL;
197 #endif
198
199
200 /*************************************************
201 * Macros and tables for character handling *
202 *************************************************/
203
204 /* When UTF-8 encoding is being used, a character is no longer just a single
205 byte. The macros for character handling generate simple sequences when used in
206 byte-mode, and more complicated ones for UTF-8 characters. */
207
208 #ifndef SUPPORT_UTF8
209 #define GETCHAR(c, eptr) c = *eptr;
210 #define GETCHARINC(c, eptr) c = *eptr++;
211 #define GETCHARINCTEST(c, eptr) c = *eptr++;
212 #define GETCHARLEN(c, eptr, len) c = *eptr;
213 #define BACKCHAR(eptr)
214
215 #else /* SUPPORT_UTF8 */
216
217 /* Get the next UTF-8 character, not advancing the pointer. This is called when
218 we know we are in UTF-8 mode. */
219
220 #define GETCHAR(c, eptr) \
221 c = *eptr; \
222 if ((c & 0xc0) == 0xc0) \
223 { \
224 int i; \
225 int a = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
226 int s = 6*a; \
227 c = (c & utf8_table3[a]) << s; \
228 for (i = 1; i <= a; i++) \
229 { \
230 s -= 6; \
231 c |= (eptr[i] & 0x3f) << s; \
232 } \
233 }
234
235 /* Get the next UTF-8 character, advancing the pointer. This is called when we
236 know we are in UTF-8 mode. */
237
238 #define GETCHARINC(c, eptr) \
239 c = *eptr++; \
240 if ((c & 0xc0) == 0xc0) \
241 { \
242 int a = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
243 int s = 6*a; \
244 c = (c & utf8_table3[a]) << s; \
245 while (a-- > 0) \
246 { \
247 s -= 6; \
248 c |= (*eptr++ & 0x3f) << s; \
249 } \
250 }
251
252 /* Get the next character, testing for UTF-8 mode, and advancing the pointer */
253
254 #define GETCHARINCTEST(c, eptr) \
255 c = *eptr++; \
256 if (md->utf8 && (c & 0xc0) == 0xc0) \
257 { \
258 int a = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
259 int s = 6*a; \
260 c = (c & utf8_table3[a]) << s; \
261 while (a-- > 0) \
262 { \
263 s -= 6; \
264 c |= (*eptr++ & 0x3f) << s; \
265 } \
266 }
267
268 /* Get the next UTF-8 character, not advancing the pointer, incrementing length
269 if there are extra bytes. This is called when we know we are in UTF-8 mode. */
270
271 #define GETCHARLEN(c, eptr, len) \
272 c = *eptr; \
273 if ((c & 0xc0) == 0xc0) \
274 { \
275 int i; \
276 int a = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
277 int s = 6*a; \
278 c = (c & utf8_table3[a]) << s; \
279 for (i = 1; i <= a; i++) \
280 { \
281 s -= 6; \
282 c |= (eptr[i] & 0x3f) << s; \
283 } \
284 len += a; \
285 }
286
287 /* If the pointer is not at the start of a character, move it back until
288 it is. Called only in UTF-8 mode. */
289
290 #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
291
292 #endif
293
294
295
296 /*************************************************
297 * Default character tables *
298 *************************************************/
299
300 /* A default set of character tables is included in the PCRE binary. Its source
301 is built by the maketables auxiliary program, which uses the default C ctypes
302 functions, and put in the file chartables.c. These tables are used by PCRE
303 whenever the caller of pcre_compile() does not provide an alternate set of
304 tables. */
305
306 #include "chartables.c"
307
308
309
310 #ifdef SUPPORT_UTF8
311 /*************************************************
312 * Tables for UTF-8 support *
313 *************************************************/
314
315 /* These are the breakpoints for different numbers of bytes in a UTF-8
316 character. */
317
318 static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
319
320 /* These are the indicator bits and the mask for the data bits to set in the
321 first byte of a character, indexed by the number of additional bytes. */
322
323 static int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
324 static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
325
/* Table of the number of extra bytes, indexed by the first byte of the
character masked with 0x3f. The highest number for a valid UTF-8 character is
in fact 0x3d. */
329
330 static uschar utf8_table4[] = {
331 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
332 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
333 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
334 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
335
336
337 /*************************************************
338 * Convert character value to UTF-8 *
339 *************************************************/
340
/* This function takes an integer value in the range 0 - 0x7fffffff
and encodes it as a UTF-8 character in 1 to 6 bytes.

Arguments:
cvalue the character value
buffer pointer to buffer for result - at least 6 bytes long

Returns: number of bytes placed in the buffer
*/
350
351 static int
352 ord2utf8(int cvalue, uschar *buffer)
353 {
354 register int i, j;
355 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
356 if (cvalue <= utf8_table1[i]) break;
357 buffer += i;
358 for (j = i; j > 0; j--)
359 {
360 *buffer-- = 0x80 | (cvalue & 0x3f);
361 cvalue >>= 6;
362 }
363 *buffer = utf8_table2[i] | cvalue;
364 return i + 1;
365 }
366 #endif
367
368
369
370 /*************************************************
371 * Print compiled regex *
372 *************************************************/
373
374 /* The code for doing this is held in a separate file that is also included in
375 pcretest.c. It defines a function called print_internals(). */
376
377 #ifdef DEBUG
378 #include "printint.c"
379 #endif
380
381
382
383 /*************************************************
384 * Return version string *
385 *************************************************/
386
/* Two levels of macro are needed so that the argument is expanded before it is
turned into a string. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* Returns a fixed string of the form "major.minor date". */

const char *
pcre_version(void)
{
static const char version_text[] =
  XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
return version_text;
}
395
396
397
398
399 /*************************************************
400 * (Obsolete) Return info about compiled pattern *
401 *************************************************/
402
/* This is the original "info" function. It picks potentially useful data out
of the private structure, but its interface was too rigid. It remains for
backwards compatibility. The public options are passed back in an int - though
the re->options field has been expanded to a long int, all the public options
are at the low end of it, and so even on 16-bit systems this will still be OK.
Therefore, I haven't changed the API for pcre_info().
409
410 Arguments:
411 external_re points to compiled code
412 optptr where to pass back the options
413 first_byte where to pass back the first character,
414 or -1 if multiline and all branches start ^,
415 or -2 otherwise
416
417 Returns: number of capturing subpatterns
418 or negative values on error
419 */
420
421 int
422 pcre_info(const pcre *external_re, int *optptr, int *first_byte)
423 {
424 const real_pcre *re = (const real_pcre *)external_re;
425 if (re == NULL) return PCRE_ERROR_NULL;
426 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
427 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
428 if (first_byte != NULL)
429 *first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
430 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
431 return re->top_bracket;
432 }
433
434
435
436 /*************************************************
437 * Return info about compiled pattern *
438 *************************************************/
439
440 /* This is a newer "info" function which has an extensible interface so
441 that additional items can be added compatibly.
442
443 Arguments:
444 external_re points to compiled code
extra_data points to extra data, or NULL
446 what what information is required
447 where where to put the information
448
449 Returns: 0 if data returned, negative on error
450 */
451
452 int
453 pcre_fullinfo(const pcre *external_re, const pcre_extra *extra_data, int what,
454 void *where)
455 {
456 const real_pcre *re = (const real_pcre *)external_re;
457 const pcre_study_data *study = NULL;
458
459 if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
460 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
461
462 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
463 study = extra_data->study_data;
464
465 switch (what)
466 {
467 case PCRE_INFO_OPTIONS:
468 *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
469 break;
470
471 case PCRE_INFO_SIZE:
472 *((size_t *)where) = re->size;
473 break;
474
475 case PCRE_INFO_STUDYSIZE:
476 *((size_t *)where) = (study == NULL)? 0 : study->size;
477 break;
478
479 case PCRE_INFO_CAPTURECOUNT:
480 *((int *)where) = re->top_bracket;
481 break;
482
483 case PCRE_INFO_BACKREFMAX:
484 *((int *)where) = re->top_backref;
485 break;
486
487 case PCRE_INFO_FIRSTBYTE:
488 *((int *)where) =
489 ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
490 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
491 break;
492
493 case PCRE_INFO_FIRSTTABLE:
494 *((const uschar **)where) =
495 (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
496 study->start_bits : NULL;
497 break;
498
499 case PCRE_INFO_LASTLITERAL:
500 *((int *)where) =
501 ((re->options & PCRE_REQCHSET) != 0)? re->req_byte : -1;
502 break;
503
504 case PCRE_INFO_NAMEENTRYSIZE:
505 *((int *)where) = re->name_entry_size;
506 break;
507
508 case PCRE_INFO_NAMECOUNT:
509 *((int *)where) = re->name_count;
510 break;
511
512 case PCRE_INFO_NAMETABLE:
513 *((const uschar **)where) = (const uschar *)re + sizeof(real_pcre);
514 break;
515
516 default: return PCRE_ERROR_BADOPTION;
517 }
518
519 return 0;
520 }
521
522
523
524 /*************************************************
525 * Return info about what features are configured *
526 *************************************************/
527
/* This is a function which has an extensible interface so that additional
items can be added compatibly.
530
531 Arguments:
532 what what information is required
533 where where to put the information
534
535 Returns: 0 if data returned, negative on error
536 */
537
538 int
539 pcre_config(int what, void *where)
540 {
541 switch (what)
542 {
543 case PCRE_CONFIG_UTF8:
544 #ifdef SUPPORT_UTF8
545 *((int *)where) = 1;
546 #else
547 *((int *)where) = 0;
548 #endif
549 break;
550
551 case PCRE_CONFIG_NEWLINE:
552 *((int *)where) = NEWLINE;
553 break;
554
555 case PCRE_CONFIG_LINK_SIZE:
556 *((int *)where) = LINK_SIZE;
557 break;
558
559 case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:
560 *((int *)where) = POSIX_MALLOC_THRESHOLD;
561 break;
562
563 case PCRE_CONFIG_MATCH_LIMIT:
564 *((unsigned int *)where) = MATCH_LIMIT;
565 break;
566
567 default: return PCRE_ERROR_BADOPTION;
568 }
569
570 return 0;
571 }
572
573
574
575 #ifdef DEBUG
576 /*************************************************
577 * Debugging function to print chars *
578 *************************************************/
579
/* Print a sequence of chars in printable format, stopping at the end of the
subject if requested.
582
583 Arguments:
584 p points to characters
585 length number to print
586 is_subject TRUE if printing from within md->start_subject
587 md pointer to matching data block, if is_subject is TRUE
588
589 Returns: nothing
590 */
591
592 static void
593 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
594 {
595 int c;
596 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
597 while (length-- > 0)
598 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
599 }
600 #endif
601
602
603
604
605 /*************************************************
606 * Handle escapes *
607 *************************************************/
608
609 /* This function is called when a \ has been encountered. It either returns a
610 positive value for a simple escape such as \n, or a negative value which
611 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
612 a positive value greater than 255 may be returned. On entry, ptr is pointing at
613 the \. On exit, it is on the final character of the escape sequence.
614
615 Arguments:
616 ptrptr points to the pattern position pointer
617 errorptr points to the pointer to the error message
618 bracount number of previous extracting brackets
619 options the options bits
620 isclass TRUE if inside a character class
621 cd pointer to char tables block
622
623 Returns: zero or positive => a data character
624 negative => a special escape sequence
625 on error, errorptr is set
626 */
627
static int
check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
  int options, BOOL isclass, compile_data *cd)
{
const uschar *ptr = *ptrptr;
int c, i;

/* If backslash is at the end of the pattern, it's an error. */

c = *(++ptr);
if (c == 0) *errorptr = ERR1;

/* Digits or letters may have special meaning; all others are literals. The
escapes[] table covers exactly the range '0' to 'z', so this test also keeps
the table index below in range. */

else if (c < '0' || c > 'z') {}

/* Do an initial lookup in a table. A non-zero result is something that can be
returned immediately. Otherwise further processing may be required. */

else if ((i = escapes[c - '0']) != 0) c = i;

/* Escapes that need further processing, or are illegal. Control gets here
only when the table lookup above stored zero in i. The octal and hex digit
loops below depend on that, because they count digits with i++ without setting
i themselves. */

else
  {
  const uschar *oldptr;
  switch (c)
    {
    /* A number of Perl escapes are not handled by PCRE. We give an explicit
    error. */

    case 'l':
    case 'L':
    case 'N':
    case 'p':
    case 'P':
    case 'u':
    case 'U':
    case 'X':
    *errorptr = ERR37;
    break;

    /* The handling of escape sequences consisting of a string of digits
    starting with one that is not zero is not straightforward. By experiment,
    the way Perl works seems to be as follows:

    Outside a character class, the digits are read as a decimal number. If the
    number is less than 10, or if there are that many previous extracting
    left brackets, then it is a back reference. Otherwise, up to three octal
    digits are read to form an escaped byte. Thus \123 is likely to be octal
    123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
    value is greater than 377, the least significant 8 bits are taken. Inside a
    character class, \ followed by a digit is always an octal number. */

    case '1': case '2': case '3': case '4': case '5':
    case '6': case '7': case '8': case '9':

    if (!isclass)
      {
      oldptr = ptr;
      c -= '0';

      /* NOTE(review): nothing bounds the number of digits read here, so a
      very long digit string looks able to overflow the int - confirm. */

      while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
        c = c * 10 + *(++ptr) - '0';

      /* A back reference is returned as a negative value based on ESC_REF */

      if (c < 10 || c <= bracount)
        {
        c = -(ESC_REF + c);
        break;
        }
      ptr = oldptr;      /* Put the pointer back and fall through */
      }

    /* Handle an octal number following \. If the first digit is 8 or 9, Perl
    generates a binary zero byte and treats the digit as a following literal.
    Thus we have to pull back the pointer by one. */

    if ((c = *ptr) >= '8')
      {
      ptr--;
      c = 0;
      break;
      }

    /* \0 always starts an octal number, but we may drop through to here with a
    larger first octal digit. At most two further octal digits are read. */

    case '0':
    c -= '0';
    while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
      ptr[1] != '8' && ptr[1] != '9')
        c = c * 8 + *(++ptr) - '0';
    c &= 255;     /* Take least significant 8 bits */
    break;

    /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
    which can be greater than 0xff, but only if the ddd are hex digits. */

    case 'x':
#ifdef SUPPORT_UTF8
    if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
      {
      const uschar *pt = ptr + 2;
      register int count = 0;
      c = 0;

      /* In ASCII 'W' is 'a' - 10, so subtracting it from a lower-case hex
      letter gives 10 to 15. This presumably relies on cd->lcc mapping letters
      to lower case - confirm against the tables.
      NOTE(review): the value is accumulated for every digit before the count
      is tested below, so a long run of digits looks able to overflow the
      int - confirm. */

      while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
        {
        count++;
        c = c * 16 + cd->lcc[*pt] -
          (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');
        pt++;
        }
      if (*pt == '}')
        {
        if (c < 0 || count > 8) *errorptr = ERR34;
        ptr = pt;
        break;
        }
      /* If the sequence of hex digits does not end with '}', then we don't
      recognize this construct; fall through to the normal \x handling. */
      }
#endif

    /* Read up to two hex digits. If there are none, the result is zero. */

    c = 0;
    while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
      {
      ptr++;
      c = c * 16 + cd->lcc[*ptr] -
        (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
      }
    break;

    /* Other special escapes not starting with a digit are straightforward */

    case 'c':
    c = *(++ptr);

    /* \c at the very end of the pattern is an error. Note that this return
    does not pass the updated pointer back through ptrptr. */

    if (c == 0)
      {
      *errorptr = ERR2;
      return 0;
      }

    /* A letter is upper-cased; then the 0x40 bit is flipped */

    if (c >= 'a' && c <= 'z') c = cd->fcc[c];
    c ^= 0x40;
    break;

    /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
    other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
    for Perl compatibility, it is a literal. This code looks a bit odd, but
    there used to be some cases other than the default, and there may be again
    in future, so I haven't "optimized" it. */

    default:
    if ((options & PCRE_EXTRA) != 0) switch(c)
      {
      default:
      *errorptr = ERR3;
      break;
      }
    break;
    }
  }

/* Pass back the position of the last character of the escape sequence */

*ptrptr = ptr;
return c;
}
796
797
798
799 /*************************************************
800 * Check for counted repeat *
801 *************************************************/
802
803 /* This function is called when a '{' is encountered in a place where it might
804 start a quantifier. It looks ahead to see if it really is a quantifier or not.
805 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
806 where the ddds are digits.
807
808 Arguments:
809 p pointer to the first char after '{'
810 cd pointer to char tables block
811
812 Returns: TRUE or FALSE
813 */
814
815 static BOOL
816 is_counted_repeat(const uschar *p, compile_data *cd)
817 {
818 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
819 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
820 if (*p == '}') return TRUE;
821
822 if (*p++ != ',') return FALSE;
823 if (*p == '}') return TRUE;
824
825 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
826 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
827 return (*p == '}');
828 }
829
830
831
832 /*************************************************
833 * Read repeat counts *
834 *************************************************/
835
836 /* Read an item of the form {n,m} and return the values. This is called only
837 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
838 so the syntax is guaranteed to be correct, but we need to check the values.
839
840 Arguments:
841 p pointer to first char after '{'
842 minp pointer to int for min
843 maxp pointer to int for max
844 returned as -1 if no max
845 errorptr points to pointer to error message
cd pointer to character tables block
847
848 Returns: pointer to '}' on success;
849 current ptr on error, with errorptr set
850 */
851
852 static const uschar *
853 read_repeat_counts(const uschar *p, int *minp, int *maxp,
854 const char **errorptr, compile_data *cd)
855 {
856 int min = 0;
857 int max = -1;
858
859 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
860
861 if (*p == '}') max = min; else
862 {
863 if (*(++p) != '}')
864 {
865 max = 0;
866 while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
867 if (max < min)
868 {
869 *errorptr = ERR4;
870 return p;
871 }
872 }
873 }
874
875 /* Do paranoid checks, then fill in the required variables, and pass back the
876 pointer to the terminating '}'. */
877
878 if (min > 65535 || max > 65535)
879 *errorptr = ERR5;
880 else
881 {
882 *minp = min;
883 *maxp = max;
884 }
885 return p;
886 }
887
888
889
890 /*************************************************
891 * Find first significant op code *
892 *************************************************/
893
894 /* This is called by several functions that scan a compiled expression looking
895 for a fixed first character, or an anchoring op code etc. It skips over things
896 that do not influence this. For some calls, a change of option is important.
897
898 Arguments:
899 code pointer to the start of the group
900 options pointer to external options
901 optbit the option bit whose changing is significant, or
902 zero if none are
903
904 Returns: pointer to the first significant opcode
905 */
906
907 static const uschar*
908 first_significant_code(const uschar *code, int *options, int optbit)
909 {
910 for (;;)
911 {
912 switch ((int)*code)
913 {
914 case OP_OPT:
915 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
916 *options = (int)code[1];
917 code += 2;
918 break;
919
920 case OP_ASSERT_NOT:
921 case OP_ASSERTBACK:
922 case OP_ASSERTBACK_NOT:
923 do code += GET(code, 1); while (*code == OP_ALT);
924 /* Fall through */
925
926 case OP_CALLOUT:
927 case OP_CREF:
928 case OP_BRANUMBER:
929 case OP_WORD_BOUNDARY:
930 case OP_NOT_WORD_BOUNDARY:
931 code += OP_lengths[*code];
932 break;
933
934 default:
935 return code;
936 }
937 }
938 /* Control never reaches here */
939 }
940
941
942
943
944 /*************************************************
945 * Find the fixed length of a pattern *
946 *************************************************/
947
948 /* Scan a pattern and compute the fixed length of subject that will match it,
949 if the length is fixed. This is needed for dealing with backward assertions.
950 In UTF8 mode, the result is in characters rather than bytes.
951
952 Arguments:
953 code points to the start of the pattern (the bracket)
954 options the compiling options
955
956 Returns: the fixed length, or -1 if there is no fixed length,
957 or -2 if \C was encountered
958 */
959
960 static int
961 find_fixedlength(uschar *code, int options)
962 {
963 int length = -1;
964
965 register int branchlength = 0;
966 register uschar *cc = code + 1 + LINK_SIZE;
967
968 /* Scan along the opcodes for this branch. If we get to the end of the
969 branch, check the length against that of the other branches. */
970
971 for (;;)
972 {
973 int d;
974 register int op = *cc;
975 if (op >= OP_BRA) op = OP_BRA;
976
977 switch (op)
978 {
979 case OP_BRA:
980 case OP_ONCE:
981 case OP_COND:
982 d = find_fixedlength(cc, options);
983 if (d < 0) return d;
984 branchlength += d;
985 do cc += GET(cc, 1); while (*cc == OP_ALT);
986 cc += 1 + LINK_SIZE;
987 break;
988
989 /* Reached end of a branch; if it's a ket it is the end of a nested
990 call. If it's ALT it is an alternation in a nested call. If it is
991 END it's the end of the outer call. All can be handled by the same code. */
992
993 case OP_ALT:
994 case OP_KET:
995 case OP_KETRMAX:
996 case OP_KETRMIN:
997 case OP_END:
998 if (length < 0) length = branchlength;
999 else if (length != branchlength) return -1;
1000 if (*cc != OP_ALT) return length;
1001 cc += 1 + LINK_SIZE;
1002 branchlength = 0;
1003 break;
1004
1005 /* Skip over assertive subpatterns */
1006
1007 case OP_ASSERT:
1008 case OP_ASSERT_NOT:
1009 case OP_ASSERTBACK:
1010 case OP_ASSERTBACK_NOT:
1011 do cc += GET(cc, 1); while (*cc == OP_ALT);
1012 /* Fall through */
1013
1014 /* Skip over things that don't match chars */
1015
1016 case OP_REVERSE:
1017 case OP_BRANUMBER:
1018 case OP_CREF:
1019 case OP_OPT:
1020 case OP_CALLOUT:
1021 case OP_SOD:
1022 case OP_SOM:
1023 case OP_EOD:
1024 case OP_EODN:
1025 case OP_CIRC:
1026 case OP_DOLL:
1027 case OP_NOT_WORD_BOUNDARY:
1028 case OP_WORD_BOUNDARY:
1029 cc += OP_lengths[*cc];
1030 break;
1031
1032 /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
1033 This requires a scan of the string, unfortunately. We assume valid UTF-8
1034 strings, so all we do is reduce the length by one for every byte whose bits
1035 are 10xxxxxx. */
1036
1037 case OP_CHARS:
1038 branchlength += *(++cc);
1039 #ifdef SUPPORT_UTF8
1040 if ((options & PCRE_UTF8) != 0)
1041 for (d = 1; d <= *cc; d++)
1042 if ((cc[d] & 0xc0) == 0x80) branchlength--;
1043 #endif
1044 cc += *cc + 1;
1045 break;
1046
1047 /* Handle exact repetitions. The count is already in characters, but we
1048 need to skip over a multibyte character in UTF8 mode. */
1049
1050 case OP_EXACT:
1051 branchlength += GET2(cc,1);
1052 cc += 4;
1053 #ifdef SUPPORT_UTF8
1054 if ((options & PCRE_UTF8) != 0)
1055 {
1056 while((*cc & 0x80) == 0x80) cc++;
1057 }
1058 #endif
1059 break;
1060
1061 case OP_TYPEEXACT:
1062 branchlength += GET2(cc,1);
1063 cc += 4;
1064 break;
1065
1066 /* Handle single-char matchers */
1067
1068 case OP_NOT_DIGIT:
1069 case OP_DIGIT:
1070 case OP_NOT_WHITESPACE:
1071 case OP_WHITESPACE:
1072 case OP_NOT_WORDCHAR:
1073 case OP_WORDCHAR:
1074 case OP_ANY:
1075 branchlength++;
1076 cc++;
1077 break;
1078
1079 /* The single-byte matcher isn't allowed */
1080
1081 case OP_ANYBYTE:
1082 return -2;
1083
1084 /* Check a class for variable quantification */
1085
1086 #ifdef SUPPORT_UTF8
1087 case OP_XCLASS:
1088 cc += GET(cc, 1) - 33;
1089 /* Fall through */
1090 #endif
1091
1092 case OP_CLASS:
1093 case OP_NCLASS:
1094 cc += 33;
1095
1096 switch (*cc)
1097 {
1098 case OP_CRSTAR:
1099 case OP_CRMINSTAR:
1100 case OP_CRQUERY:
1101 case OP_CRMINQUERY:
1102 return -1;
1103
1104 case OP_CRRANGE:
1105 case OP_CRMINRANGE:
1106 if (GET2(cc,1) != GET2(cc,3)) return -1;
1107 branchlength += GET2(cc,1);
1108 cc += 5;
1109 break;
1110
1111 default:
1112 branchlength++;
1113 }
1114 break;
1115
1116 /* Anything else is variable length */
1117
1118 default:
1119 return -1;
1120 }
1121 }
1122 /* Control never gets here */
1123 }
1124
1125
1126
1127
1128 /*************************************************
1129 * Scan compiled regex for numbered bracket *
1130 *************************************************/
1131
1132 /* This little function scans through a compiled pattern until it finds a
1133 capturing bracket with the given number.
1134
1135 Arguments:
1136 code points to start of expression
1137 utf8 TRUE in UTF-8 mode
1138 number the required bracket number
1139
1140 Returns: pointer to the opcode for the bracket, or NULL if not found
1141 */
1142
1143 static const uschar *
1144 find_bracket(const uschar *code, BOOL utf8, int number)
1145 {
1146 #ifndef SUPPORT_UTF8
1147 utf8 = utf8; /* Stop pedantic compilers complaining */
1148 #endif
1149
1150 for (;;)
1151 {
1152 register int c = *code;
1153 if (c == OP_END) return NULL;
1154 else if (c == OP_CHARS) code += code[1] + OP_lengths[c];
1155 else if (c > OP_BRA)
1156 {
1157 int n = c - OP_BRA;
1158 if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1159 if (n == number) return (uschar *)code;
1160 code += OP_lengths[OP_BRA];
1161 }
1162 else
1163 {
1164 code += OP_lengths[c];
1165
1166 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1167 by a multi-byte character. The length in the table is a minimum, so we have
1168 to scan along to skip the extra characters. All opcodes are less than 128,
1169 so we can use relatively efficient code. */
1170
1171 #ifdef SUPPORT_UTF8
1172 if (utf8) switch(c)
1173 {
1174 case OP_EXACT:
1175 case OP_UPTO:
1176 case OP_MINUPTO:
1177 case OP_STAR:
1178 case OP_MINSTAR:
1179 case OP_PLUS:
1180 case OP_MINPLUS:
1181 case OP_QUERY:
1182 case OP_MINQUERY:
1183 while ((*code & 0xc0) == 0x80) code++;
1184 break;
1185 }
1186 #endif
1187 }
1188 }
1189 }
1190
1191
1192
1193 /*************************************************
1194 * Scan compiled branch for non-emptiness *
1195 *************************************************/
1196
1197 /* This function scans through a branch of a compiled pattern to see whether it
1198 can match the empty string or not. It is called only from could_be_empty()
1199 below. Note that first_significant_code() skips over assertions. If we hit an
1200 unclosed bracket, we return "empty" - this means we've struck an inner bracket
1201 whose current branch will already have been scanned.
1202
1203 Arguments:
1204 code points to start of search
1205 endcode points to where to stop
1206 utf8 TRUE if in UTF8 mode
1207
1208 Returns: TRUE if what is matched could be empty
1209 */
1210
1211 static BOOL
1212 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1213 {
1214 register int c;
1215 for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0);
1216 code < endcode;
1217 code = first_significant_code(code + OP_lengths[c], NULL, 0))
1218 {
1219 const uschar *ccode;
1220
1221 c = *code;
1222
1223 if (c >= OP_BRA)
1224 {
1225 BOOL empty_branch;
1226 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1227
1228 /* Scan a closed bracket */
1229
1230 empty_branch = FALSE;
1231 do
1232 {
1233 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1234 empty_branch = TRUE;
1235 code += GET(code, 1);
1236 }
1237 while (*code == OP_ALT);
1238 if (!empty_branch) return FALSE; /* All branches are non-empty */
1239 code += 1 + LINK_SIZE;
1240 c = *code;
1241 }
1242
1243 else switch (c)
1244 {
1245 /* Check for quantifiers after a class */
1246
1247 #ifdef SUPPORT_UTF8
1248 case OP_XCLASS:
1249 ccode = code + GET(code, 1);
1250 goto CHECK_CLASS_REPEAT;
1251 #endif
1252
1253 case OP_CLASS:
1254 case OP_NCLASS:
1255 ccode = code + 33;
1256
1257 #ifdef SUPPORT_UTF8
1258 CHECK_CLASS_REPEAT:
1259 #endif
1260
1261 switch (*ccode)
1262 {
1263 case OP_CRSTAR: /* These could be empty; continue */
1264 case OP_CRMINSTAR:
1265 case OP_CRQUERY:
1266 case OP_CRMINQUERY:
1267 break;
1268
1269 default: /* Non-repeat => class must match */
1270 case OP_CRPLUS: /* These repeats aren't empty */
1271 case OP_CRMINPLUS:
1272 return FALSE;
1273
1274 case OP_CRRANGE:
1275 case OP_CRMINRANGE:
1276 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1277 break;
1278 }
1279 break;
1280
1281 /* Opcodes that must match a character */
1282
1283 case OP_NOT_DIGIT:
1284 case OP_DIGIT:
1285 case OP_NOT_WHITESPACE:
1286 case OP_WHITESPACE:
1287 case OP_NOT_WORDCHAR:
1288 case OP_WORDCHAR:
1289 case OP_ANY:
1290 case OP_ANYBYTE:
1291 case OP_CHARS:
1292 case OP_NOT:
1293 case OP_PLUS:
1294 case OP_MINPLUS:
1295 case OP_EXACT:
1296 case OP_NOTPLUS:
1297 case OP_NOTMINPLUS:
1298 case OP_NOTEXACT:
1299 case OP_TYPEPLUS:
1300 case OP_TYPEMINPLUS:
1301 case OP_TYPEEXACT:
1302 return FALSE;
1303
1304 /* End of branch */
1305
1306 case OP_KET:
1307 case OP_KETRMAX:
1308 case OP_KETRMIN:
1309 case OP_ALT:
1310 return TRUE;
1311
1312 /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1313 followed by a multibyte character */
1314
1315 #ifdef SUPPORT_UTF8
1316 case OP_STAR:
1317 case OP_MINSTAR:
1318 case OP_QUERY:
1319 case OP_MINQUERY:
1320 case OP_UPTO:
1321 case OP_MINUPTO:
1322 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1323 break;
1324 #endif
1325 }
1326 }
1327
1328 return TRUE;
1329 }
1330
1331
1332
1333 /*************************************************
1334 * Scan compiled regex for non-emptiness *
1335 *************************************************/
1336
1337 /* This function is called to check for left recursive calls. We want to check
1338 the current branch of the current pattern to see if it could match the empty
1339 string. If it could, we must look outwards for branches at other levels,
1340 stopping when we pass beyond the bracket which is the subject of the recursion.
1341
1342 Arguments:
1343 code points to start of the recursion
1344 endcode points to where to stop (current RECURSE item)
1345 bcptr points to the chain of current (unclosed) branch starts
1346 utf8 TRUE if in UTF-8 mode
1347
1348 Returns: TRUE if what is matched could be empty
1349 */
1350
1351 static BOOL
1352 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1353 BOOL utf8)
1354 {
1355 while (bcptr != NULL && bcptr->current >= code)
1356 {
1357 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1358 bcptr = bcptr->outer;
1359 }
1360 return TRUE;
1361 }
1362
1363
1364
1365 /*************************************************
1366 * Check for POSIX class syntax *
1367 *************************************************/
1368
1369 /* This function is called when the sequence "[:" or "[." or "[=" is
1370 encountered in a character class. It checks whether this is followed by an
1371 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1372 ".]" or "=]".
1373
1374 Argument:
1375 ptr pointer to the initial [
1376 endptr where to return the end pointer
1377 cd pointer to compile data
1378
1379 Returns: TRUE or FALSE
1380 */
1381
1382 static BOOL
1383 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1384 {
1385 int terminator; /* Don't combine these lines; the Solaris cc */
1386 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1387 if (*(++ptr) == '^') ptr++;
1388 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1389 if (*ptr == terminator && ptr[1] == ']')
1390 {
1391 *endptr = ptr;
1392 return TRUE;
1393 }
1394 return FALSE;
1395 }
1396
1397
1398
1399
1400 /*************************************************
1401 * Check POSIX class name *
1402 *************************************************/
1403
1404 /* This function is called to check the name given in a POSIX-style class entry
1405 such as [:alnum:].
1406
1407 Arguments:
1408 ptr points to the first letter
1409 len the length of the name
1410
1411 Returns: a value representing the name, or -1 if unknown
1412 */
1413
1414 static int
1415 check_posix_name(const uschar *ptr, int len)
1416 {
1417 register int yield = 0;
1418 while (posix_name_lengths[yield] != 0)
1419 {
1420 if (len == posix_name_lengths[yield] &&
1421 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1422 yield++;
1423 }
1424 return -1;
1425 }
1426
1427
1428
1429
1430 /*************************************************
1431 * Compile one branch *
1432 *************************************************/
1433
1434 /* Scan the pattern, compiling it into the code vector. If the options are
1435 changed during the branch, the pointer is used to change the external options
1436 bits.
1437
1438 Arguments:
1439 optionsptr pointer to the option bits
1440 brackets points to number of extracting brackets used
1441 code points to the pointer to the current code point
1442 ptrptr points to the current pattern pointer
1443 errorptr points to pointer to error message
1444 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1445 reqbyteptr set to the last literal character required, else < 0
1446 bcptr points to current branch chain
1447 cd contains pointers to tables etc.
1448
1449 Returns: TRUE on success
1450 FALSE, with *errorptr set on error
1451 */
1452
1453 static BOOL
1454 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1455 const uschar **ptrptr, const char **errorptr, int *firstbyteptr,
1456 int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1457 {
1458 int repeat_type, op_type;
1459 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
1460 int bravalue = 0;
1461 int length;
1462 int greedy_default, greedy_non_default;
1463 int firstbyte, reqbyte;
1464 int zeroreqbyte, zerofirstbyte;
1465 int req_caseopt, reqvary, tempreqvary;
1466 int condcount = 0;
1467 int options = *optionsptr;
1468 register int c;
1469 register uschar *code = *codeptr;
1470 uschar *tempcode;
1471 BOOL inescq = FALSE;
1472 BOOL groupsetfirstbyte = FALSE;
1473 const uschar *ptr = *ptrptr;
1474 const uschar *tempptr;
1475 uschar *previous = NULL;
1476 uschar class[32];
1477
1478 #ifdef SUPPORT_UTF8
1479 BOOL class_utf8;
1480 BOOL utf8 = (options & PCRE_UTF8) != 0;
1481 uschar *class_utf8data;
1482 uschar utf8_char[6];
1483 #else
1484 BOOL utf8 = FALSE;
1485 #endif
1486
1487 /* Set up the default and non-default settings for greediness */
1488
1489 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1490 greedy_non_default = greedy_default ^ 1;
1491
1492 /* Initialize no first char, no required char. REQ_UNSET means "no char
1493 matching encountered yet". It gets changed to REQ_NONE if we hit something that
1494 matches a non-fixed char first char; reqbyte just remains unset if we never
1495 find one.
1496
1497 When we hit a repeat whose minimum is zero, we may have to adjust these values
1498 to take the zero repeat into account. This is implemented by setting them to
1499 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1500 item types that can be repeated set these backoff variables appropriately. */
1501
1502 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1503
1504 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1505 according to the current setting of the caseless flag. REQ_CASELESS is a bit
1506 value > 255. It is added into the firstbyte or reqbyte variables to record the
1507 case status of the value. */
1508
1509 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1510
1511 /* Switch on next character until the end of the branch */
1512
1513 for (;; ptr++)
1514 {
1515 BOOL negate_class;
1516 BOOL possessive_quantifier;
1517 int class_charcount;
1518 int class_lastchar;
1519 int newoptions;
1520 int recno;
1521 int skipbytes;
1522 int subreqbyte;
1523 int subfirstbyte;
1524
1525 c = *ptr;
1526 if (inescq && c != 0) goto NORMAL_CHAR;
1527
1528 if ((options & PCRE_EXTENDED) != 0)
1529 {
1530 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1531 if (c == '#')
1532 {
1533 /* The space before the ; is to avoid a warning on a silly compiler
1534 on the Macintosh. */
1535 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1536 if (c != 0) continue; /* Else fall through to handle end of string */
1537 }
1538 }
1539
1540 switch(c)
1541 {
1542 /* The branch terminates at end of string, |, or ). */
1543
1544 case 0:
1545 case '|':
1546 case ')':
1547 *firstbyteptr = firstbyte;
1548 *reqbyteptr = reqbyte;
1549 *codeptr = code;
1550 *ptrptr = ptr;
1551 return TRUE;
1552
1553 /* Handle single-character metacharacters. In multiline mode, ^ disables
1554 the setting of any following char as a first character. */
1555
1556 case '^':
1557 if ((options & PCRE_MULTILINE) != 0)
1558 {
1559 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1560 }
1561 previous = NULL;
1562 *code++ = OP_CIRC;
1563 break;
1564
1565 case '$':
1566 previous = NULL;
1567 *code++ = OP_DOLL;
1568 break;
1569
1570 /* There can never be a first char if '.' is first, whatever happens about
1571 repeats. The value of reqbyte doesn't change either. */
1572
1573 case '.':
1574 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1575 zerofirstbyte = firstbyte;
1576 zeroreqbyte = reqbyte;
1577 previous = code;
1578 *code++ = OP_ANY;
1579 break;
1580
1581 /* Character classes. If the included characters are all < 255 in value, we
1582 build a 32-byte bitmap of the permitted characters, except in the special
1583 case where there is only one such character. For negated classes, we build
1584 the map as usual, then invert it at the end. However, we use a different
1585 opcode so that data characters > 255 can be handled correctly.
1586
1587 If the class contains characters outside the 0-255 range, a different
1588 opcode is compiled. It may optionally have a bit map for characters < 256,
1589 but those above are explicitly listed afterwards. A flag byte tells
1590 whether the bitmap is present, and whether this is a negated class or not.
1591 */
1592
1593 case '[':
1594 previous = code;
1595
1596 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1597 they are encountered at the top level, so we'll do that too. */
1598
1599 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1600 check_posix_syntax(ptr, &tempptr, cd))
1601 {
1602 *errorptr = (ptr[1] == ':')? ERR13 : ERR31;
1603 goto FAILED;
1604 }
1605
1606 /* If the first character is '^', set the negation flag and skip it. */
1607
1608 if ((c = *(++ptr)) == '^')
1609 {
1610 negate_class = TRUE;
1611 c = *(++ptr);
1612 }
1613 else
1614 {
1615 negate_class = FALSE;
1616 }
1617
1618 /* Keep a count of chars with values < 256 so that we can optimize the case
1619 of just a single character (as long as it's < 256). For higher valued UTF-8
1620 characters, we don't yet do any optimization. */
1621
1622 class_charcount = 0;
1623 class_lastchar = -1;
1624
1625 #ifdef SUPPORT_UTF8
1626 class_utf8 = FALSE; /* No chars >= 256 */
1627 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
1628 #endif
1629
1630 /* Initialize the 32-char bit map to all zeros. We have to build the
1631 map in a temporary bit of store, in case the class contains only 1
1632 character (< 256), because in that case the compiled code doesn't use the
1633 bit map. */
1634
1635 memset(class, 0, 32 * sizeof(uschar));
1636
1637 /* Process characters until ] is reached. By writing this as a "do" it
1638 means that an initial ] is taken as a data character. The first pass
1639 through the regex checked the overall syntax, so we don't need to be very
1640 strict here. At the start of the loop, c contains the first byte of the
1641 character. */
1642
1643 do
1644 {
1645 #ifdef SUPPORT_UTF8
1646 if (utf8 && c > 127) GETCHARLEN(c, ptr, ptr);
1647 #endif
1648
1649 /* Inside \Q...\E everything is literal except \E */
1650
1651 if (inescq)
1652 {
1653 if (c == '\\' && ptr[1] == 'E')
1654 {
1655 inescq = FALSE;
1656 ptr++;
1657 continue;
1658 }
1659 else goto LONE_SINGLE_CHARACTER;
1660 }
1661
1662 /* Handle POSIX class names. Perl allows a negation extension of the
1663 form [:^name:]. A square bracket that doesn't match the syntax is
1664 treated as a literal. We also recognize the POSIX constructions
1665 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1666 5.6 and 5.8 do. */
1667
1668 if (c == '[' &&
1669 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1670 check_posix_syntax(ptr, &tempptr, cd))
1671 {
1672 BOOL local_negate = FALSE;
1673 int posix_class, i;
1674 register const uschar *cbits = cd->cbits;
1675
1676 if (ptr[1] != ':')
1677 {
1678 *errorptr = ERR31;
1679 goto FAILED;
1680 }
1681
1682 ptr += 2;
1683 if (*ptr == '^')
1684 {
1685 local_negate = TRUE;
1686 ptr++;
1687 }
1688
1689 posix_class = check_posix_name(ptr, tempptr - ptr);
1690 if (posix_class < 0)
1691 {
1692 *errorptr = ERR30;
1693 goto FAILED;
1694 }
1695
1696 /* If matching is caseless, upper and lower are converted to
1697 alpha. This relies on the fact that the class table starts with
1698 alpha, lower, upper as the first 3 entries. */
1699
1700 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1701 posix_class = 0;
1702
1703 /* Or into the map we are building up to 3 of the static class
1704 tables, or their negations. The [:blank:] class sets up the same
1705 chars as the [:space:] class (all white space). We remove the vertical
1706 white space chars afterwards. */
1707
1708 posix_class *= 3;
1709 for (i = 0; i < 3; i++)
1710 {
1711 BOOL isblank = strncmp((char *)ptr, "blank", 5) == 0;
1712 int taboffset = posix_class_maps[posix_class + i];
1713 if (taboffset < 0) break;
1714 if (local_negate)
1715 {
1716 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1717 if (isblank) class[1] |= 0x3c;
1718 }
1719 else
1720 {
1721 for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1722 if (isblank) class[1] &= ~0x3c;
1723 }
1724 }
1725
1726 ptr = tempptr + 1;
1727 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1728 continue; /* End of POSIX syntax handling */
1729 }
1730
1731 /* Backslash may introduce a single character, or it may introduce one
1732 of the specials, which just set a flag. Escaped items are checked for
1733 validity in the pre-compiling pass. The sequence \b is a special case.
1734 Inside a class (and only there) it is treated as backspace. Elsewhere
1735 it marks a word boundary. Other escapes have preset maps ready to
1736 or into the one we are building. We assume they have more than one
1737 character in them, so set class_charcount bigger than one. */
1738
1739 if (c == '\\')
1740 {
1741 c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1742 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
1743
1744 if (-c == ESC_Q) /* Handle start of quoted string */
1745 {
1746 if (ptr[1] == '\\' && ptr[2] == 'E')
1747 {
1748 ptr += 2; /* avoid empty string */
1749 }
1750 else inescq = TRUE;
1751 continue;
1752 }
1753
1754 else if (c < 0)
1755 {
1756 register const uschar *cbits = cd->cbits;
1757 class_charcount = 10; /* Greater than 1 is what matters */
1758 switch (-c)
1759 {
1760 case ESC_d:
1761 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1762 continue;
1763
1764 case ESC_D:
1765 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1766 continue;
1767
1768 case ESC_w:
1769 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
1770 continue;
1771
1772 case ESC_W:
1773 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
1774 continue;
1775
1776 case ESC_s:
1777 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1778 class[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
1779 continue;
1780
1781 case ESC_S:
1782 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1783 class[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
1784 continue;
1785
1786 /* Unrecognized escapes are faulted if PCRE is running in its
1787 strict mode. By default, for compatibility with Perl, they are
1788 treated as literals. */
1789
1790 default:
1791 if ((options & PCRE_EXTRA) != 0)
1792 {
1793 *errorptr = ERR7;
1794 goto FAILED;
1795 }
1796 c = *ptr; /* The final character */
1797 }
1798 }
1799
1800 /* Fall through if we have a single character (c >= 0). This may be
1801 > 256 in UTF-8 mode. */
1802
1803 } /* End of backslash handling */
1804
1805 /* A single character may be followed by '-' to form a range. However,
1806 Perl does not permit ']' to be the end of the range. A '-' character
1807 here is treated as a literal. */
1808
1809 if (ptr[1] == '-' && ptr[2] != ']')
1810 {
1811 int d;
1812 ptr += 2;
1813
1814 #ifdef SUPPORT_UTF8
1815 if (utf8)
1816 { /* Braces are required because the */
1817 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
1818 }
1819 else
1820 #endif
1821 d = *ptr;
1822
1823 /* The second part of a range can be a single-character escape, but
1824 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1825 in such circumstances. */
1826
1827 if (d == '\\')
1828 {
1829 const uschar *oldptr = ptr;
1830 d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1831
1832 /* \b is backspace; any other special means the '-' was literal */
1833
1834 if (d < 0)
1835 {
1836 if (d == -ESC_b) d = '\b'; else
1837 {
1838 ptr = oldptr - 2;
1839 goto LONE_SINGLE_CHARACTER; /* A few lines below */
1840 }
1841 }
1842 }
1843
1844 /* Check that the two values are in the correct order */
1845
1846 if (d < c)
1847 {
1848 *errorptr = ERR8;
1849 goto FAILED;
1850 }
1851
1852 /* If d is greater than 255, we can't just use the bit map, so set up
1853 for the UTF-8 supporting class type. If we are not caseless, we can
1854 just set up a single range. If we are caseless, the characters < 256
1855 are handled with a bitmap, in order to get the case-insensitive
1856 handling. */
1857
1858 #ifdef SUPPORT_UTF8
1859 if (d > 255)
1860 {
1861 class_utf8 = TRUE;
1862 *class_utf8data++ = XCL_RANGE;
1863 if ((options & PCRE_CASELESS) == 0)
1864 {
1865 class_utf8data += ord2utf8(c, class_utf8data);
1866 class_utf8data += ord2utf8(d, class_utf8data);
1867 continue; /* Go get the next char in the class */
1868 }
1869 class_utf8data += ord2utf8(256, class_utf8data);
1870 class_utf8data += ord2utf8(d, class_utf8data);
1871 d = 255;
1872 /* Fall through */
1873 }
1874 #endif
1875 /* We use the bit map if the range is entirely < 255, or if part of it
1876 is < 255 and matching is caseless. */
1877
1878 for (; c <= d; c++)
1879 {
1880 class[c/8] |= (1 << (c&7));
1881 if ((options & PCRE_CASELESS) != 0)
1882 {
1883 int uc = cd->fcc[c]; /* flip case */
1884 class[uc/8] |= (1 << (uc&7));
1885 }
1886 class_charcount++; /* in case a one-char range */
1887 class_lastchar = c;
1888 }
1889
1890 continue; /* Go get the next char in the class */
1891 }
1892
1893 /* Handle a lone single character - we can get here for a normal
1894 non-escape char, or after \ that introduces a single character. */
1895
1896 LONE_SINGLE_CHARACTER:
1897
1898 /* Handle a multibyte character */
1899
1900 #ifdef SUPPORT_UTF8
1901 if (utf8 && c > 255)
1902 {
1903 class_utf8 = TRUE;
1904 *class_utf8data++ = XCL_SINGLE;
1905 class_utf8data += ord2utf8(c, class_utf8data);
1906 }
1907 else
1908 #endif
1909 /* Handle a single-byte character */
1910 {
1911 class [c/8] |= (1 << (c&7));
1912 if ((options & PCRE_CASELESS) != 0)
1913 {
1914 c = cd->fcc[c]; /* flip case */
1915 class[c/8] |= (1 << (c&7));
1916 }
1917 class_charcount++;
1918 class_lastchar = c;
1919 }
1920 }
1921
1922 /* Loop until ']' reached; the check for end of string happens inside the
1923 loop. This "while" is the end of the "do" above. */
1924
1925 while ((c = *(++ptr)) != ']' || inescq);
1926
1927 /* If class_charcount is 1, we saw precisely one character with a value <
1928 256. In UTF-8 mode, we can optimize if there were no characters >= 256 and
1929 the one character is < 128. In non-UTF-8 mode we can always optimize.
1930
1931 The optimization throws away the bit map. We turn the item into a
1932 1-character OP_CHARS if it's positive, or OP_NOT if it's negative. Note
1933 that OP_NOT does not support multibyte characters. In the positive case, it
1934 can cause firstbyte to be set. Otherwise, there can be no first char if
1935 this item is first, whatever repeat count may follow. In the case of
1936 reqbyte, save the previous value for reinstating. */
1937
1938 #ifdef SUPPORT_UTF8
1939 if (!class_utf8 && class_charcount == 1 && class_lastchar < 128)
1940 #else
1941 if (class_charcount == 1)
1942 #endif
1943 {
1944 zeroreqbyte = reqbyte;
1945 if (negate_class)
1946 {
1947 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1948 zerofirstbyte = firstbyte;
1949 *code++ = OP_NOT;
1950 }
1951 else
1952 {
1953 if (firstbyte == REQ_UNSET)
1954 {
1955 zerofirstbyte = REQ_NONE;
1956 firstbyte = class_lastchar | req_caseopt;
1957 }
1958 else
1959 {
1960 zerofirstbyte = firstbyte;
1961 reqbyte = class_lastchar | req_caseopt | cd->req_varyopt;
1962 }
1963 *code++ = OP_CHARS;
1964 *code++ = 1;
1965 }
1966 *code++ = class_lastchar;
1967 break; /* End of class handling */
1968 } /* End of 1-byte optimization */
1969
1970 /* Otherwise, if this is the first thing in the branch, there can be no
1971 first char setting, whatever the repeat count. Any reqbyte setting must
1972 remain unchanged after any kind of repeat. */
1973
1974 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1975 zerofirstbyte = firstbyte;
1976 zeroreqbyte = reqbyte;
1977
1978 /* If there are characters with values > 255, we have to compile an
1979 extended class, with its own opcode. If there are no characters < 256,
1980 we can omit the bitmap. */
1981
1982 #ifdef SUPPORT_UTF8
1983 if (class_utf8)
1984 {
1985 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
1986 *code++ = OP_XCLASS;
1987 code += LINK_SIZE;
1988 *code = negate_class? XCL_NOT : 0;
1989
1990 /* If the map is required, install it, and move on to the end of
1991 the extra data */
1992
1993 if (class_charcount > 0)
1994 {
1995 *code++ |= XCL_MAP;
1996 memcpy(code, class, 32);
1997 code = class_utf8data;
1998 }
1999
2000 /* If the map is not required, slide down the extra data. */
2001
2002 else
2003 {
2004 int len = class_utf8data - (code + 33);
2005 memmove(code + 1, code + 33, len);
2006 code += len + 1;
2007 }
2008
2009 /* Now fill in the complete length of the item */
2010
2011 PUT(previous, 1, code - previous);
2012 break; /* End of class handling */
2013 }
2014 #endif
2015
2016 /* If there are no characters > 255, negate the 32-byte map if necessary,
2017 and copy it into the code vector. If this is the first thing in the branch,
2018 there can be no first char setting, whatever the repeat count. Any reqbyte
2019 setting must remain unchanged after any kind of repeat. */
2020
2021 if (negate_class)
2022 {
2023 *code++ = OP_NCLASS;
2024 for (c = 0; c < 32; c++) code[c] = ~class[c];
2025 }
2026 else
2027 {
2028 *code++ = OP_CLASS;
2029 memcpy(code, class, 32);
2030 }
2031 code += 32;
2032 break;
2033
2034 /* Various kinds of repeat */
2035
2036 case '{':
2037 if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
2038 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
2039 if (*errorptr != NULL) goto FAILED;
2040 goto REPEAT;
2041
2042 case '*':
2043 repeat_min = 0;
2044 repeat_max = -1;
2045 goto REPEAT;
2046
2047 case '+':
2048 repeat_min = 1;
2049 repeat_max = -1;
2050 goto REPEAT;
2051
2052 case '?':
2053 repeat_min = 0;
2054 repeat_max = 1;
2055
2056 REPEAT:
2057 if (previous == NULL)
2058 {
2059 *errorptr = ERR9;
2060 goto FAILED;
2061 }
2062
2063 if (repeat_min == 0)
2064 {
2065 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2066 reqbyte = zeroreqbyte; /* Ditto */
2067 }
2068
2069 /* Remember whether this is a variable length repeat */
2070
2071 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2072
2073 op_type = 0; /* Default single-char op codes */
2074 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2075
2076 /* Save start of previous item, in case we have to move it up to make space
2077 for an inserted OP_ONCE for the additional '+' extension. */
2078
2079 tempcode = previous;
2080
2081 /* If the next character is '+', we have a possessive quantifier. This
2082 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2083 If the next character is '?' this is a minimizing repeat, by default,
2084 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2085 repeat type to the non-default. */
2086
2087 if (ptr[1] == '+')
2088 {
2089 repeat_type = 0; /* Force greedy */
2090 possessive_quantifier = TRUE;
2091 ptr++;
2092 }
2093 else if (ptr[1] == '?')
2094 {
2095 repeat_type = greedy_non_default;
2096 ptr++;
2097 }
2098 else repeat_type = greedy_default;
2099
2100 /* If previous was a recursion, we need to wrap it inside brackets so that
2101 it can be replicated if necessary. */
2102
2103 if (*previous == OP_RECURSE)
2104 {
2105 memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2106 code += 1 + LINK_SIZE;
2107 *previous = OP_BRA;
2108 PUT(previous, 1, code - previous);
2109 *code = OP_KET;
2110 PUT(code, 1, code - previous);
2111 code += 1 + LINK_SIZE;
2112 }
2113
2114 /* If previous was a string of characters, chop off the last one and use it
2115 as the subject of the repeat. If there was only one character, we can
2116 abolish the previous item altogether. If a one-char item has a minimum of
2117 more than one, ensure that it is set in reqbyte - it might not be if a
2118 sequence such as x{3} is the first thing in a branch because the x will
2119 have gone into firstbyte instead. */
2120
2121 if (*previous == OP_CHARS)
2122 {
2123 /* Deal with UTF-8 characters that take up more than one byte. It's
2124 easier to write this out separately than try to macrify it. Use c to
2125 hold the length of the character in bytes, plus 0x80 to flag that it's a
2126 length rather than a small character. */
2127
2128 #ifdef SUPPORT_UTF8
2129 if (utf8 && (code[-1] & 0x80) != 0)
2130 {
2131 uschar *lastchar = code - 1;
2132 while((*lastchar & 0xc0) == 0x80) lastchar--;
2133 c = code - lastchar; /* Length of UTF-8 character */
2134 memcpy(utf8_char, lastchar, c); /* Save the char */
2135 if (lastchar == previous + 2) /* There was only one character */
2136 {
2137 code = previous; /* Abolish the previous item */
2138 }
2139 else
2140 {
2141 previous[1] -= c; /* Adjust length of previous */
2142 code = lastchar; /* Lost char off the end */
2143 tempcode = code; /* Adjust position to be moved for '+' */
2144 }
2145 c |= 0x80; /* Flag c as a length */
2146 }
2147 else
2148 #endif
2149
2150 /* Handle the case of a single byte - either with no UTF8 support, or
2151 with UTF-8 disabled, or for a UTF-8 character < 128. */
2152
2153 {
2154 c = *(--code);
2155 if (code == previous + 2) /* There was only one character */
2156 {
2157 code = previous; /* Abolish the previous item */
2158 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2159 }
2160 else
2161 {
2162 previous[1]--; /* adjust length */
2163 tempcode = code; /* Adjust position to be moved for '+' */
2164 }
2165 }
2166
2167 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2168 }
2169
2170 /* If previous was a single negated character ([^a] or similar), we use
2171 one of the special opcodes, replacing it. The code is shared with single-
2172 character repeats by setting op_type to add a suitable offset into
2173 repeat_type. OP_NOT is currently used only for single-byte chars. */
2174
2175 else if (*previous == OP_NOT)
2176 {
2177 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2178 c = previous[1];
2179 code = previous;
2180 goto OUTPUT_SINGLE_REPEAT;
2181 }
2182
2183 /* If previous was a character type match (\d or similar), abolish it and
2184 create a suitable repeat item. The code is shared with single-character
2185 repeats by setting op_type to add a suitable offset into repeat_type. */
2186
2187 else if (*previous < OP_EODN)
2188 {
2189 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2190 c = *previous;
2191 code = previous;
2192
2193 OUTPUT_SINGLE_REPEAT:
2194
2195 /* If the maximum is zero then the minimum must also be zero; Perl allows
2196 this case, so we do too - by simply omitting the item altogether. */
2197
2198 if (repeat_max == 0) goto END_REPEAT;
2199
2200 /* Combine the op_type with the repeat_type */
2201
2202 repeat_type += op_type;
2203
2204 /* A minimum of zero is handled either as the special case * or ?, or as
2205 an UPTO, with the maximum given. */
2206
2207 if (repeat_min == 0)
2208 {
2209 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2210 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2211 else
2212 {
2213 *code++ = OP_UPTO + repeat_type;
2214 PUT2INC(code, 0, repeat_max);
2215 }
2216 }
2217
2218 /* The case {1,} is handled as the special case + */
2219
2220 else if (repeat_min == 1 && repeat_max == -1)
2221 *code++ = OP_PLUS + repeat_type;
2222
2223 /* The case {n,n} is just an EXACT, while the general case {n,m} is
2224 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
2225
2226 else
2227 {
2228 if (repeat_min != 1)
2229 {
2230 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
2231 PUT2INC(code, 0, repeat_min);
2232 }
2233
2234 /* If the minimum is 1 and the previous item was a character string,
2235 we either have to put back the item that got cancelled if the string
2236 length was 1, or add the character back onto the end of a longer
2237 string. For a character type nothing need be done; it will just get
2238 put back naturally. Note that the final character is always going to
2239 get added below, so we leave code ready for its insertion. */
2240
2241 else if (*previous == OP_CHARS)
2242 {
2243 if (code == previous) code += 2; else
2244
2245 /* In UTF-8 mode, a multibyte char has its length in c, with the 0x80
2246 bit set as a flag. The length will always be between 2 and 6. */
2247
2248 #ifdef SUPPORT_UTF8
2249 if (utf8 && c >= 128) previous[1] += c & 7; else
2250 #endif
2251 previous[1]++;
2252 }
2253
2254 /* For a single negated character we also have to put back the
2255 item that got cancelled. At present this applies only to single byte
2256 characters in any mode. */
2257
2258 else if (*previous == OP_NOT) code++;
2259
2260 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2261 we have to insert the character for the previous code. In UTF-8 mode,
2262 long characters have their length in c, with the 0x80 bit as a flag. */
2263
2264 if (repeat_max < 0)
2265 {
2266 #ifdef SUPPORT_UTF8
2267 if (utf8 && c >= 128)
2268 {
2269 memcpy(code, utf8_char, c & 7);
2270 code += c & 7;
2271 }
2272 else
2273 #endif
2274 *code++ = c;
2275 *code++ = OP_STAR + repeat_type;
2276 }
2277
2278 /* Else insert an UPTO if the max is greater than the min, again
2279 preceded by the character, for the previously inserted code. */
2280
2281 else if (repeat_max != repeat_min)
2282 {
2283 #ifdef SUPPORT_UTF8
2284 if (utf8 && c >= 128)
2285 {
2286 memcpy(code, utf8_char, c & 7);
2287 code += c & 7;
2288 }
2289 else
2290 #endif
2291 *code++ = c;
2292 repeat_max -= repeat_min;
2293 *code++ = OP_UPTO + repeat_type;
2294 PUT2INC(code, 0, repeat_max);
2295 }
2296 }
2297
2298 /* The character or character type itself comes last in all cases. */
2299
2300 #ifdef SUPPORT_UTF8
2301 if (utf8 && c >= 128)
2302 {
2303 memcpy(code, utf8_char, c & 7);
2304 code += c & 7;
2305 }
2306 else
2307 #endif
2308
2309 *code++ = c;
2310 }
2311
2312 /* If previous was a character class or a back reference, we put the repeat
2313 stuff after it, but just skip the item if the repeat was {0,0}. */
2314
2315 else if (*previous == OP_CLASS ||
2316 *previous == OP_NCLASS ||
2317 #ifdef SUPPORT_UTF8
2318 *previous == OP_XCLASS ||
2319 #endif
2320 *previous == OP_REF)
2321 {
2322 if (repeat_max == 0)
2323 {
2324 code = previous;
2325 goto END_REPEAT;
2326 }
2327 if (repeat_min == 0 && repeat_max == -1)
2328 *code++ = OP_CRSTAR + repeat_type;
2329 else if (repeat_min == 1 && repeat_max == -1)
2330 *code++ = OP_CRPLUS + repeat_type;
2331 else if (repeat_min == 0 && repeat_max == 1)
2332 *code++ = OP_CRQUERY + repeat_type;
2333 else
2334 {
2335 *code++ = OP_CRRANGE + repeat_type;
2336 PUT2INC(code, 0, repeat_min);
2337 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
2338 PUT2INC(code, 0, repeat_max);
2339 }
2340 }
2341
2342 /* If previous was a bracket group, we may have to replicate it in certain
2343 cases. */
2344
2345 else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2346 *previous == OP_COND)
2347 {
2348 register int i;
2349 int ketoffset = 0;
2350 int len = code - previous;
2351 uschar *bralink = NULL;
2352
2353 /* If the maximum repeat count is unlimited, find the end of the bracket
2354 by scanning through from the start, and compute the offset back to it
2355 from the current code pointer. There may be an OP_OPT setting following
2356 the final KET, so we can't find the end just by going back from the code
2357 pointer. */
2358
2359 if (repeat_max == -1)
2360 {
2361 register uschar *ket = previous;
2362 do ket += GET(ket, 1); while (*ket != OP_KET);
2363 ketoffset = code - ket;
2364 }
2365
2366 /* The case of a zero minimum is special because of the need to stick
2367 OP_BRAZERO in front of it, and because the group appears once in the
2368 data, whereas in other cases it appears the minimum number of times. For
2369 this reason, it is simplest to treat this case separately, as otherwise
2370 the code gets far too messy. There are several special subcases when the
2371 minimum is zero. */
2372
2373 if (repeat_min == 0)
2374 {
2375 /* If the maximum is also zero, we just omit the group from the output
2376 altogether. */
2377
2378 if (repeat_max == 0)
2379 {
2380 code = previous;
2381 goto END_REPEAT;
2382 }
2383
2384 /* If the maximum is 1 or unlimited, we just have to stick in the
2385 BRAZERO and do no more at this point. */
2386
2387 if (repeat_max <= 1)
2388 {
2389 memmove(previous+1, previous, len);
2390 code++;
2391 *previous++ = OP_BRAZERO + repeat_type;
2392 }
2393
2394 /* If the maximum is greater than 1 and limited, we have to replicate
2395 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
2396 The first one has to be handled carefully because it's the original
2397 copy, which has to be moved up. The remainder can be handled by code
2398 that is common with the non-zero minimum case below. We just have to
2399 adjust the value of repeat_max, since one less copy is required. */
2400
2401 else
2402 {
2403 int offset;
2404 memmove(previous + 2 + LINK_SIZE, previous, len);
2405 code += 2 + LINK_SIZE;
2406 *previous++ = OP_BRAZERO + repeat_type;
2407 *previous++ = OP_BRA;
2408
2409 /* We chain together the bracket offset fields that have to be
2410 filled in later when the ends of the brackets are reached. */
2411
2412 offset = (bralink == NULL)? 0 : previous - bralink;
2413 bralink = previous;
2414 PUTINC(previous, 0, offset);
2415 }
2416
2417 repeat_max--;
2418 }
2419
2420 /* If the minimum is greater than zero, replicate the group as many
2421 times as necessary, and adjust the maximum to the number of subsequent
2422 copies that we need. If we set a first char from the group, and didn't
2423 set a required char, copy the latter from the former. */
2424
2425 else
2426 {
2427 if (repeat_min > 1)
2428 {
2429 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2430 for (i = 1; i < repeat_min; i++)
2431 {
2432 memcpy(code, previous, len);
2433 code += len;
2434 }
2435 }
2436 if (repeat_max > 0) repeat_max -= repeat_min;
2437 }
2438
2439 /* This code is common to both the zero and non-zero minimum cases. If
2440 the maximum is limited, it replicates the group in a nested fashion,
2441 remembering the bracket starts on a stack. In the case of a zero minimum,
2442 the first one was set up above. In all cases the repeat_max now specifies
2443 the number of additional copies needed. */
2444
2445 if (repeat_max >= 0)
2446 {
2447 for (i = repeat_max - 1; i >= 0; i--)
2448 {
2449 *code++ = OP_BRAZERO + repeat_type;
2450
2451 /* All but the final copy start a new nesting, maintaining the
2452 chain of brackets outstanding. */
2453
2454 if (i != 0)
2455 {
2456 int offset;
2457 *code++ = OP_BRA;
2458 offset = (bralink == NULL)? 0 : code - bralink;
2459 bralink = code;
2460 PUTINC(code, 0, offset);
2461 }
2462
2463 memcpy(code, previous, len);
2464 code += len;
2465 }
2466
2467 /* Now chain through the pending brackets, and fill in their length
2468 fields (which are holding the chain links pro tem). */
2469
2470 while (bralink != NULL)
2471 {
2472 int oldlinkoffset;
2473 int offset = code - bralink + 1;
2474 uschar *bra = code - offset;
2475 oldlinkoffset = GET(bra, 1);
2476 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2477 *code++ = OP_KET;
2478 PUTINC(code, 0, offset);
2479 PUT(bra, 1, offset);
2480 }
2481 }
2482
2483 /* If the maximum is unlimited, set a repeater in the final copy. We
2484 can't just offset backwards from the current code point, because we
2485 don't know if there's been an options resetting after the ket. The
2486 correct offset was computed above. */
2487
2488 else code[-ketoffset] = OP_KETRMAX + repeat_type;
2489 }
2490
2491 /* Else there's some kind of shambles */
2492
2493 else
2494 {
2495 *errorptr = ERR11;
2496 goto FAILED;
2497 }
2498
2499 /* If the character following a repeat is '+', we wrap the entire repeated
2500 item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2501 Sun's Java package. The repeated item starts at tempcode, not at previous,
2502 which might be the first part of a string whose (former) last char we
2503 repeated. However, we don't support '+' after a greediness '?'. */
2504
2505 if (possessive_quantifier)
2506 {
2507 int len = code - tempcode;
2508 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2509 code += 1 + LINK_SIZE;
2510 len += 1 + LINK_SIZE;
2511 tempcode[0] = OP_ONCE;
2512 *code++ = OP_KET;
2513 PUTINC(code, 0, len);
2514 PUT(tempcode, 1, len);
2515 }
2516
2517 /* In all cases we no longer have a previous item. We also set the
2518 "follows varying string" flag for subsequently encountered reqbytes if
2519 it isn't already set and we have just passed a varying length item. */
2520
2521 END_REPEAT:
2522 previous = NULL;
2523 cd->req_varyopt |= reqvary;
2524 break;
2525
2526
2527 /* Start of nested bracket sub-expression, or comment or lookahead or
2528 lookbehind or option setting or condition. First deal with special things
2529 that can come after a bracket; all are introduced by ?, and the appearance
2530 of any of them means that this is not a referencing group. They were
2531 checked for validity in the first pass over the string, so we don't have to
2532 check for syntax errors here. */
2533
2534 case '(':
2535 newoptions = options;
2536 skipbytes = 0;
2537
2538 if (*(++ptr) == '?')
2539 {
2540 int set, unset;
2541 int *optset;
2542
2543 switch (*(++ptr))
2544 {
2545 case '#': /* Comment; skip to ket */
2546 ptr++;
2547 while (*ptr != ')') ptr++;
2548 continue;
2549
2550 case ':': /* Non-extracting bracket */
2551 bravalue = OP_BRA;
2552 ptr++;
2553 break;
2554
2555 case '(':
2556 bravalue = OP_COND; /* Conditional group */
2557
2558 /* Condition to test for recursion */
2559
2560 if (ptr[1] == 'R')
2561 {
2562 code[1+LINK_SIZE] = OP_CREF;
2563 PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2564 skipbytes = 3;
2565 ptr += 3;
2566 }
2567
2568 /* Condition to test for a numbered subpattern match */
2569
2570 else if ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
2571 {
2572 int condref; /* Don't amalgamate; some compilers */
2573 condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
2574 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
2575 if (condref == 0)
2576 {
2577 *errorptr = ERR35;
2578 goto FAILED;
2579 }
2580 ptr++;
2581 code[1+LINK_SIZE] = OP_CREF;
2582 PUT2(code, 2+LINK_SIZE, condref);
2583 skipbytes = 3;
2584 }
2585 /* For conditions that are assertions, we just fall through, having
2586 set bravalue above. */
2587 break;
2588
2589 case '=': /* Positive lookahead */
2590 bravalue = OP_ASSERT;
2591 ptr++;
2592 break;
2593
2594 case '!': /* Negative lookahead */
2595 bravalue = OP_ASSERT_NOT;
2596 ptr++;
2597 break;
2598
2599 case '<': /* Lookbehinds */
2600 switch (*(++ptr))
2601 {
2602 case '=': /* Positive lookbehind */
2603 bravalue = OP_ASSERTBACK;
2604 ptr++;
2605 break;
2606
2607 case '!': /* Negative lookbehind */
2608 bravalue = OP_ASSERTBACK_NOT;
2609 ptr++;
2610 break;
2611 }
2612 break;
2613
2614 case '>': /* One-time brackets */
2615 bravalue = OP_ONCE;
2616 ptr++;
2617 break;
2618
2619 case 'C': /* Callout - may be followed by digits */
2620 *code++ = OP_CALLOUT;
2621 {
2622 int n = 0;
2623 while ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
2624 n = n * 10 + *ptr - '0';
2625 if (n > 255)
2626 {
2627 *errorptr = ERR38;
2628 goto FAILED;
2629 }
2630 *code++ = n;
2631 }
2632 previous = NULL;
2633 continue;
2634
2635 case 'P': /* Named subpattern handling */
2636 if (*(++ptr) == '<') /* Definition */
2637 {
2638 int i, namelen;
2639 uschar *slot = cd->name_table;
2640 const uschar *name; /* Don't amalgamate; some compilers */
2641 name = ++ptr; /* grumble at autoincrement in declaration */
2642
2643 while (*ptr++ != '>');
2644 namelen = ptr - name - 1;
2645
2646 for (i = 0; i < cd->names_found; i++)
2647 {
2648 int c = memcmp(name, slot+2, namelen);
2649 if (c == 0)
2650 {
2651 if (slot[2+namelen] == 0)
2652 {
2653 *errorptr = ERR43;
2654 goto FAILED;
2655 }
2656 c = -1; /* Current name is substring */
2657 }
2658 if (c < 0)
2659 {
2660 memmove(slot + cd->name_entry_size, slot,
2661 (cd->names_found - i) * cd->name_entry_size);
2662 break;
2663 }
2664 slot += cd->name_entry_size;
2665 }
2666
2667 PUT2(slot, 0, *brackets + 1);
2668 memcpy(slot + 2, name, namelen);
2669 slot[2+namelen] = 0;
2670 cd->names_found++;
2671 goto NUMBERED_GROUP;
2672 }
2673
2674 if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
2675 {
2676 int i, namelen;
2677 int type = *ptr++;
2678 const uschar *name = ptr;
2679 uschar *slot = cd->name_table;
2680
2681 while (*ptr != ')') ptr++;
2682 namelen = ptr - name;
2683
2684 for (i = 0; i < cd->names_found; i++)
2685 {
2686 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
2687 slot += cd->name_entry_size;
2688 }
2689 if (i >= cd->names_found)
2690 {
2691 *errorptr = ERR15;
2692 goto FAILED;
2693 }
2694
2695 recno = GET2(slot, 0);
2696
2697 if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
2698
2699 /* Back reference */
2700
2701 previous = code;
2702 *code++ = OP_REF;
2703 PUT2INC(code, 0, recno);
2704 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
2705 if (recno > cd->top_backref) cd->top_backref = recno;
2706 continue;
2707 }
2708
2709 /* Should never happen */
2710 break;
2711
2712 case 'R': /* Pattern recursion */
2713 ptr++; /* Same as (?0) */
2714 /* Fall through */
2715
2716 /* Recursion or "subroutine" call */
2717
2718 case '0': case '1': case '2': case '3': case '4':
2719 case '5': case '6': case '7': case '8': case '9':
2720 {
2721 const uschar *called;
2722 recno = 0;
2723
2724 while ((cd->ctypes[*ptr] & ctype_digit) != 0)
2725 recno = recno * 10 + *ptr++ - '0';
2726
2727 /* Come here from code above that handles a named recursion */
2728
2729 HANDLE_RECURSION:
2730
2731 previous = code;
2732
2733 /* Find the bracket that is being referenced. Temporarily end the
2734 regex in case it doesn't exist. */
2735
2736 *code = OP_END;
2737 called = (recno == 0)?
2738 cd->start_code : find_bracket(cd->start_code, utf8, recno);
2739
2740 if (called == NULL)
2741 {
2742 *errorptr = ERR15;
2743 goto FAILED;
2744 }
2745
2746 /* If the subpattern is still open, this is a recursive call. We
2747 check to see if this is a left recursion that could loop for ever,
2748 and diagnose that case. */
2749
2750 if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
2751 {
2752 *errorptr = ERR40;
2753 goto FAILED;
2754 }
2755
2756 /* Insert the recursion/subroutine item */
2757
2758 *code = OP_RECURSE;
2759 PUT(code, 1, called - cd->start_code);
2760 code += 1 + LINK_SIZE;
2761 }
2762 continue;
2763
2764 /* Character after (? not specially recognized */
2765
2766 default: /* Option setting */
2767 set = unset = 0;
2768 optset = &set;
2769
2770 while (*ptr != ')' && *ptr != ':')
2771 {
2772 switch (*ptr++)
2773 {
2774 case '-': optset = &unset; break;
2775
2776 case 'i': *optset |= PCRE_CASELESS; break;
2777 case 'm': *optset |= PCRE_MULTILINE; break;
2778 case 's': *optset |= PCRE_DOTALL; break;
2779 case 'x': *optset |= PCRE_EXTENDED; break;
2780 case 'U': *optset |= PCRE_UNGREEDY; break;
2781 case 'X': *optset |= PCRE_EXTRA; break;
2782 }
2783 }
2784
2785 /* Set up the changed option bits, but don't change anything yet. */
2786
2787 newoptions = (options | set) & (~unset);
2788
2789 /* If the options ended with ')' this is not the start of a nested
2790 group with option changes, so the options change at this level. Compile
2791 code to change the ims options if this setting actually changes any of
2792 them. We also pass the new setting back so that it can be put at the
2793 start of any following branches, and when this group ends (if we are in
2794 a group), a resetting item can be compiled.
2795
2796 Note that if this item is right at the start of the pattern, the
2797 options will have been abstracted and made global, so there will be no
2798 change to compile. */
2799
2800 if (*ptr == ')')
2801 {
2802 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
2803 {
2804 *code++ = OP_OPT;
2805 *code++ = newoptions & PCRE_IMS;
2806 }
2807
2808 /* Change options at this level, and pass them back for use
2809 in subsequent branches. Reset the greedy defaults and the case
2810 value for firstbyte and reqbyte. */
2811
2812 *optionsptr = options = newoptions;
2813 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
2814 greedy_non_default = greedy_default ^ 1;
2815 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2816
2817 previous = NULL; /* This item can't be repeated */
2818 continue; /* It is complete */
2819 }
2820
2821 /* If the options ended with ':' we are heading into a nested group
2822 with possible change of options. Such groups are non-capturing and are
2823 not assertions of any kind. All we need to do is skip over the ':';
2824 the newoptions value is handled below. */
2825
2826 bravalue = OP_BRA;
2827 ptr++;
2828 }
2829 }
2830
2831 /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
2832 non-capturing and behave like (?:...) brackets */
2833
2834 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
2835 {
2836 bravalue = OP_BRA;
2837 }
2838
2839 /* Else we have a referencing group; adjust the opcode. If the bracket
2840 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
2841 arrange for the true number to follow later, in an OP_BRANUMBER item. */
2842
2843 else
2844 {
2845 NUMBERED_GROUP:
2846 if (++(*brackets) > EXTRACT_BASIC_MAX)
2847 {
2848 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
2849 code[1+LINK_SIZE] = OP_BRANUMBER;
2850 PUT2(code, 2+LINK_SIZE, *brackets);
2851 skipbytes = 3;
2852 }
2853 else bravalue = OP_BRA + *brackets;
2854 }
2855
2856 /* Process nested bracketed re. Assertions may not be repeated, but other
2857 kinds can be. We copy code into a non-register variable in order to be able
2858 to pass its address because some compilers complain otherwise. Pass in a
2859 new setting for the ims options if they have changed. */
2860
2861 previous = (bravalue >= OP_ONCE)? code : NULL;
2862 *code = bravalue;
2863 tempcode = code;
2864 tempreqvary = cd->req_varyopt; /* Save value before bracket */
2865
2866 if (!compile_regex(
2867 newoptions, /* The complete new option state */
2868 options & PCRE_IMS, /* The previous ims option state */
2869 brackets, /* Extracting bracket count */
2870 &tempcode, /* Where to put code (updated) */
2871 &ptr, /* Input pointer (updated) */
2872 errorptr, /* Where to put an error message */
2873 (bravalue == OP_ASSERTBACK ||
2874 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
2875 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
2876 &subfirstbyte, /* For possible first char */
2877 &subreqbyte, /* For possible last char */
2878 bcptr, /* Current branch chain */
2879 cd)) /* Tables block */
2880 goto FAILED;
2881
2882 /* At the end of compiling, code is still pointing to the start of the
2883 group, while tempcode has been updated to point past the end of the group
2884 and any option resetting that may follow it. The pattern pointer (ptr)
2885 is on the bracket. */
2886
2887 /* If this is a conditional bracket, check that there are no more than
2888 two branches in the group. */
2889
2890 else if (bravalue == OP_COND)
2891 {
2892 uschar *tc = code;
2893 condcount = 0;
2894
2895 do {
2896 condcount++;
2897 tc += GET(tc,1);
2898 }
2899 while (*tc != OP_KET);
2900
2901 if (condcount > 2)
2902 {
2903 *errorptr = ERR27;
2904 goto FAILED;
2905 }
2906
2907 /* If there is just one branch, we must not make use of its firstbyte or
2908 reqbyte, because this is equivalent to an empty second branch. */
2909
2910 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
2911 }
2912
2913 /* Handle updating of the required and first characters. Update for normal
2914 brackets of all kinds, and conditions with two branches (see code above).
2915 If the bracket is followed by a quantifier with zero repeat, we have to
2916 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
2917 main loop so that they can be accessed for the back off. */
2918
2919 zeroreqbyte = reqbyte;
2920 zerofirstbyte = firstbyte;
2921 groupsetfirstbyte = FALSE;
2922
2923 if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
2924 {
2925 /* If we have not yet set a firstbyte in this branch, take it from the
2926 subpattern, remembering that it was set here so that a repeat of more
2927 than one can replicate it as reqbyte if necessary. If the subpattern has
2928 no firstbyte, set "none" for the whole branch. In both cases, a zero
2929 repeat forces firstbyte to "none". */
2930
2931 if (firstbyte == REQ_UNSET)
2932 {
2933 if (subfirstbyte >= 0)
2934 {
2935 firstbyte = subfirstbyte;
2936 groupsetfirstbyte = TRUE;
2937 }
2938 else firstbyte = REQ_NONE;
2939 zerofirstbyte = REQ_NONE;
2940 }
2941
2942 /* If firstbyte was previously set, convert the subpattern's firstbyte
2943 into reqbyte if there wasn't one, using the vary flag that was in
2944 existence beforehand. */
2945
2946 else if (subfirstbyte >= 0 && subreqbyte < 0)
2947 subreqbyte = subfirstbyte | tempreqvary;
2948
2949 /* If the subpattern set a required byte (or set a first byte that isn't
2950 really the first byte - see above), set it. */
2951
2952 if (subreqbyte >= 0) reqbyte = subreqbyte;
2953 }
2954
2955 /* For a forward assertion, we take the reqbyte, if set. This can be
2956 helpful if the pattern that follows the assertion doesn't set a different
2957 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
2958 for an assertion, however, because it leads to incorrect results for patterns
2959 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
2960 of a firstbyte. This is overcome by a scan at the end if there's no
2961 firstbyte, looking for an asserted first char. */
2962
2963 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
2964
2965 /* Now update the main code pointer to the end of the group. */
2966
2967 code = tempcode;
2968
2969 /* Error if hit end of pattern */
2970
2971 if (*ptr != ')')
2972 {
2973 *errorptr = ERR14;
2974 goto FAILED;
2975 }
2976 break;
2977
2978 /* Check \ for being a real metacharacter; if not, fall through and handle
2979 it as a data character at the start of a string. Escape items are checked
2980 for validity in the pre-compiling pass. */
2981
2982 case '\\':
2983 tempptr = ptr;
2984 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
2985
2986 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
2987 are arranged to be the negation of the corresponding OP_values. For the
2988 back references, the values are ESC_REF plus the reference number. Only
2989 back references and those types that consume a character may be repeated.
2990 We can test for values between ESC_b and ESC_Z for the latter; this may
2991 have to change if any new ones are ever created. */
2992
2993 if (c < 0)
2994 {
2995 if (-c == ESC_Q) /* Handle start of quoted string */
2996 {
2997 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
2998 else inescq = TRUE;
2999 continue;
3000 }
3001
3002 /* For metasequences that actually match a character, we disable the
3003 setting of a first character if it hasn't already been set. */
3004
3005 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3006 firstbyte = REQ_NONE;
3007
3008 /* Set values to reset to if this is followed by a zero repeat. */
3009
3010 zerofirstbyte = firstbyte;
3011 zeroreqbyte = reqbyte;
3012
3013 /* Back references are handled specially */
3014
3015 if (-c >= ESC_REF)
3016 {
3017 int number = -c - ESC_REF;
3018 previous = code;
3019 *code++ = OP_REF;
3020 PUT2INC(code, 0, number);
3021 }
3022 else
3023 {
3024 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3025 *code++ = -c;
3026 }
3027 continue;
3028 }
3029
3030 /* Data character: reset and fall through */
3031
3032 ptr = tempptr;
3033 c = '\\';
3034
3035 /* Handle a run of data characters until a metacharacter is encountered.
3036 The first character is guaranteed not to be whitespace or # when the
3037 extended flag is set. */
3038
3039 NORMAL_CHAR:
3040 default:
3041 previous = code;
3042 *code = OP_CHARS;
3043 code += 2;
3044 length = 0;
3045
3046 do
3047 {
3048 /* If in \Q...\E, check for the end; if not, we always have a literal */
3049
3050 if (inescq)
3051 {
3052 if (c == '\\' && ptr[1] == 'E')
3053 {
3054 inescq = FALSE;
3055 ptr++;
3056 }
3057 else
3058 {
3059 *code++ = c;
3060 length++;
3061 }
3062 continue;
3063 }
3064
3065 /* Skip white space and comments for /x patterns */
3066
3067 if ((options & PCRE_EXTENDED) != 0)
3068 {
3069 if ((cd->ctypes[c] & ctype_space) != 0) continue;
3070 if (c == '#')
3071 {
3072 /* The space before the ; is to avoid a warning on a silly compiler
3073 on the Macintosh. */
3074 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3075 if (c == 0) break;
3076 continue;
3077 }
3078 }
3079
3080 /* Backslash may introduce a data char or a metacharacter. Escaped items
3081 are checked for validity in the pre-compiling pass. Stop the string
3082 before a metaitem. */
3083
3084 if (c == '\\')
3085 {
3086 tempptr = ptr;
3087 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
3088 if (c < 0) { ptr = tempptr; break; }
3089
3090 /* If a character is > 127 in UTF-8 mode, we have to turn it into
3091 two or more characters in the UTF-8 encoding. */
3092
3093 #ifdef SUPPORT_UTF8
3094 if (utf8 && c > 127)
3095 {
3096 uschar buffer[8];
3097 int len = ord2utf8(c, buffer);
3098 for (c = 0; c < len; c++) *code++ = buffer[c];
3099 length += len;
3100 continue;
3101 }
3102 #endif
3103 }
3104
3105 /* Ordinary character or single-char escape */
3106
3107 *code++ = c;
3108 length++;
3109 }
3110
3111 /* This "while" is the end of the "do" above. */
3112
3113 while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
3114
3115 /* Update the first and last requirements. These are always bytes, even in
3116 UTF-8 mode. However, there is a special case to be considered when there
3117 are only one or two characters. Because this gets messy in UTF-8 mode, the
3118 code is kept separate. When we get here "length" contains the number of
3119 bytes. */
3120
3121 #ifdef SUPPORT_UTF8
3122 if (utf8 && length > 1)
3123 {
3124 uschar *t = previous + 3; /* After this code, t */
3125 while (t < code && (*t & 0xc0) == 0x80) t++; /* follows the 1st char */
3126
3127 /* Handle the case when there is only one multibyte character. It must
3128 have at least two bytes because of the "length > 1" test above. */
3129
3130 if (t == code)
3131 {
3132 /* If no previous first byte, set it from this character, but revert to
3133 none on a zero repeat. */
3134
3135 if (firstbyte == REQ_UNSET)
3136 {
3137 zerofirstbyte = REQ_NONE;
3138 firstbyte = previous[2];
3139 }
3140
3141 /* Otherwise, leave the first byte value alone, and don't change it on
3142 a zero repeat */
3143
3144 else zerofirstbyte = firstbyte;
3145
3146 /* In both cases, a zero repeat resets the previous required byte */
3147
3148 zeroreqbyte = reqbyte;
3149 }
3150
3151 /* Handle the case when there is more than one character. These may be
3152 single-byte or multibyte characters */
3153
3154 else
3155 {
3156 uschar *t = code - 1; /* After this code, t is at the */
3157 while ((*t & 0xc0) == 0x80) t--; /* start of the last character */
3158
3159 /* If no previous first byte, set it from the first character, and
3160 retain it on a zero repeat (of the last character). The required byte
3161 is reset on a zero repeat to the byte before the last
3162 character, unless this is the first byte of the string. In that case,
3163 it reverts to its previous value. */
3164
3165 if (firstbyte == REQ_UNSET)
3166 {
3167 zerofirstbyte = firstbyte = previous[2] | req_caseopt;
3168 zeroreqbyte = (t - 1 == previous + 2)?
3169 reqbyte : t[-1] | req_caseopt | cd->req_varyopt;
3170 }
3171
3172 /* If there was a previous first byte, leave it alone, and don't change
3173 it on a zero repeat. The required byte is reset on a zero repeat to the
3174 byte before the last character. */
3175
3176 else
3177 {
3178 zerofirstbyte = firstbyte;
3179 zeroreqbyte = t[-1] | req_caseopt | cd->req_varyopt;
3180 }
3181 }
3182
3183 /* In all cases (we know length > 1), the new required byte is the last
3184 byte of the string. */
3185
3186 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3187 }
3188
3189 else /* End of UTF-8 coding */
3190 #endif
3191
3192 /* This is the code for non-UTF-8 operation, either without UTF-8 support,
3193 or when UTF-8 is not enabled. */
3194
3195 {
3196 /* firstbyte was not previously set; take it from this string */
3197
3198 if (firstbyte == REQ_UNSET)
3199 {
3200 if (length == 1)
3201 {
3202 zerofirstbyte = REQ_NONE;
3203 firstbyte = previous[2] | req_caseopt;
3204 zeroreqbyte = reqbyte;
3205 }
3206 else
3207 {
3208 zerofirstbyte = firstbyte = previous[2] | req_caseopt;
3209 zeroreqbyte = (length > 2)?
3210 (code[-2] | req_caseopt | cd->req_varyopt) : reqbyte;
3211 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3212 }
3213 }
3214
3215 /* firstbyte was previously set */
3216
3217 else
3218 {
3219 zerofirstbyte = firstbyte;
3220 zeroreqbyte = (length == 1)? reqbyte :
3221 code[-2] | req_caseopt | cd->req_varyopt;
3222 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3223 }
3224 }
3225
3226 /* Set the length in the data vector, and advance to the next state. */
3227
3228 previous[1] = length;
3229 if (length < MAXLIT) ptr--;
3230 break;
3231 }
3232 } /* end of big loop */
3233
3234 /* Control never reaches here by falling through, only by a goto for all the
3235 error states. Pass back the position in the pattern so that it can be displayed
3236 to the user for diagnosing the error. */
3237
3238 FAILED:
3239 *ptrptr = ptr;
3240 return FALSE;
3241 }
3242
3243
3244
3245
3246 /*************************************************
3247 * Compile sequence of alternatives *
3248 *************************************************/
3249
3250 /* On entry, ptr is pointing past the bracket character, but on return
3251 it points to the closing bracket, or vertical bar, or end of string.
3252 The code variable is pointing at the byte into which the BRA operator has been
3253 stored. If the ims options are changed at the start (for a (?ims: group) or
3254 during any branch, we need to insert an OP_OPT item at the start of every
3255 following branch to ensure they get set correctly at run time, and also pass
3256 the new options into every subsequent branch compile.
3257
3258 Argument:
3259 options option bits, including any changes for this subpattern
3260 oldims previous settings of ims option bits
3261 brackets -> int containing the number of extracting brackets used
3262 codeptr -> the address of the current code pointer
3263 ptrptr -> the address of the current pattern pointer
3264 errorptr -> pointer to error message
3265 lookbehind TRUE if this is a lookbehind assertion
3266 skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3267 firstbyteptr place to put the first required character, or a negative number
3268 reqbyteptr place to put the last required character, or a negative number
3269 bcptr pointer to the chain of currently open branches
3270 cd points to the data block with tables pointers etc.
3271
3272 Returns: TRUE on success
3273 */
3274
3275 static BOOL
3276 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3277 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
3278 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3279 {
3280 const uschar *ptr = *ptrptr;
3281 uschar *code = *codeptr;
3282 uschar *last_branch = code;
3283 uschar *start_bracket = code;
3284 uschar *reverse_count = NULL;
3285 int firstbyte, reqbyte;
3286 int branchfirstbyte, branchreqbyte;
3287 branch_chain bc;
3288
3289 bc.outer = bcptr;
3290 bc.current = code;
3291
3292 firstbyte = reqbyte = REQ_UNSET;
3293
3294 /* Offset is set zero to mark that this bracket is still open */
3295
3296 PUT(code, 1, 0);
3297 code += 1 + LINK_SIZE + skipbytes;
3298
3299 /* Loop for each alternative branch */
3300
3301 for (;;)
3302 {
3303 /* Handle a change of ims options at the start of the branch */
3304
3305 if ((options & PCRE_IMS) != oldims)
3306 {
3307 *code++ = OP_OPT;
3308 *code++ = options & PCRE_IMS;
3309 }
3310
3311 /* Set up dummy OP_REVERSE if lookbehind assertion */
3312
3313 if (lookbehind)
3314 {
3315 *code++ = OP_REVERSE;
3316 reverse_count = code;
3317 PUTINC(code, 0, 0);
3318 }
3319
3320 /* Now compile the branch */
3321
3322 if (!compile_branch(&options, brackets, &code, &ptr, errorptr,
3323 &branchfirstbyte, &branchreqbyte, &bc, cd))
3324 {
3325 *ptrptr = ptr;
3326 return FALSE;
3327 }
3328
3329 /* If this is the first branch, the firstbyte and reqbyte values for the
3330 branch become the values for the regex. */
3331
3332 if (*last_branch != OP_ALT)
3333 {
3334 firstbyte = branchfirstbyte;
3335 reqbyte = branchreqbyte;
3336 }
3337
3338 /* If this is not the first branch, the first char and reqbyte have to
3339 match the values from all the previous branches, except that if the previous
3340 value for reqbyte didn't have REQ_VARY set, it can still match, and we set
3341 REQ_VARY for the regex. */
3342
3343 else
3344 {
3345 /* If we previously had a firstbyte, but it doesn't match the new branch,
3346 we have to abandon the firstbyte for the regex, but if there was previously
3347 no reqbyte, it takes on the value of the old firstbyte. */
3348
3349 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3350 {
3351 if (reqbyte < 0) reqbyte = firstbyte;
3352 firstbyte = REQ_NONE;
3353 }
3354
3355 /* If we (now or from before) have no firstbyte, a firstbyte from the
3356 branch becomes a reqbyte if there isn't a branch reqbyte. */
3357
3358 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3359 branchreqbyte = branchfirstbyte;
3360
3361 /* Now ensure that the reqbytes match */
3362
3363 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
3364 reqbyte = REQ_NONE;
3365 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
3366 }
3367
3368 /* If lookbehind, check that this branch matches a fixed-length string,
3369 and put the length into the OP_REVERSE item. Temporarily mark the end of
3370 the branch with OP_END. */
3371
3372 if (lookbehind)
3373 {
3374 int length;
3375 *code = OP_END;
3376 length = find_fixedlength(last_branch, options);
3377 DPRINTF(("fixed length = %d\n", length));
3378 if (length < 0)
3379 {
3380 *errorptr = (length == -2)? ERR36 : ERR25;
3381 *ptrptr = ptr;
3382 return FALSE;
3383 }
3384 PUT(reverse_count, 0, length);
3385 }
3386
3387 /* Reached end of expression, either ')' or end of pattern. Go back through
3388 the alternative branches and reverse the chain of offsets, with the field in
3389 the BRA item now becoming an offset to the first alternative. If there are
3390 no alternatives, it points to the end of the group. The length in the
3391 terminating ket is always the length of the whole bracketed item. If any of
3392 the ims options were changed inside the group, compile a resetting op-code
3393 following, except at the very end of the pattern. Return leaving the pointer
3394 at the terminating char. */
3395
3396 if (*ptr != '|')
3397 {
3398 int length = code - last_branch;
3399 do
3400 {
3401 int prev_length = GET(last_branch, 1);
3402 PUT(last_branch, 1, length);
3403 length = prev_length;
3404 last_branch -= length;
3405 }
3406 while (length > 0);
3407
3408 /* Fill in the ket */
3409
3410 *code = OP_KET;
3411 PUT(code, 1, code - start_bracket);
3412 code += 1 + LINK_SIZE;
3413
3414 /* Resetting option if needed */
3415
3416 if ((options & PCRE_IMS) != oldims && *ptr == ')')
3417 {
3418 *code++ = OP_OPT;
3419 *code++ = oldims;
3420 }
3421
3422 /* Set values to pass back */
3423
3424 *codeptr = code;
3425 *ptrptr = ptr;
3426 *firstbyteptr = firstbyte;
3427 *reqbyteptr = reqbyte;
3428 return TRUE;
3429 }
3430
3431 /* Another branch follows; insert an "or" node. Its length field points back
3432 to the previous branch while the bracket remains open. At the end the chain
3433 is reversed. It's done like this so that the start of the bracket has a
3434 zero offset until it is closed, making it possible to detect recursion. */
3435
3436 *code = OP_ALT;
3437 PUT(code, 1, code - last_branch);
3438 bc.current = last_branch = code;
3439 code += 1 + LINK_SIZE;
3440 ptr++;
3441 }
3442 /* Control never reaches here */
3443 }
3444
3445
3446
3447
3448 /*************************************************
3449 * Check for anchored expression *
3450 *************************************************/
3451
3452 /* Try to find out if this is an anchored regular expression. Consider each
3453 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
3454 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
3455 it's anchored. However, if this is a multiline pattern, then only OP_SOD
3456 counts, since OP_CIRC can match in the middle.
3457
3458 We can also consider a regex to be anchored if OP_SOM starts all its branches.
3459 This is the code for \G, which means "match at start of match position, taking
3460 into account the match offset".
3461
3462 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3463 because that will try the rest of the pattern at all possible matching points,
3464 so there is no point trying again.... er ....
3465
3466 .... except when the .* appears inside capturing parentheses, and there is a
3467 subsequent back reference to those parentheses. We haven't enough information
3468 to catch that case precisely.
3469
3470 At first, the best we could do was to detect when .* was in capturing brackets
3471 and the highest back reference was greater than or equal to that level.
3472 However, by keeping a bitmap of the first 31 back references, we can catch some
3473 of the more common cases more precisely.
3474
3475 Arguments:
3476 code points to start of expression (the bracket)
3477 options points to the options setting
3478 bracket_map a bitmap of which brackets we are inside while testing; this
3479 handles up to substring 31; after that we just have to take
3480 the less precise approach
3481 backref_map the back reference bitmap
3482
3483 Returns: TRUE or FALSE
3484 */
3485
3486 static BOOL
3487 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3488 unsigned int backref_map)
3489 {
3490 do {
3491 const uschar *scode =
3492 first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE);
3493 register int op = *scode;
3494
3495 /* Capturing brackets */
3496
3497 if (op > OP_BRA)
3498 {
3499 int new_map;
3500 op -= OP_BRA;
3501 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3502 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3503 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3504 }
3505
3506 /* Other brackets */
3507
3508 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3509 {
3510 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3511 }
3512
3513 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3514 are or may be referenced. */
3515
3516 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3517 (*options & PCRE_DOTALL) != 0)
3518 {
3519 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3520 }
3521
3522 /* Check for explicit anchoring */
3523
3524 else if (op != OP_SOD && op != OP_SOM &&
3525 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
3526 return FALSE;
3527 code += GET(code, 1);
3528 }
3529 while (*code == OP_ALT); /* Loop for each alternative */
3530 return TRUE;
3531 }
3532
3533
3534
3535 /*************************************************
3536 * Check for starting with ^ or .* *
3537 *************************************************/
3538
3539 /* This is called to find out if every branch starts with ^ or .* so that
3540 "first char" processing can be done to speed things up in multiline
3541 matching and for non-DOTALL patterns that start with .* (which must start at
3542 the beginning or after \n). As in the case of is_anchored() (see above), we
3543 have to take account of back references to capturing brackets that contain .*
3544 because in that case we can't make the assumption.
3545
3546 Arguments:
3547 code points to start of expression (the bracket)
3548 bracket_map a bitmap of which brackets we are inside while testing; this
3549 handles up to substring 31; after that we just have to take
3550 the less precise approach
3551 backref_map the back reference bitmap
3552
3553 Returns: TRUE or FALSE
3554 */
3555
3556 static BOOL
3557 is_startline(const uschar *code, unsigned int bracket_map,
3558 unsigned int backref_map)
3559 {
3560 do {
3561 const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0);
3562 register int op = *scode;
3563
3564 /* Capturing brackets */
3565
3566 if (op > OP_BRA)
3567 {
3568 int new_map;
3569 op -= OP_BRA;
3570 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3571 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3572 if (!is_startline(scode, new_map, backref_map)) return FALSE;
3573 }
3574
3575 /* Other brackets */
3576
3577 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3578 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3579
3580 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3581 may be referenced. */
3582
3583 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
3584 {
3585 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3586 }
3587
3588 /* Check for explicit circumflex */
3589
3590 else if (op != OP_CIRC) return FALSE;
3591 code += GET(code, 1);
3592 }
3593 while (*code == OP_ALT); /* Loop for each alternative */
3594 return TRUE;
3595 }
3596
3597
3598
3599 /*************************************************
3600 * Check for asserted fixed first char *
3601 *************************************************/
3602
3603 /* During compilation, the "first char" settings from forward assertions are
3604 discarded, because they can cause conflicts with actual literals that follow.
3605 However, if we end up without a first char setting for an unanchored pattern,
3606 it is worth scanning the regex to see if there is an initial asserted first
3607 char. If all branches start with the same asserted char, or with a bracket all
3608 of whose alternatives start with the same asserted char (recurse ad lib), then
3609 we return that char, otherwise -1.
3610
3611 Arguments:
3612 code points to start of expression (the bracket)
3613 options pointer to the options (used to check casing changes)
3614 inassert TRUE if in an assertion
3615
3616 Returns: -1 or the fixed first char
3617 */
3618
3619 static int
3620 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
3621 {
3622 register int c = -1;
3623 do {
3624 int d;
3625 const uschar *scode =
3626 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS);
3627 register int op = *scode;
3628
3629 if (op >= OP_BRA) op = OP_BRA;
3630
3631 switch(op)
3632 {
3633 default:
3634 return -1;
3635
3636 case OP_BRA:
3637 case OP_ASSERT:
3638 case OP_ONCE:
3639 case OP_COND:
3640 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
3641 return -1;
3642 if (c < 0) c = d; else if (c != d) return -1;
3643 break;
3644
3645 case OP_EXACT: /* Fall through */
3646 scode++;
3647
3648 case OP_CHARS: /* Fall through */
3649 scode++;
3650
3651 case OP_PLUS:
3652 case OP_MINPLUS:
3653 if (!inassert) return -1;
3654 if (c < 0)
3655 {
3656 c = scode[1];
3657 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
3658 }
3659 else if (c != scode[1]) return -1;
3660 break;
3661 }
3662
3663 code += GET(code, 1);
3664 }
3665 while (*code == OP_ALT);
3666 return c;
3667 }
3668
3669
3670
3671
3672 /*************************************************
3673 * Compile a Regular Expression *
3674 *************************************************/
3675
3676 /* This function takes a string and returns a pointer to a block of store
3677 holding a compiled version of the expression.
3678
3679 Arguments:
3680 pattern the regular expression
3681 options various option bits
3682 errorptr pointer to pointer to error text
3683 erroroffset ptr offset in pattern where error was detected
3684 tables pointer to character tables or NULL
3685
3686 Returns: pointer to compiled data block, or NULL on error,
3687 with errorptr and erroroffset set
3688 */
3689
3690 pcre *
3691 pcre_compile(const char *pattern, int options, const char **errorptr,
3692 int *erroroffset, const unsigned char *tables)
3693 {
3694 real_pcre *re;
3695 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
3696 int runlength;
3697 int c, firstbyte, reqbyte;
3698 int bracount = 0;
3699 int branch_extra = 0;
3700 int branch_newextra;
3701 int item_count = -1;
3702 int name_count = 0;
3703 int max_name_size = 0;
3704 #ifdef SUPPORT_UTF8
3705 int lastcharlength = 0;
3706 BOOL utf8;
3707 BOOL class_utf8;
3708 #endif
3709 BOOL inescq = FALSE;
3710 unsigned int brastackptr = 0;
3711 size_t size;
3712 uschar *code;
3713 const uschar *codestart;
3714 const uschar *ptr;
3715 compile_data compile_block;
3716 int brastack[BRASTACK_SIZE];
3717 uschar bralenstack[BRASTACK_SIZE];
3718
3719 /* We can't pass back an error message if errorptr is NULL; I guess the best we
3720 can do is just return NULL. */
3721
3722 if (errorptr == NULL) return NULL;
3723 *errorptr = NULL;
3724
3725 /* However, we can give a message for this error */
3726
3727 if (erroroffset == NULL)
3728 {
3729 *errorptr = ERR16;
3730 return NULL;
3731 }
3732 *erroroffset = 0;
3733
3734 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
3735
3736 #ifdef SUPPORT_UTF8
3737 utf8 = (options & PCRE_UTF8) != 0;
3738 #else
3739 if ((options & PCRE_UTF8) != 0)
3740 {
3741 *errorptr = ERR32;
3742 return NULL;
3743 }
3744 #endif
3745
3746 if ((options & ~PUBLIC_OPTIONS) != 0)
3747 {
3748 *errorptr = ERR17;
3749 return NULL;
3750 }
3751
3752 /* Set up pointers to the individual character tables */
3753
3754 if (tables == NULL) tables = pcre_default_tables;
3755 compile_block.lcc = tables + lcc_offset;
3756 compile_block.fcc = tables + fcc_offset;
3757 compile_block.cbits = tables + cbits_offset;
3758 compile_block.ctypes = tables + ctypes_offset;
3759
3760 /* Maximum back reference and backref bitmap. This is updated for numeric
3761 references during the first pass, but for named references during the actual
3762 compile pass. The bitmap records up to 31 back references to help in deciding
3763 whether (.*) can be treated as anchored or not. */
3764
3765 compile_block.top_backref = 0;
3766 compile_block.backref_map = 0;
3767
3768 /* Reflect pattern for debugging output */
3769
3770 DPRINTF(("------------------------------------------------------------------\n"));
3771 DPRINTF(("%s\n", pattern));
3772
3773 /* The first thing to do is to make a pass over the pattern to compute the
3774 amount of store required to hold the compiled code. This does not have to be
3775 perfect as long as errors are overestimates. At the same time we can detect any
3776 flag settings right at the start, and extract them. Make an attempt to correct
3777 for any counted white space if an "extended" flag setting appears late in the
3778 pattern. We can't be so clever for #-comments. */
3779
3780 ptr = (const uschar *)(pattern - 1);
3781 while ((c = *(++ptr)) != 0)
3782 {
3783 int min, max;
3784 int class_optcount;
3785 int bracket_length;
3786 int duplength;
3787
3788 /* If we are inside a \Q...\E sequence, all chars are literal */
3789
3790 if (inescq) goto NORMAL_CHAR;
3791
3792 /* Otherwise, first check for ignored whitespace and comments */
3793
3794 if ((options & PCRE_EXTENDED) != 0)
3795 {
3796 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
3797 if (c == '#')
3798 {
3799 /* The space before the ; is to avoid a warning on a silly compiler
3800 on the Macintosh. */
3801 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3802 if (c == 0) break;
3803 continue;
3804 }
3805 }
3806
3807 item_count++; /* Is zero for the first non-comment item */
3808
3809 switch(c)
3810 {
3811 /* A backslashed item may be an escaped "normal" character or a
3812 character type. For a "normal" character, put the pointers and
3813 character back so that tests for whitespace etc. in the input
3814 are done correctly. */
3815
3816 case '\\':
3817 {
3818 const uschar *save_ptr = ptr;
3819 c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
3820 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3821 if (c >= 0)
3822 {
3823 ptr = save_ptr;
3824 c = '\\';
3825 goto NORMAL_CHAR;
3826 }
3827 }
3828
3829 /* If \Q, enter "literal" mode */
3830
3831 if (-c == ESC_Q)
3832 {
3833 inescq = TRUE;
3834 continue;
3835 }
3836
3837 /* Other escapes need one byte, and are of length one for repeats */
3838
3839 length++;
3840 #ifdef SUPPORT_UTF8
3841 lastcharlength = 1;
3842 #endif
3843
3844 /* A back reference needs an additional 2 bytes, plus either one or 5
3845 bytes for a repeat. We also need to keep the value of the highest
3846 back reference. */
3847
3848 if (c <= -ESC_REF)
3849 {
3850 int refnum = -c - ESC_REF;
3851 compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
3852 if (refnum > compile_block.top_backref)
3853 compile_block.top_backref = refnum;
3854 length += 2; /* For single back reference */
3855 if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
3856 {
3857 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
3858 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3859 if ((min == 0 && (max == 1 || max == -1)) ||
3860 (min == 1 && max == -1))
3861 length++;
3862 else length += 5;
3863 if (ptr[1] == '?') ptr++;
3864 }
3865 }
3866 continue;
3867
3868 case '^': /* Single-byte metacharacters */
3869 case '.':
3870 case '$':
3871 length++;
3872 #ifdef SUPPORT_UTF8
3873 lastcharlength = 1;
3874 #endif
3875 continue;
3876
3877 case '*': /* These repeats won't be after brackets; */
3878 case '+': /* those are handled separately */
3879 case '?':
3880 length++;
3881 goto POSESSIVE; /* A few lines below */
3882
3883 /* This covers the cases of braced repeats after a single char, metachar,
3884 class, or back reference. */
3885
3886 case '{':
3887 if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
3888 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
3889 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3890
3891 /* These special cases just insert one extra opcode */
3892
3893 if ((min == 0 && (max == 1 || max == -1)) ||
3894 (min == 1 && max == -1))
3895 length++;
3896
3897 /* These cases might insert additional copies of a preceding character. */
3898
3899 else
3900 {
3901 #ifdef SUPPORT_UTF8
3902 /* In UTF-8 mode, we should find the length in lastcharlength */
3903 if (utf8)
3904 {
3905 if (min != 1)
3906 {
3907 length -= lastcharlength; /* Uncount the original char or metachar */
3908 if (min > 0) length += 3 + lastcharlength;
3909 }
3910 length += lastcharlength + ((max > 0)? 3 : 1);
3911 }
3912 else
3913 #endif
3914
3915 /* Not UTF-8 mode: all characters are one byte */
3916 {
3917 if (min != 1)
3918 {
3919 length--; /* Uncount the original char or metachar */
3920 if (min > 0) length += 4;
3921 }
3922
3923 length += (max > 0)? 4 : 2;
3924 }
3925 }
3926
3927 if (ptr[1] == '?') ptr++; /* Needs no extra length */
3928
3929 POSESSIVE: /* Test for possessive quantifier */
3930 if (ptr[1] == '+')
3931 {
3932 ptr++;
3933 length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
3934 }
3935 continue;
3936
3937 /* An alternation contains an offset to the next branch or ket. If any ims
3938 options changed in the previous branch(es), and/or if we are in a
3939 lookbehind assertion, extra space will be needed at the start of the
3940 branch. This is handled by branch_extra. */
3941
3942 case '|':
3943 length += 1 + LINK_SIZE + branch_extra;
3944 continue;
3945
3946 /* A character class uses 33 characters provided that all the character
3947 values are less than 256. Otherwise, it uses a bit map for low valued
3948 characters, and individual items for others. Don't worry about character
3949 types that aren't allowed in classes - they'll get picked up during the
3950 compile. A character class that contains only one single-byte character
3951 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
3952 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
3953
3954 case '[':
3955 class_optcount = 0;
3956
3957 #ifdef SUPPORT_UTF8
3958 class_utf8 = FALSE;
3959 #endif
3960
3961 if (*(++ptr) == '^') ptr++;
3962
3963 /* Written as a "do" so that an initial ']' is taken as data */
3964
3965 if (*ptr != 0) do
3966 {
3967 /* Inside \Q...\E everything is literal except \E */
3968
3969 if (inescq)
3970 {
3971 if (*ptr != '\\' || ptr[1] != 'E') goto NON_SPECIAL_CHARACTER;
3972 inescq = FALSE;
3973 ptr += 1;
3974 continue;
3975 }
3976
3977 /* Outside \Q...\E, check for escapes */
3978
3979 if (*ptr == '\\')
3980 {
3981 #ifdef SUPPORT_UTF8
3982 int prevchar = ptr[-1];
3983 #endif
3984 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
3985 &compile_block);
3986 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3987
3988 /* \b is backspace inside a class */
3989
3990 if (-ch == ESC_b) ch = '\b';
3991
3992 /* \Q enters quoting mode */
3993
3994 if (-ch == ESC_Q)
3995 {
3996 inescq = TRUE;
3997 continue;
3998 }
3999
4000 /* Handle escapes that turn into characters */
4001
4002 if (ch >= 0)
4003 {
4004 #ifdef SUPPORT_UTF8
4005 if (utf8)
4006 {
4007 if (ch > 127) class_optcount = 10; /* Ensure > 1 */
4008 if (ch > 255)
4009 {
4010 uschar buffer[6];
4011 if (!class_utf8)
4012 {
4013 class_utf8 = TRUE;
4014 length += LINK_SIZE + 1 + 1;
4015 }
4016 length += 1 + ord2utf8(ch, buffer);
4017
4018 /* If this wide character is preceded by '-', add an extra 2 to
4019 the length in case the previous character was < 128, because in
4020 this case the whole range will be put into the list. */
4021
4022 if (prevchar == '-') length += 2;
4023 }
4024 }
4025 #endif
4026 class_optcount++; /* for possible optimization */
4027 }
4028 else class_optcount = 10; /* \d, \s etc; make sure > 1 */
4029 }
4030
4031 /* Check the syntax for POSIX stuff. The bits we actually handle are
4032 checked during the real compile phase. */
4033
4034 else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4035 {
4036 ptr++;
4037 class_optcount = 10; /* Make sure > 1 */
4038 }
4039
4040 /* Anything else just increments the possible optimization count. If
4041 there are wide characters, we are going to have to use an XCLASS. */
4042
4043 else
4044 {
4045 NON_SPECIAL_CHARACTER:
4046 class_optcount++;
4047
4048 #ifdef SUPPORT_UTF8
4049 if (utf8)
4050 {
4051 int c;
4052 int extra = 0;
4053 GETCHARLEN(c, ptr, extra);
4054 if (c > 127) class_optcount = 10; /* No optimization possible */
4055 if (c > 255)
4056 {
4057 if (!class_utf8)
4058 {
4059 class_utf8 = TRUE;
4060 length += LINK_SIZE + 1 + 1;
4061 }
4062 length += 2 + extra;
4063
4064 /* If this wide character is preceded by '-', add an extra 2 to
4065 the length in case the previous character was < 128, because in
4066 this case the whole range will be put into the list. */
4067
4068 if (ptr[-1] == '-') length += 2;
4069
4070 /* Advance to the end of this character */
4071
4072 ptr += extra;
4073 }
4074 }
4075 #endif
4076 }
4077 }
4078 while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4079
4080 if (*ptr == 0) /* Missing terminating ']' */
4081 {
4082 *errorptr = ERR6;
4083 goto PCRE_ERROR_RETURN;
4084 }
4085
4086 /* We can optimize when there was only one optimizable character. Repeats
4087 for positive and negated single one-byte chars are handled by the general
4088 code. Here, we handle repeats for the class opcodes. */
4089
4090 if (class_optcount == 1) length += 3; else
4091 {
4092 length += 33;
4093
4094 /* A repeat needs either 1 or 5 bytes. */
4095
4096 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
4097 {
4098 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
4099 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4100 if ((min == 0 && (max == 1 || max == -1)) ||
4101 (min == 1 && max == -1))
4102 length++;
4103 else length += 5;
4104 if (ptr[1] == '?') ptr++;
4105 }
4106 }
4107 continue;
4108
4109 /* Brackets may be genuine groups or special things */
4110
4111 case '(':
4112 branch_newextra = 0;
4113 bracket_length = 1 + LINK_SIZE;
4114
4115 /* Handle special forms of bracket, which all start (? */
4116
4117 if (ptr[1] == '?')
4118 {
4119 int set, unset;
4120 int *optset;
4121
4122 switch (c = ptr[2])
4123 {
4124 /* Skip over comments entirely */
4125 case '#':
4126 ptr += 3;
4127 while (*ptr != 0 && *ptr != ')') ptr++;
4128 if (*ptr == 0)
4129 {
4130 *errorptr = ERR18;
4131 goto PCRE_ERROR_RETURN;
4132 }
4133 continue;
4134
4135 /* Non-referencing groups and lookaheads just move the pointer on, and
4136 then behave like a non-special bracket, except that they don't increment
4137 the count of extracting brackets. Ditto for the "once only" bracket,
4138 which is in Perl from version 5.005. */
4139
4140 case ':':
4141 case '=':
4142 case '!':
4143 case '>':
4144 ptr += 2;
4145 break;
4146
4147 /* (?R) specifies a recursive call to the regex, which is an extension
4148 to provide the facility which can be obtained by (?p{perl-code}) in
4149 Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
4150
4151 From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
4152 the appropriate numbered brackets. This includes both recursive and
4153 non-recursive calls. (?R) is now synonymous with (?0). */
4154
4155 case 'R':
4156 ptr++;
4157
4158 case '0': case '1': case '2': case '3': case '4':
4159 case '5': case '6': case '7': case '8': case '9':
4160 ptr += 2;
4161 if (c != 'R')
4162 while ((compile_block.ctypes[*(++ptr)] & ctype_digit) != 0);
4163 if (*ptr != ')')
4164 {
4165 *errorptr = ERR29;
4166 goto PCRE_ERROR_RETURN;
4167 }
4168 length += 1 + LINK_SIZE;
4169
4170 /* If this item is quantified, it will get wrapped inside brackets so
4171 as to use the code for quantified brackets. We jump down and use the
4172 code that handles this for real brackets. */
4173
4174 if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
4175 {
4176 length += 2 + 2 * LINK_SIZE; /* to make bracketed */
4177 duplength = 5 + 3 * LINK_SIZE;
4178 goto HANDLE_QUANTIFIED_BRACKETS;
4179 }
4180 continue;
4181
4182 /* (?C) is an extension which provides "callout" - to provide a bit of
4183 the functionality of the Perl (?{...}) feature. An optional number may
4184 follow (default is zero). */
4185
4186 case 'C':
4187 ptr += 2;
4188 while ((compile_block.ctypes[*(++ptr)] & ctype_digit) != 0);
4189 if (*ptr != ')')
4190 {
4191 *errorptr = ERR39;
4192 goto PCRE_ERROR_RETURN;
4193 }
4194 length += 2;
4195 continue;
4196
4197 /* Named subpatterns are an extension copied from Python */
4198
4199 case 'P':
4200 ptr += 3;
4201 if (*ptr == '<')
4202 {
4203 const uschar *p; /* Don't amalgamate; some compilers */
4204 p = ++ptr; /* grumble at autoincrement in declaration */
4205 while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
4206 if (*ptr != '>')
4207 {
4208 *errorptr = ERR42;
4209 goto PCRE_ERROR_RETURN;
4210 }
4211 name_count++;
4212 if (ptr - p > max_name_size) max_name_size = (ptr - p);
4213 break;
4214 }
4215
4216 if (*ptr == '=' || *ptr == '>')
4217 {
4218 while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
4219 if (*ptr != ')')
4220 {
4221 *errorptr = ERR42;
4222 goto PCRE_ERROR_RETURN;
4223 }
4224 break;
4225 }
4226
4227 /* Unknown character after (?P */
4228
4229 *errorptr = ERR41;
4230 goto PCRE_ERROR_RETURN;
4231
4232 /* Lookbehinds are in Perl from version 5.005 */
4233
4234 case '<':
4235 ptr += 3;
4236 if (*ptr == '=' || *ptr == '!')
4237 {
4238 branch_newextra = 1 + LINK_SIZE;
4239 length += 1 + LINK_SIZE; /* For the first branch */
4240 break;
4241 }
4242 *errorptr = ERR24;
4243 goto PCRE_ERROR_RETURN;
4244
4245 /* Conditionals are in Perl from version 5.005. The bracket must either
4246 be followed by a number (for bracket reference) or by an assertion
4247 group, or (a PCRE extension) by 'R' for a recursion test. */
4248
4249 case '(':
4250 if (ptr[3] == 'R' && ptr[4] == ')')
4251 {
4252 ptr += 4;
4253 length += 3;
4254 }
4255 else if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
4256 {
4257 ptr += 4;
4258 length += 3;
4259 while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
4260 if (*ptr != ')')
4261 {
4262 *errorptr = ERR26;
4263 goto PCRE_ERROR_RETURN;
4264 }
4265 }
4266 else /* An assertion must follow */
4267 {
4268 ptr++; /* Can treat like ':' as far as spacing is concerned */
4269 if (ptr[2] != '?' ||
4270 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
4271 {
4272 ptr += 2; /* To get right offset in message */
4273 *errorptr = ERR28;
4274 goto PCRE_ERROR_RETURN;
4275 }
4276 }
4277 break;
4278
4279 /* Else loop checking valid options until ) is met. Anything else is an
4280 error. If we are without any brackets, i.e. at top level, the settings
4281 act as if specified in the options, so massage the options immediately.
4282 This is for backward compatibility with Perl 5.004. */
4283
4284 default:
4285 set = unset = 0;
4286 optset = &set;
4287 ptr += 2;
4288
4289 for (;; ptr++)
4290 {
4291 c = *ptr;
4292 switch (c)
4293 {
4294 case 'i':
4295 *optset |= PCRE_CASELESS;
4296 continue;
4297
4298 case 'm':
4299 *optset |= PCRE_MULTILINE;
4300 continue;
4301
4302 case 's':
4303 *optset |= PCRE_DOTALL;
4304 continue;
4305
4306 case 'x':
4307 *optset |= PCRE_EXTENDED;
4308 continue;
4309
4310 case 'X':
4311 *optset |= PCRE_EXTRA;
4312 continue;
4313
4314 case 'U':
4315 *optset |= PCRE_UNGREEDY;
4316 continue;
4317
4318 case '-':
4319 optset = &unset;
4320 continue;
4321
4322 /* A termination by ')' indicates an options-setting-only item; if
4323 this is at the very start of the pattern (indicated by item_count
4324 being zero), we use it to set the global options. This is helpful
4325 when analyzing the pattern for first characters, etc. Otherwise
4326 nothing is done here and it is handled during the compiling
4327 process.
4328
4329 [Historical note: Up to Perl 5.8, options settings at top level
4330 were always global settings, wherever they appeared in the pattern.
4331 That is, they were equivalent to an external setting. From 5.8
4332 onwards, they apply only to what follows (which is what you might
4333 expect).] */
4334
4335 case ')':
4336 if (item_count == 0)
4337 {
4338 options = (options | set) & (~unset);
4339 set = unset = 0; /* To save length */
4340 item_count--; /* To allow for several */
4341 }
4342
4343 /* Fall through */
4344
4345 /* A termination by ':' indicates the start of a nested group with
4346 the given options set. This is again handled at compile time, but
4347 we must allow for compiled space if any of the ims options are
4348 set. We also have to allow for resetting space at the end of
4349 the group, which is why 4 is added to the length and not just 2.
4350 If there are several changes of options within the same group, this
4351 will lead to an over-estimate on the length, but this shouldn't
4352 matter very much. We also have to allow for resetting options at
4353 the start of any alternations, which we do by setting
4354 branch_newextra to 2. Finally, we record whether the case-dependent
4355 flag ever changes within the regex. This is used by the "required
4356 character" code. */
4357
4358 case ':':
4359 if (((set|unset) & PCRE_IMS) != 0)
4360 {
4361 length += 4;
4362 branch_newextra = 2;
4363 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
4364 }
4365 goto END_OPTIONS;
4366
4367 /* Unrecognized option character */
4368
4369 default:
4370 *errorptr = ERR12;
4371 goto PCRE_ERROR_RETURN;
4372 }
4373 }
4374
4375 /* If we hit a closing bracket, that's it - this is a freestanding
4376 option-setting. We need to ensure that branch_extra is updated if
4377 necessary. The only values branch_newextra can have here are 0 or 2.
4378 If the value is 2, then branch_extra must end up as either 2 or
4379 3+LINK_SIZE, depending on whether this is a lookbehind group or not. */
4380
4381 END_OPTIONS:
4382 if (c == ')')
4383 {
4384 if (branch_newextra == 2 &&
4385 (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
4386 branch_extra += branch_newextra;
4387 continue;
4388 }
4389
4390 /* If options were terminated by ':' control comes here. Fall through
4391 to handle the group below. */
4392 }
4393 }
4394
4395 /* Extracting brackets must be counted so we can process escapes in a
4396 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
4397 need an additional 3 bytes of store per extracting bracket. However, if
4398 PCRE_NO_AUTO_CAPTURE is set, unadorned brackets become non-capturing, so we
4399 must leave the count alone (it will always be zero). */
4400
4401 else if ((options & PCRE_NO_AUTO_CAPTURE) == 0)
4402 {
4403 bracount++;
4404 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
4405 }
4406
4407 /* Save length for computing whole length at end if there's a repeat that
4408 requires duplication of the group. Also save the current value of
4409 branch_extra, and start the new group with the new value. If non-zero, this
4410 will either be 2 for a (?imsx: group, or 1+LINK_SIZE for a lookbehind assertion. */
4411
4412 if (brastackptr >= sizeof(brastack)/sizeof(int))
4413 {
4414 *errorptr = ERR19;
4415 goto PCRE_ERROR_RETURN;
4416 }
4417
4418 bralenstack[brastackptr] = branch_extra;
4419 branch_extra = branch_newextra;
4420
4421 brastack[brastackptr++] = length;
4422 length += bracket_length;
4423 continue;
4424
4425 /* Handle ket. Look for subsequent max/min; for certain sets of values we
4426 have to replicate this bracket up to that many times. If brastackptr is
4427 0 this is an unmatched bracket which will generate an error, but take care
4428 not to try to access brastack[-1] when computing the length and restoring
4429 the branch_extra value. */
4430
4431 case ')':
4432 length += 1 + LINK_SIZE;
4433 if (brastackptr > 0)
4434 {
4435 duplength = length - brastack[--brastackptr];
4436 branch_extra = bralenstack[brastackptr];
4437 }
4438 else duplength = 0;
4439
4440 /* The following code is also used when a recursion such as (?3) is
4441 followed by a quantifier, because in that case, it has to be wrapped inside
4442 brackets so that the quantifier works. The value of duplength must be
4443 set before arrival. */
4444
4445 HANDLE_QUANTIFIED_BRACKETS:
4446
4447 /* Leave ptr at the final char; for read_repeat_counts this happens
4448 automatically; for the others we need an increment. */
4449
4450 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
4451 {
4452 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
4453 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4454 }
4455 else if (c == '*') { min = 0; max = -1; ptr++; }
4456 else if (c == '+') { min = 1; max = -1; ptr++; }
4457 else if (c == '?') { min = 0; max = 1; ptr++; }
4458 else { min = 1; max = 1; }
4459
4460 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
4461 group, and if the maximum is greater than zero, we have to replicate
4462 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
4463 bracket set. */
4464
4465 if (min == 0)
4466 {
4467 length++;
4468 if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
4469 }
4470
4471 /* When the minimum is greater than zero, we have to replicate up to
4472 minval-1 times, with no additions required in the copies. Then, if there
4473 is a limited maximum we have to replicate up to maxval-1 times allowing
4474 for a BRAZERO item before each optional copy and nesting brackets for all
4475 but one of the optional copies. */
4476
4477 else
4478 {
4479 length += (min - 1) * duplength;
4480 if (max > min) /* Need this test as max=-1 means no limit */
4481 length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
4482 - (2 + 2*LINK_SIZE);
4483 }
4484
4485 /* Allow space for once brackets for "possessive quantifier" */
4486
4487 if (ptr[1] == '+')
4488 {
4489 ptr++;
4490 length += 2 + 2*LINK_SIZE;
4491 }
4492 continue;
4493
4494 /* Non-special character. For a run of such characters the length required
4495 is the number of characters + 2, except that the maximum run length is
4496 MAXLIT. We won't get a skipped space or a non-data escape or the start of a
4497 # comment as the first character, so the length can't be zero. */
4498
4499 NORMAL_CHAR:
4500 default:
4501 length += 2;
4502 runlength = 0;
4503 do
4504 {
4505 #ifdef SUPPORT_UTF8
4506 lastcharlength = 1; /* Need length of last char for UTF-8 repeats */
4507 #endif
4508
4509 /* If in a \Q...\E sequence, check for end; otherwise it's a literal */
4510 if (inescq)
4511 {
4512 if (c == '\\' && ptr[1] == 'E')
4513 {
4514 inescq = FALSE;
4515 ptr++;
4516 }
4517 else runlength++;
4518 continue;
4519 }
4520
4521 /* Skip whitespace and comments for /x */
4522
4523 if ((options & PCRE_EXTENDED) != 0)
4524 {
4525 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
4526 if (c == '#')
4527 {
4528 /* The space before the ; is to avoid a warning on a silly compiler
4529 on the Macintosh. */
4530 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
4531 continue;
4532 }
4533 }
4534
4535 /* Backslash may introduce a data char or a metacharacter; stop the
4536 string before the latter. */
4537
4538 if (c == '\\')
4539 {
4540 const uschar *saveptr = ptr;
4541 c = check_escape(&ptr, errorptr, bracount, options, FALSE,
4542 &compile_block);
4543 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4544 if (c < 0) { ptr = saveptr; break; }
4545
4546 /* In UTF-8 mode, add on the number of additional bytes needed to
4547 encode this character, and save the total length in case this is a
4548 final char that is repeated. */
4549
4550 #ifdef SUPPORT_UTF8
4551 if (utf8 && c > 127)
4552 {
4553 int i;
4554 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
4555 if (c <= utf8_table1[i]) break;
4556 runlength += i;
4557 lastcharlength += i;
4558 }
4559 #endif
4560 }
4561
4562 /* Ordinary character or single-char escape */
4563
4564 runlength++;
4565 }
4566
4567 /* This "while" is the end of the "do" above. */
4568
4569 while (runlength < MAXLIT &&
4570 (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
4571
4572 /* If we hit a meta-character, back off to point to it */
4573
4574 if (runlength < MAXLIT) ptr--;
4575
4576 /* If the last char in the string is a UTF-8 multibyte character, we must
4577 set lastcharlength correctly. If it was specified as an escape, this will
4578 already have been done above. However, we also have to support in-line
4579 UTF-8 characters, so check backwards from where we are. */
4580
4581 #ifdef SUPPORT_UTF8
4582 if (utf8)
4583 {
4584 const uschar *lastptr = ptr - 1;
4585 if ((*lastptr & 0x80) != 0)
4586 {
4587 while((*lastptr & 0xc0) == 0x80) lastptr--;
4588 lastcharlength = ptr - lastptr;
4589 }
4590 }
4591 #endif
4592
4593 length += runlength;
4594 continue;
4595 }
4596 }
4597
4598 length += 2 + LINK_SIZE; /* For final KET and END */
4599
4600 if (length > MAX_PATTERN_SIZE)
4601 {
4602 *errorptr = ERR20;
4603 return NULL;
4604 }
4605
4606 /* Compute the size of data block needed and get it, either from malloc or
4607 externally provided function. */
4608
4609 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
4610 re = (real_pcre *)(pcre_malloc)(size);
4611
4612 if (re == NULL)
4613 {
4614 *errorptr = ERR21;
4615 return NULL;
4616 }
4617
4618 /* Put in the magic number, and save the size, options, and table pointer */
4619
4620 re->magic_number = MAGIC_NUMBER;
4621 re->size = size;
4622 re->options = options;
4623 re->tables = tables;
4624 re->name_entry_size = max_name_size + 3;
4625 re->name_count = name_count;
4626
4627 /* The starting points of the name/number translation table and of the code are
4628 passed around in the compile data block. */
4629
4630 compile_block.names_found = 0;
4631 compile_block.name_entry_size = max_name_size + 3;
4632 compile_block.name_table = (uschar *)re + sizeof(real_pcre);
4633 codestart = compile_block.name_table + re->name_entry_size * re->name_count;
4634 compile_block.start_code = codestart;
4635 compile_block.req_varyopt = 0;
4636
4637 /* Set up a starting, non-extracting bracket, then compile the expression. On
4638 error, *errorptr will be set non-NULL, so we don't need to look at the result
4639 of the function here. */
4640
4641 ptr = (const uschar *)pattern;
4642 code = (uschar *)codestart;
4643 *code = OP_BRA;
4644 bracount = 0;
4645 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
4646 errorptr, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
4647 re->top_bracket = bracount;
4648 re->top_backref = compile_block.top_backref;
4649
4650 /* If not reached end of pattern on success, there's an excess bracket. */
4651
4652 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
4653
4654 /* Fill in the terminating state and check for disastrous overflow, but
4655 if debugging, leave the test till after things are printed out. */
4656
4657 *code++ = OP_END;
4658
4659 #ifndef DEBUG
4660 if (code - codestart > length) *errorptr = ERR23;
4661 #endif
4662
4663 /* Give an error if there's back reference to a non-existent capturing
4664 subpattern. */
4665
4666 if (re->top_backref > re->top_bracket) *errorptr = ERR15;
4667
4668 /* Failed to compile, or error while post-processing */
4669
4670 if (*errorptr != NULL)
4671 {
4672 (pcre_free)(re);
4673 PCRE_ERROR_RETURN:
4674 *erroroffset = ptr - (const uschar *)pattern;
4675 return NULL;
4676 }
4677
4678 /* If the anchored option was not passed, set the flag if we can determine that
4679 the pattern is anchored by virtue of ^ characters or \A or anything else (such
4680 as starting with .* when DOTALL is set).
4681
4682 Otherwise, if we know what the first character has to be, save it, because that
4683 speeds up unanchored matches no end. If not, see if we can set the
4684 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
4685 start with ^. and also when all branches start with .* for non-DOTALL matches.
4686 */
4687
4688 if ((options & PCRE_ANCHORED) == 0)
4689 {
4690 int temp_options = options;
4691 if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
4692 re->options |= PCRE_ANCHORED;
4693 else
4694 {
4695 if (firstbyte < 0)
4696 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
4697 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
4698 {
4699 int ch = firstbyte & 255;
4700 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
4701 compile_block.fcc[ch] == ch)? ch : firstbyte;
4702 re->options |= PCRE_FIRSTSET;
4703 }
4704 else if (is_startline(codestart, 0, compile_block.backref_map))
4705 re->options |= PCRE_STARTLINE;
4706 }
4707 }
4708
4709 /* For an anchored pattern, we use the "required byte" only if it follows a
4710 variable length item in the regex. Remove the caseless flag for non-caseable
4711 chars. */
4712
4713 if (reqbyte >= 0 &&
4714 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
4715 {
4716 int ch = reqbyte & 255;
4717 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
4718 compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
4719 re->options |= PCRE_REQCHSET;
4720 }
4721
4722 /* Print out the compiled data for debugging */
4723
4724 #ifdef DEBUG
4725
4726 printf("Length = %d top_bracket = %d top_backref = %d\n",
4727 length, re->top_bracket, re->top_backref);
4728
4729 if (re->options != 0)
4730 {
4731 printf("%s%s%s%s%s%s%s%s%s\n",
4732 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
4733 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
4734 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
4735 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
4736 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
4737 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
4738 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
4739 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
4740 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
4741 }
4742
4743 if ((re->options & PCRE_FIRSTSET) != 0)
4744 {
4745 int ch = re->first_byte & 255;
4746 char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
4747 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
4748 else printf("First char = \\x%02x%s\n", ch, caseless);
4749 }
4750
4751 if ((re->options & PCRE_REQCHSET) != 0)
4752 {
4753 int ch = re->req_byte & 255;
4754 char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
4755 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
4756 else printf("Req char = \\x%02x%s\n", ch, caseless);
4757 }
4758
4759 print_internals(re, stdout);
4760
4761 /* This check is done here in the debugging case so that the code that
4762 was compiled can be seen. */
4763
4764 if (code - codestart > length)
4765 {
4766 *errorptr = ERR23;
4767 (pcre_free)(re);
4768 *erroroffset = ptr - (uschar *)pattern;
4769 return NULL;
4770 }
4771 #endif
4772
4773 return (pcre *)re;
4774 }
4775
4776
4777
4778 /*************************************************
4779 * Match a back-reference *
4780 *************************************************/
4781
4782 /* If a back reference hasn't been set, the length that is passed is greater
4783 than the number of characters left in the string, so the match fails.
4784
4785 Arguments:
4786 offset index into the offset vector
4787 eptr points into the subject
4788 length length to be matched
4789 md points to match data block
4790 ims the ims flags
4791
4792 Returns: TRUE if matched
4793 */
4794
4795 static BOOL
4796 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
4797 unsigned long int ims)
4798 {
4799 const uschar *p = md->start_subject + md->offset_vector[offset];
4800
4801 #ifdef DEBUG
4802 if (eptr >= md->end_subject)
4803 printf("matching subject <null>");
4804 else
4805 {
4806 printf("matching subject ");
4807 pchars(eptr, length, TRUE, md);
4808 }
4809 printf(" against backref ");
4810 pchars(p, length, FALSE, md);
4811 printf("\n");
4812 #endif
4813
4814 /* Always fail if not enough characters left */
4815
4816 if (length > md->end_subject - eptr) return FALSE;
4817
4818 /* Separate the caselesss case for speed */
4819
4820 if ((ims & PCRE_CASELESS) != 0)
4821 {
4822 while (length-- > 0)
4823 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
4824 }
4825 else
4826 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
4827
4828 return TRUE;
4829 }
4830
4831
#ifdef SUPPORT_UTF8
/*************************************************
*       Match character against an XCLASS        *
*************************************************/

/* Test one character against an extended class, that is, a class that may
contain values greater than 255. The class data starts with a flag byte. If
XCL_MAP is set in it, a 32-byte bit map for the values below 256 follows. After
that comes a list of items, terminated by XCL_END. Each item is either
XCL_SINGLE followed by one character, or a range given as two characters.

Arguments:
  c           the character
  data        points to the flag byte of the XCLASS data

Returns:      TRUE if character matches, else FALSE
*/

static BOOL
match_xclass(int c, const uschar *data)
{
const int flags = *data;
const BOOL negated = (flags & XCL_NOT) != 0;
const uschar *item = data + 1;          /* First byte after the flags */

/* When a bit map is present, consult it for small character values and then
step over it. A miss in the map is not final, because the list that follows
may contain ranges that start below 256. */

if ((flags & XCL_MAP) != 0)
  {
  if (c < 256 && (item[c/8] & (1 << (c&7))) != 0)
    return !negated;                    /* char found */
  item += 32;
  }

/* Walk the list of single characters and ranges. A single character is
treated as a range whose two ends coincide. */

for (;;)
  {
  int first, last;
  int type = *item++;

  if (type == XCL_END) break;

  GETCHARINC(first, item);
  if (type == XCL_SINGLE)
    {
    last = first;
    }
  else
    {
    GETCHARINC(last, item);
    }

  if (c >= first && c <= last) return !negated;
  }

return negated;                         /* char was not found */
}
#endif
4886
4887
4888
4889
4890 /*************************************************
4891 * Match from current position *
4892 *************************************************/
4893
4894 /* On entry ecode points to the first opcode, and eptr to the first character
4895 in the subject string, while eptrb holds the value of eptr at the start of the
4896 last bracketed group - used for breaking infinite loops matching zero-length
4897 strings. This function is called recursively in many circumstances. Whenever it
4898 returns a negative (error) response, the outer incarnation must also return the
4899 same response.
4900
4901 Performance note: It might be tempting to extract commonly used fields from the
4902 md structure (e.g. utf8, end_subject) into individual variables to improve
4903 performance. Tests using gcc on a SPARC disproved this; in the first case, it
4904 made performance worse.
4905
4906 Arguments:
4907 eptr pointer in subject
4908 ecode position in code
4909 offset_top current top pointer
4910 md pointer to "static" info for the match
4911 ims current /i, /m, and /s options
4912 eptrb pointer to chain of blocks containing eptr at start of
4913 brackets - for testing for empty matches
4914 flags can contain
4915 match_condassert - this is an assertion condition
4916 match_isgroup - this is the start of a bracketed group
4917
4918 Returns: MATCH_MATCH if matched ) these values are >= 0
4919 MATCH_NOMATCH if failed to match )
4920 a negative PCRE_ERROR_xxx value if aborted by an error condition
4921 (e.g. stopped by recursion limit)
4922 */
4923
4924 static int
4925 match(register const uschar *eptr, register const uschar *ecode,
4926 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
4927 int flags)
4928 {
4929 unsigned long int original_ims = ims; /* Save for resetting on ')' */
4930 register int rrc;
4931 eptrblock newptrb;
4932
4933 if (md->match_call_count++ >= md->match_limit) return PCRE_ERROR_MATCHLIMIT;
4934
4935 /* At the start of a bracketed group, add the current subject pointer to the
4936 stack of such pointers, to be re-instated at the end of the group when we hit
4937 the closing ket. When match() is called in other circumstances, we don't add to
4938 the stack. */
4939
4940 if ((flags & match_isgroup) != 0)
4941 {
4942 newptrb.prev = eptrb;
4943 newptrb.saved_eptr = eptr;
4944 eptrb = &newptrb;
4945 }
4946
4947 /* Now start processing the operations. */
4948
4949 for (;;)
4950 {
4951 int op = (int)*ecode;
4952 int min, max, ctype;
4953 register int i;
4954 register int c;
4955 BOOL minimize = FALSE;
4956
4957 /* Opening capturing bracket. If there is space in the offset vector, save
4958 the current subject position in the working slot at the top of the vector. We
4959 mustn't change the current values of the data slot, because they may be set
4960 from a previous iteration of this group, and be referred to by a reference
4961 inside the group.
4962
4963 If the bracket fails to match, we need to restore this value and also the
4964 values of the final offsets, in case they were set by a previous iteration of
4965 the same bracket.
4966
4967 If there isn't enough space in the offset vector, treat this as if it were a
4968 non-capturing bracket. Don't worry about setting the flag for the error case
4969 here; that is handled in the code for KET. */
4970
4971 if (op > OP_BRA)
4972 {
4973 int offset;
4974 int number = op - OP_BRA;
4975
4976 /* For extended extraction brackets (large number), we have to fish out the
4977 number from a dummy opcode at the start. */
4978
4979 if (number > EXTRACT_BASIC_MAX)
4980 number = GET2(ecode, 2+LINK_SIZE);
4981 offset = number << 1;
4982
4983 #ifdef DEBUG
4984 printf("start bracket %d subject=", number);
4985 pchars(eptr, 16, TRUE, md);
4986 printf("\n");
4987 #endif
4988
4989 if (offset < md->offset_max)
4990 {
4991 int save_offset1 = md->offset_vector[offset];
4992 int save_offset2 = md->offset_vector[offset+1];
4993 int save_offset3 = md->offset_vector[md->offset_end - number];
4994 int save_capture_last = md->capture_last;
4995
4996 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
4997 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
4998
4999 do
5000 {
5001 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
5002 eptrb, match_isgroup)) != MATCH_NOMATCH) return rrc;
5003 md->capture_last = save_capture_last;
5004 ecode += GET(ecode, 1);
5005 }
5006 while (*ecode == OP_ALT);
5007
5008 DPRINTF(("bracket %d failed\n", number));
5009
5010 md->offset_vector[offset] = save_offset1;
5011 md->offset_vector[offset+1] = save_offset2;
5012 md->offset_vector[md->offset_end - number] = save_offset3;
5013
5014 return MATCH_NOMATCH;
5015 }
5016
5017 /* Insufficient room for saving captured contents */
5018
5019 else op = OP_BRA;
5020 }
5021
5022 /* Other types of node can be handled by a switch */
5023
5024 switch(op)
5025 {
5026 case OP_BRA: /* Non-capturing bracket: optimized */
5027 DPRINTF(("start bracket 0\n"));
5028 do
5029 {
5030 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
5031 match_isgroup)) != MATCH_NOMATCH) return rrc;
5032 ecode += GET(ecode, 1);
5033 }
5034 while (*ecode == OP_ALT);
5035 DPRINTF(("bracket 0 failed\n"));
5036 return MATCH_NOMATCH;
5037
5038 /* Conditional group: compilation checked that there are no more than
5039 two branches. If the condition is false, skipping the first branch takes us
5040 past the end if there is only one branch, but that's OK because that is
5041 exactly what going to the ket would do. */
5042
5043 case OP_COND:
5044 if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
5045 {
5046 int offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
5047 BOOL condition = (offset == CREF_RECURSE * 2)?
5048 (md->recursive != NULL) :
5049 (offset < offset_top && md->offset_vector[offset] >= 0);
5050 return match(eptr, ecode + (condition?
5051 (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
5052 offset_top, md, ims, eptrb, match_isgroup);
5053 }
5054
5055 /* The condition is an assertion. Call match() to evaluate it - setting
5056 the final argument TRUE causes it to stop at the end of an assertion. */
5057
5058 else
5059 {
5060 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
5061 match_condassert | match_isgroup)) == MATCH_MATCH)
5062 {
5063 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
5064 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
5065 }
5066 else if (rrc != MATCH_NOMATCH) return rrc;
5067 else ecode += GET(ecode, 1);
5068 return match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
5069 match_isgroup);
5070 }
5071 /* Control never reaches here */
5072
5073 /* Skip over conditional reference or large extraction number data if
5074 encountered. */
5075
5076 case OP_CREF:
5077 case OP_BRANUMBER:
5078 ecode += 3;
5079 break;
5080
5081 /* End of the pattern. If we are in a recursion, we should restore the
5082 offsets appropriately and continue from after the call. */
5083
5084 case OP_END:
5085 if (md->recursive != NULL && md->recursive->group_num == 0)
5086 {
5087 recursion_info *rec = md->recursive;
5088 DPRINTF(("Hit the end in a (?0) recursion\n"));
5089 md->recursive = rec->prev;
5090 memmove(md->offset_vector, rec->offset_save,
5091 rec->saved_max * sizeof(int));
5092 md->start_match = rec->save_start;
5093 ims = original_ims;
5094 ecode = rec->after_call;
5095 break;
5096 }
5097
5098 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
5099 string - backtracking will then try other alternatives, if any. */
5100
5101 if (md->notempty && eptr == md->start_match) return MATCH_NOMATCH;
5102 md->end_match_ptr = eptr; /* Record where we ended */
5103 md->end_offset_top = offset_top; /* and how many extracts were taken */
5104 return MATCH_MATCH;
5105
5106 /* Change option settings */
5107
5108 case OP_OPT:
5109 ims = ecode[1];
5110 ecode += 2;
5111 DPRINTF(("ims set to %02lx\n", ims));
5112 break;
5113
5114 /* Assertion brackets. Check the alternative branches in turn - the
5115 matching won't pass the KET for an assertion. If any one branch matches,
5116 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
5117 start of each branch to move the current point backwards, so the code at
5118 this level is identical to the lookahead case. */
5119
5120 case OP_ASSERT:
5121 case OP_ASSERTBACK:
5122 do
5123 {
5124 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
5125 match_isgroup)) == MATCH_MATCH) break;
5126 if (rrc != MATCH_NOMATCH) return rrc;
5127 ecode += GET(ecode, 1);
5128 }
5129 while (*ecode == OP_ALT);
5130 if (*ecode == OP_KET) return MATCH_NOMATCH;
5131
5132 /* If checking an assertion for a condition, return MATCH_MATCH. */
5133
5134 if ((flags & match_condassert) != 0) return MATCH_MATCH;
5135
5136 /* Continue from after the assertion, updating the offsets high water
5137 mark, since extracts may have been taken during the assertion. */
5138
5139 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
5140 ecode += 1 + LINK_SIZE;
5141 offset_top = md->end_offset_top;
5142 continue;
5143
5144 /* Negative assertion: all branches must fail to match */
5145
5146 case OP_ASSERT_NOT:
5147 case OP_ASSERTBACK_NOT:
5148 do
5149 {
5150 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
5151 match_isgroup)) == MATCH_MATCH) return MATCH_NOMATCH;
5152 if (rrc != MATCH_NOMATCH) return rrc;
5153 ecode += GET(ecode,1);
5154 }
5155 while (*ecode == OP_ALT);
5156
5157 if ((flags & match_condassert) != 0) return MATCH_MATCH;
5158
5159 ecode += 1 + LINK_SIZE;
5160 continue;
5161
5162 /* Move the subject pointer back. This occurs only at the start of
5163 each branch of a lookbehind assertion. If we are too close to the start to
5164 move back, this match function fails. When working with UTF-8 we move
5165 back a number of characters, not bytes. */
5166
5167 case OP_REVERSE:
5168 #ifdef SUPPORT_UTF8
5169 c = GET(ecode,1);
5170 for (i = 0; i < c; i++)
5171 {
5172 eptr--;
5173 BACKCHAR(eptr)
5174 }
5175 #else
5176 eptr -= GET(ecode,1);
5177 #endif
5178
5179 if (eptr < md->start_subject) return MATCH_NOMATCH;
5180 ecode += 1 + LINK_SIZE;
5181 break;
5182
5183 /* The callout item calls an external function, if one is provided, passing
5184 details of the match so far. This is mainly for debugging, though the
5185 function is able to force a failure. */
5186
5187 case OP_CALLOUT:
5188 if (pcre_callout != NULL)
5189 {
5190 pcre_callout_block cb;
5191 cb.version = 0; /* Version 0 of the callout block */
5192 cb.callout_number = ecode[1];
5193 cb.offset_vector = md->offset_vector;
5194 cb.subject = (const char *)md->start_subject;
5195 cb.subject_length = md->end_subject - md->start_subject;
5196 cb.start_match = md->start_match - md->start_subject;
5197 cb.current_position = eptr - md->start_subject;
5198 cb.capture_top = offset_top/2;
5199 cb.capture_last = md->capture_last;
5200 cb.callout_data = md->callout_data;
5201 if ((rrc = (*pcre_callout)(&cb)) > 0) return MATCH_NOMATCH;
5202 if (rrc < 0) return rrc;
5203 }
5204 ecode += 2;
5205 break;
5206
5207 /* Recursion either matches the current regex, or some subexpression. The
5208 offset data is the offset to the starting bracket from the start of the
5209 whole pattern. However, it is possible that a BRAZERO was inserted before
5210 this bracket after we took the offset - we just skip it if encountered.
5211
5212 If there are any capturing brackets started but not finished, we have to
5213 save their starting points and reinstate them after the recursion. However,
5214 we don't know how many such there are (offset_top records the completed
5215 total) so we just have to save all the potential data. There may be up to
5216 65535 such values, which is too large to put on the stack, but using malloc
5217 for small numbers seems expensive. As a compromise, the stack is used when
5218 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
5219 is used. A problem is what to do if the malloc fails ... there is no way of
5220 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
5221 values on the stack, and accept that the rest may be wrong.
5222
5223 There are also other values that have to be saved. We use a chained
5224 sequence of blocks that actually live on the stack. Thanks to Robin Houston
5225 for the original version of this logic. */
5226
5227 case OP_RECURSE:
5228 {
5229 int stacksave[REC_STACK_SAVE_MAX];
5230 recursion_info new_recursive;
5231 const uschar *callpat = md->start_code + GET(ecode, 1);
5232
5233 if (*callpat == OP_BRAZERO) callpat++;
5234
5235 new_recursive.group_num = *callpat - OP_BRA;
5236
5237 /* For extended extraction brackets (large number), we have to fish out
5238 the number from a dummy opcode at the start. */
5239
5240 if (new_recursive.group_num > EXTRACT_BASIC_MAX)
5241 new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
5242
5243 /* Add to "recursing stack" */
5244
5245 new_recursive.prev = md->recursive;
5246 md->recursive = &new_recursive;
5247
5248 /* Find where to continue from afterwards */
5249
5250 ecode += 1 + LINK_SIZE;
5251 new_recursive.after_call = ecode;
5252
5253 /* Now save the offset data. */
5254
5255 new_recursive.saved_max = md->offset_end;
5256 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
5257 new_recursive.offset_save = stacksave;
5258 else
5259 {
5260 new_recursive.offset_save =
5261 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
5262 if (new_recursive.offset_save == NULL) return PCRE_ERROR_NOMEMORY;
5263 }
5264
5265 memcpy(new_recursive.offset_save, md->offset_vector,
5266 new_recursive.saved_max * sizeof(int));
5267 new_recursive.save_start = md->start_match;
5268 md->start_match = eptr;
5269
5270 /* OK, now we can do the recursion. For each top-level alternative we
5271 restore the offset and recursion data. */
5272
5273 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
5274 do
5275 {
5276 if ((rrc = match(eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
5277 eptrb, match_isgroup)) == MATCH_MATCH)
5278 {
5279 md->recursive = new_recursive.prev;
5280 if (new_recursive.offset_save != stacksave)
5281 (pcre_free)(new_recursive.offset_save);
5282 return MATCH_MATCH;
5283 }
5284 else if (rrc != MATCH_NOMATCH) return rrc;
5285
5286 md->recursive = &new_recursive;
5287 memcpy(md->offset_vector, new_recursive.offset_save,
5288 new_recursive.saved_max * sizeof(int));
5289 callpat += GET(callpat, 1);
5290 }
5291 while (*callpat == OP_ALT);
5292
5293 DPRINTF(("Recursion didn't match\n"));
5294 md->recursive = new_recursive.prev;
5295 if (new_recursive.offset_save != stacksave)
5296 (pcre_free)(new_recursive.offset_save);
5297 return MATCH_NOMATCH;
5298 }
5299 /* Control never reaches here */
5300
5301 /* "Once" brackets are like assertion brackets except that after a match,
5302 the point in the subject string is not moved back. Thus there can never be
5303 a move back into the brackets. Friedl calls these "atomic" subpatterns.
5304 Check the alternative branches in turn - the matching won't pass the KET
5305 for this kind of subpattern. If any one branch matches, we carry on as at
5306 the end of a normal bracket, leaving the subject pointer. */
5307
5308 case OP_ONCE:
5309 {
5310 const uschar *prev = ecode;
5311 const uschar *saved_eptr = eptr;
5312
5313 do
5314 {
5315 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
5316 eptrb, match_isgroup)) == MATCH_MATCH) break;
5317 if (rrc != MATCH_NOMATCH) return rrc;
5318 ecode += GET(ecode,1);
5319 }
5320 while (*ecode == OP_ALT);
5321
5322 /* If we hit the end of the group (which could be repeated), fail */
5323
5324 if (*ecode != OP_ONCE && *ecode != OP_ALT) return MATCH_NOMATCH;
5325
5326 /* Continue as from after the assertion, updating the offsets high water
5327 mark, since extracts may have been taken. */
5328
5329 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
5330
5331 offset_top = md->end_offset_top;
5332 eptr = md->end_match_ptr;
5333
5334 /* For a non-repeating ket, just continue at this level. This also
5335 happens for a repeating ket if no characters were matched in the group.
5336 This is the forcible breaking of infinite loops as implemented in Perl
5337 5.005. If there is an options reset, it will get obeyed in the normal
5338 course of events. */
5339
5340 if (*ecode == OP_KET || eptr == saved_eptr)
5341 {
5342 ecode += 1+LINK_SIZE;
5343 break;
5344 }
5345
5346 /* The repeating kets try the rest of the pattern or restart from the
5347 preceding bracket, in the appropriate order. We need to reset any options
5348 that changed within the bracket before re-running it, so check the next
5349 opcode. */
5350
5351 if (ecode[1+LINK_SIZE] == OP_OPT)
5352 {
5353 ims = (ims & ~PCRE_IMS) | ecode[4];
5354 DPRINTF(("ims set to %02lx at group repeat\n", ims));
5355 }
5356
5357 if (*ecode == OP_KETRMIN)
5358 {
5359 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
5360 eptrb, 0)) != MATCH_NOMATCH) return rrc;
5361 if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
5362 match_isgroup)) != MATCH_NOMATCH) return rrc;
5363 }
5364 else /* OP_KETRMAX */
5365 {
5366 if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
5367 match_isgroup)) != MATCH_NOMATCH) return rrc;
5368 if ((rrc = match(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb,
5369 0)) != MATCH_NOMATCH) return rrc;
5370 }
5371 }
5372 return MATCH_NOMATCH;
5373
5374 /* An alternation is the end of a branch; scan along to find the end of the
5375 bracketed group and go to there. */
5376
5377 case OP_ALT:
5378 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
5379 break;
5380
5381 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
5382 that it may occur zero times. It may repeat infinitely, or not at all -
5383 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
5384 repeat limits are compiled as a number of copies, with the optional ones
5385 preceded by BRAZERO or BRAMINZERO. */
5386
5387 case OP_BRAZERO:
5388 {
5389 const uschar *next = ecode+1;
5390 if ((rrc = match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
5391 != MATCH_NOMATCH) return rrc;
5392 do next += GET(next,1); while (*next == OP_ALT);
5393 ecode = next + 1+LINK_SIZE;
5394 }
5395 break;
5396
5397 case OP_BRAMINZERO:
5398 {
5399 const uschar *next = ecode+1;
5400 do next += GET(next,1); while (*next == OP_ALT);
5401 if ((rrc = match(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
5402 match_isgroup)) != MATCH_NOMATCH) return rrc;
5403 ecode++;
5404 }
5405 break;
5406
5407 /* End of a group, repeated or non-repeating. If we are at the end of
5408 an assertion "group", stop matching and return MATCH_MATCH, but record the
5409 current high water mark for use by positive assertions. Do this also
5410 for the "once" (not-backing-up) groups. */
5411
5412 case OP_KET:
5413 case OP_KETRMIN:
5414 case OP_KETRMAX:
5415 {
5416 const uschar *prev = ecode - GET(ecode, 1);
5417 const uschar *saved_eptr = eptrb->saved_eptr;
5418
5419 eptrb = eptrb->prev; /* Back up the stack of bracket start pointers */
5420
5421 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
5422 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
5423 *prev == OP_ONCE)
5424 {
5425 md->end_match_ptr = eptr; /* For ONCE */
5426 md->end_offset_top = offset_top;
5427 return MATCH_MATCH;
5428 }
5429
5430 /* In all other cases except a conditional group we have to check the
5431 group number back at the start and if necessary complete handling an
5432 extraction by setting the offsets and bumping the high water mark. */
5433
5434 if (*prev != OP_COND)
5435 {
5436 int offset;
5437 int number = *prev - OP_BRA;
5438
5439 /* For extended extraction brackets (large number), we have to fish out
5440 the number from a dummy opcode at the start. */
5441
5442 if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
5443 offset = number << 1;
5444
5445 #ifdef DEBUG
5446 printf("end bracket %d", number);
5447 printf("\n");
5448 #endif
5449
5450 /* Test for a numbered group. This includes groups called as a result
5451 of recursion. Note that whole-pattern recursion is coded as a recurse
5452 into group 0, so it won't be picked up here. Instead, we catch it when
5453 the OP_END is reached. */
5454
5455 if (number > 0)
5456 {
5457 md->capture_last = number;
5458 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
5459 {
5460 md->offset_vector[offset] =
5461 md->offset_vector[md->offset_end - number];
5462 md->offset_vector[offset+1] = eptr - md->start_subject;
5463 if (offset_top <= offset) offset_top = offset + 2;
5464 }
5465
5466 /* Handle a recursively called group. Restore the offsets
5467 appropriately and continue from after the call. */
5468
5469 if (md->recursive != NULL && md->recursive->group_num == number)
5470 {
5471 recursion_info *rec = md->recursive;
5472 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
5473 md->recursive = rec->prev;
5474 md->start_match = rec->save_start;
5475 memcpy(md->offset_vector, rec->offset_save,
5476 rec->saved_max * sizeof(int));
5477 ecode = rec->after_call;
5478 ims = original_ims;
5479 break;
5480 }
5481 }
5482 }
5483
5484 /* Reset the value of the ims flags, in case they got changed during
5485 the group. */
5486
5487 ims = original_ims;
5488 DPRINTF(("ims reset to %02lx\n", ims));
5489
5490 /* For a non-repeating ket, just continue at this level. This also
5491 happens for a repeating ket if no characters were matched in the group.
5492 This is the forcible breaking of infinite loops as implemented in Perl
5493 5.005. If there is an options reset, it will get obeyed in the normal
5494 course of events. */
5495
5496 if (*ecode == OP_KET || eptr == saved_eptr)
5497 {
5498 ecode += 1 + LINK_SIZE;
5499 break;
5500 }
5501
5502 /* The repeating kets try the rest of the pattern or restart from the
5503 preceding bracket, in the appropriate order. */
5504
5505 if (*ecode == OP_KETRMIN)
5506 {
5507 if ((rrc = match(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb,
5508 0)) != MATCH_NOMATCH) return rrc;
5509 if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
5510 match_isgroup)) != MATCH_NOMATCH) return rrc;
5511 }
5512 else /* OP_KETRMAX */
5513 {
5514 if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
5515 match_isgroup)) != MATCH_NOMATCH) return rrc;
5516 if ((rrc = match(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb,
5517 0)) != MATCH_NOMATCH) return rrc;
5518 }
5519 }
5520 return MATCH_NOMATCH;
5521
5522 /* Start of subject unless notbol, or after internal newline if multiline */
5523
5524 case OP_CIRC:
5525 if (md->notbol && eptr == md->start_subject) return MATCH_NOMATCH;
5526 if ((ims & PCRE_MULTILINE) != 0)
5527 {
5528 if (eptr != md->start_subject && eptr[-1] != NEWLINE)
5529 return MATCH_NOMATCH;
5530 ecode++;
5531 break;
5532 }
5533 /* ... else fall through */
5534
5535 /* Start of subject assertion */
5536
5537 case OP_SOD:
5538 if (eptr != md->start_subject) return MATCH_NOMATCH;
5539 ecode++;
5540 break;
5541
5542 /* Start of match assertion */
5543
5544 case OP_SOM:
5545 if (eptr != md->start_subject + md->start_offset) return MATCH_NOMATCH;
5546 ecode++;
5547 break;
5548
5549 /* Assert before internal newline if multiline, or before a terminating
5550 newline unless endonly is set, else end of subject unless noteol is set. */
5551
5552 case OP_DOLL:
5553 if ((ims & PCRE_MULTILINE) != 0)
5554 {
5555 if (eptr < md->end_subject)
5556 { if (*eptr != NEWLINE) return MATCH_NOMATCH; }
5557 else
5558 { if (md->noteol) return MATCH_NOMATCH; }
5559 ecode++;
5560 break;
5561 }
5562 else
5563 {
5564 if (md->noteol) return MATCH_NOMATCH;
5565 if (!md->endonly)
5566 {
5567 if (eptr < md->end_subject - 1 ||
5568 (eptr == md->end_subject - 1 && *eptr != NEWLINE))
5569 return MATCH_NOMATCH;
5570 ecode++;
5571 break;
5572 }
5573 }
5574 /* ... else fall through */
5575
5576 /* End of subject assertion (\z) */
5577
5578 case OP_EOD:
5579 if (eptr < md->end_subject) return MATCH_NOMATCH;
5580 ecode++;
5581 break;
5582
5583 /* End of subject or ending \n assertion (\Z) */
5584
5585 case OP_EODN:
5586 if (eptr < md->end_subject - 1 ||
5587 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return MATCH_NOMATCH;
5588 ecode++;
5589 break;
5590
5591 /* Word boundary assertions */
5592
5593 case OP_NOT_WORD_BOUNDARY:
5594 case OP_WORD_BOUNDARY:
5595 {
5596 BOOL prev_is_word, cur_is_word;
5597
5598 /* Find out if the previous and current characters are "word" characters.
5599 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
5600 be "non-word" characters. */
5601
5602 #if SUPPORT_UTF8
5603 if (md->utf8)
5604 {
5605 if (eptr == md->start_subject) prev_is_word = FALSE; else
5606 {
5607 const uschar *lastptr = eptr - 1;
5608 while((*lastptr & 0xc0) == 0x80) lastptr--;
5609 GETCHAR(c, lastptr);
5610 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
5611 }
5612 if (eptr >= md->end_subject) cur_is_word = FALSE; else
5613 {
5614 GETCHAR(c, eptr);
5615 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
5616 }
5617 }
5618 else
5619 #endif
5620
5621 /* More streamlined when not in UTF-8 mode */
5622
5623 {
5624 prev_is_word = (eptr != md->start_subject) &&
5625 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
5626 cur_is_word = (eptr < md->end_subject) &&
5627 ((md->ctypes[*eptr] & ctype_word) != 0);
5628 }
5629
5630 /* Now see if the situation is what we want */
5631
5632 if ((*ecode++ == OP_WORD_BOUNDARY)?
5633 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
5634 return MATCH_NOMATCH;
5635 }
5636 break;
5637
5638 /* Match a single character type; inline for speed */
5639
5640 case OP_ANY:
5641 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
5642 return MATCH_NOMATCH;
5643 if (eptr++ >= md->end_subject) return MATCH_NOMATCH;
5644 #ifdef SUPPORT_UTF8
5645 if (md->utf8)
5646 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5647 #endif
5648 ecode++;
5649 break;
5650
5651 /* Match a single byte, even in UTF-8 mode. This opcode really does match
5652 any byte, even newline, independent of the setting of PCRE_DOTALL. */
5653
5654 case OP_ANYBYTE:
5655 if (eptr++ >= md->end_subject) return MATCH_NOMATCH;
5656 ecode++;
5657 break;
5658
5659 case OP_NOT_DIGIT:
5660 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5661 GETCHARINCTEST(c, eptr);
5662 if (
5663 #ifdef SUPPORT_UTF8
5664 c < 256 &&
5665 #endif
5666 (md->ctypes[c] & ctype_digit) != 0
5667 )
5668 return MATCH_NOMATCH;
5669 ecode++;
5670 break;
5671
5672 case OP_DIGIT:
5673 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5674 GETCHARINCTEST(c, eptr);
5675 if (
5676 #ifdef SUPPORT_UTF8
5677 c >= 256 ||
5678 #endif
5679 (md->ctypes[c] & ctype_digit) == 0
5680 )
5681 return MATCH_NOMATCH;
5682 ecode++;
5683 break;
5684
5685 case OP_NOT_WHITESPACE:
5686 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5687 GETCHARINCTEST(c, eptr);
5688 if (
5689 #ifdef SUPPORT_UTF8
5690 c < 256 &&
5691 #endif
5692 (md->ctypes[c] & ctype_space) != 0
5693 )
5694 return MATCH_NOMATCH;
5695 ecode++;
5696 break;
5697
5698 case OP_WHITESPACE:
5699 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5700 GETCHARINCTEST(c, eptr);
5701 if (
5702 #ifdef SUPPORT_UTF8
5703 c >= 256 ||
5704 #endif
5705 (md->ctypes[c] & ctype_space) == 0
5706 )
5707 return MATCH_NOMATCH;
5708 ecode++;
5709 break;
5710
5711 case OP_NOT_WORDCHAR:
5712 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5713 GETCHARINCTEST(c, eptr);
5714 if (
5715 #ifdef SUPPORT_UTF8
5716 c < 256 &&
5717 #endif
5718 (md->ctypes[c] & ctype_word) != 0
5719 )
5720 return MATCH_NOMATCH;
5721 ecode++;
5722 break;
5723
5724 case OP_WORDCHAR:
5725 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5726 GETCHARINCTEST(c, eptr);
5727 if (
5728 #ifdef SUPPORT_UTF8
5729 c >= 256 ||
5730 #endif
5731 (md->ctypes[c] & ctype_word) == 0
5732 )
5733 return MATCH_NOMATCH;
5734 ecode++;
5735 break;
5736
5737 /* Match a back reference, possibly repeatedly. Look past the end of the
5738 item to see if there is repeat information following. The code is similar
5739 to that for character classes, but repeated for efficiency. Then obey
5740 similar code to character type repeats - written out again for speed.
5741 However, if the referenced string is the empty string, always treat
5742 it as matched, any number of times (otherwise there could be infinite
5743 loops). */
5744
5745 case OP_REF:
5746 {
5747 int length;
5748 int offset = GET2(ecode, 1) << 1; /* Doubled ref number */
5749 ecode += 3; /* Advance past item */
5750
5751 /* If the reference is unset, set the length to be longer than the amount
5752 of subject left; this ensures that every attempt at a match fails. We
5753 can't just fail here, because of the possibility of quantifiers with zero
5754 minima. */
5755
5756 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
5757 md->end_subject - eptr + 1 :
5758 md->offset_vector[offset+1] - md->offset_vector[offset];
5759
5760 /* Set up for repetition, or handle the non-repeated case */
5761
5762 switch (*ecode)
5763 {
5764 case OP_CRSTAR:
5765 case OP_CRMINSTAR:
5766 case OP_CRPLUS:
5767 case OP_CRMINPLUS:
5768 case OP_CRQUERY:
5769 case OP_CRMINQUERY:
5770 c = *ecode++ - OP_CRSTAR;
5771 minimize = (c & 1) != 0;
5772 min = rep_min[c]; /* Pick up values from tables; */
5773 max = rep_max[c]; /* zero for max => infinity */
5774 if (max == 0) max = INT_MAX;
5775 break;
5776
5777 case OP_CRRANGE:
5778 case OP_CRMINRANGE:
5779 minimize = (*ecode == OP_CRMINRANGE);
5780 min = GET2(ecode, 1);
5781 max = GET2(ecode, 3);
5782 if (max == 0) max = INT_MAX;
5783 ecode += 5;
5784 break;
5785
5786 default: /* No repeat follows */
5787 if (!match_ref(offset, eptr, length, md, ims)) return MATCH_NOMATCH;
5788 eptr += length;
5789 continue; /* With the main loop */
5790 }
5791
5792 /* If the length of the reference is zero, just continue with the
5793 main loop. */
5794
5795 if (length == 0) continue;
5796
5797 /* First, ensure the minimum number of matches are present. We get back
5798 the length of the reference string explicitly rather than passing the
5799 address of eptr, so that eptr can be a register variable. */
5800
5801 for (i = 1; i <= min; i++)
5802 {
5803 if (!match_ref(offset, eptr, length, md, ims)) return MATCH_NOMATCH;
5804 eptr += length;
5805 }
5806
5807 /* If min = max, continue at the same level without recursion.
5808 They are not both allowed to be zero. */
5809
5810 if (min == max) continue;
5811
5812 /* If minimizing, keep trying and advancing the pointer */
5813
5814 if (minimize)
5815 {
5816 for (i = min;; i++)
5817 {
5818 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
5819 MATCH_NOMATCH) return rrc;
5820 if (i >= max || !match_ref(offset, eptr, length, md, ims))
5821 return MATCH_NOMATCH;
5822 eptr += length;
5823 }
5824 /* Control never gets here */
5825 }
5826
5827 /* If maximizing, find the longest string and work backwards */
5828
5829 else
5830 {
5831 const uschar *pp = eptr;
5832 for (i = min; i < max; i++)
5833 {
5834 if (!match_ref(offset, eptr, length, md, ims)) break;
5835 eptr += length;
5836 }
5837 while (eptr >= pp)
5838 {
5839 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
5840 MATCH_NOMATCH) return rrc;
5841 eptr -= length;
5842 }
5843 return MATCH_NOMATCH;
5844 }
5845 }
5846 /* Control never gets here */
5847
5848
5849
5850 /* Match a bit-mapped character class, possibly repeatedly. This op code is
5851 used when all the characters in the class have values in the range 0-255.
5852 The only difference between OP_CLASS and OP_NCLASS occurs when a data
5853 character outside the range is encountered.
5854
5855 First, look past the end of the item to see if there is repeat information
5856 following. Then obey similar code to character type repeats - written out
5857 again for speed. */
5858
5859 case OP_NCLASS:
5860 case OP_CLASS:
5861 {
5862 const uschar *data = ecode + 1; /* Save for matching */
5863 ecode += 33; /* Advance past the item */
5864
5865 switch (*ecode)
5866 {
5867 case OP_CRSTAR:
5868 case OP_CRMINSTAR:
5869 case OP_CRPLUS:
5870 case OP_CRMINPLUS:
5871 case OP_CRQUERY:
5872 case OP_CRMINQUERY:
5873 c = *ecode++ - OP_CRSTAR;
5874 minimize = (c & 1) != 0;
5875 min = rep_min[c]; /* Pick up values from tables; */
5876 max = rep_max[c]; /* zero for max => infinity */
5877 if (max == 0) max = INT_MAX;
5878 break;
5879
5880 case OP_CRRANGE:
5881 case OP_CRMINRANGE:
5882 minimize = (*ecode == OP_CRMINRANGE);
5883 min = GET2(ecode, 1);
5884 max = GET2(ecode, 3);
5885 if (max == 0) max = INT_MAX;
5886 ecode += 5;
5887 break;
5888
5889 default: /* No repeat follows */
5890 min = max = 1;
5891 break;
5892 }
5893
5894 /* First, ensure the minimum number of matches are present. */
5895
5896 #ifdef SUPPORT_UTF8
5897 /* UTF-8 mode */
5898 if (md->utf8)
5899 {
5900 for (i = 1; i <= min; i++)
5901 {
5902 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5903 GETCHARINC(c, eptr);
5904 if (c > 255)
5905 {
5906 if (op == OP_CLASS) return MATCH_NOMATCH;
5907 }
5908 else
5909 {
5910 if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
5911 }
5912 }
5913 }
5914 else
5915 #endif
5916 /* Not UTF-8 mode */
5917 {
5918 for (i = 1; i <= min; i++)
5919 {
5920 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5921 c = *eptr++;
5922 if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
5923 }
5924 }
5925
5926 /* If max == min we can continue with the main loop without the
5927 need to recurse. */
5928
5929 if (min == max) continue;
5930
5931 /* If minimizing, keep testing the rest of the expression and advancing
5932 the pointer while it matches the class. */
5933
5934 if (minimize)
5935 {
5936 #ifdef SUPPORT_UTF8
5937 /* UTF-8 mode */
5938 if (md->utf8)
5939 {
5940 for (i = min;; i++)
5941 {
5942 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
5943 MATCH_NOMATCH) return rrc;
5944 if (i >= max || eptr >= md->end_subject) return MATCH_NOMATCH;
5945 GETCHARINC(c, eptr);
5946 if (c > 255)
5947 {
5948 if (op == OP_CLASS) return MATCH_NOMATCH;
5949 }
5950 else
5951 {
5952 if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
5953 }
5954 }
5955 }
5956 else
5957 #endif
5958 /* Not UTF-8 mode */
5959 {
5960 for (i = min;; i++)
5961 {
5962 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
5963 MATCH_NOMATCH) return rrc;
5964 if (i >= max || eptr >= md->end_subject) return MATCH_NOMATCH;
5965 c = *eptr++;
5966 if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
5967 }
5968 }
5969 /* Control never gets here */
5970 }
5971
5972 /* If maximizing, find the longest possible run, then work backwards. */
5973
5974 else
5975 {
5976 const uschar *pp = eptr;
5977
5978 #ifdef SUPPORT_UTF8
5979 /* UTF-8 mode */
5980 if (md->utf8)
5981 {
5982 for (i = min; i < max; i++)
5983 {
5984 int len = 1;
5985 if (eptr >= md->end_subject) break;
5986 GETCHARLEN(c, eptr, len);
5987 if (c > 255)
5988 {
5989 if (op == OP_CLASS) break;
5990 }
5991 else
5992 {
5993 if ((data[c/8] & (1 << (c&7))) == 0) break;
5994 }
5995 eptr += len;
5996 }
5997 while (eptr >= pp)
5998 {
5999 if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
6000 MATCH_NOMATCH) return rrc;
6001 BACKCHAR(eptr)
6002 }
6003 }
6004 else
6005 #endif
6006 /* Not UTF-8 mode */
6007 {
6008 for (i = min; i < max; i++)
6009 {
6010 if (eptr >= md->end_subject) break;
6011 c = *eptr;
6012 if ((data[c/8] & (1 << (c&7))) == 0) break;
6013 eptr++;
6014 }
6015 while (eptr >= pp)
6016 {
6017 if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
6018 MATCH_NOMATCH) return rrc;
6019 }
6020 }
6021
6022 return MATCH_NOMATCH;
6023 }
6024 }
6025 /* Control never gets here */
6026
6027
6028 /* Match an extended character class. This opcode is encountered only
6029 in UTF-8 mode, because that's the only time it is compiled. */
6030
6031 #ifdef SUPPORT_UTF8
6032 case OP_XCLASS:
6033 {
6034 const uschar *data = ecode + 1 + LINK_SIZE; /* Save for matching */
6035 ecode += GET(ecode, 1); /* Advance past the item */
6036
6037 switch (*ecode)
6038 {
6039 case OP_CRSTAR:
6040 case OP_CRMINSTAR:
6041 case OP_CRPLUS:
6042 case OP_CRMINPLUS:
6043 case OP_CRQUERY:
6044 case OP_CRMINQUERY:
6045 c = *ecode++ - OP_CRSTAR;
6046 minimize = (c & 1) != 0;
6047 min = rep_min[c]; /* Pick up values from tables; */
6048 max = rep_max[c]; /* zero for max => infinity */
6049 if (max == 0) max = INT_MAX;
6050 break;
6051
6052 case OP_CRRANGE:
6053 case OP_CRMINRANGE:
6054 minimize = (*ecode == OP_CRMINRANGE);
6055 min = GET2(ecode, 1);
6056 max = GET2(ecode, 3);
6057 if (max == 0) max = INT_MAX;
6058 ecode += 5;
6059 break;
6060
6061 default: /* No repeat follows */
6062 min = max = 1;
6063 break;
6064 }
6065
6066 /* First, ensure the minimum number of matches are present. */
6067
6068 for (i = 1; i <= min; i++)
6069 {
6070 if (eptr >= md->end_subject) return MATCH_NOMATCH;
6071 GETCHARINC(c, eptr);
6072 if (!match_xclass(c, data)) return MATCH_NOMATCH;
6073 }
6074
6075 /* If max == min we can continue with the main loop without the
6076 need to recurse. */
6077
6078 if (min == max) continue;
6079
6080 /* If minimizing, keep testing the rest of the expression and advancing
6081 the pointer while it matches the class. */
6082
6083 if (minimize)
6084 {
6085 for (i = min;; i++)
6086 {
6087 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6088 MATCH_NOMATCH) return rrc;
6089 if (i >= max || eptr >= md->end_subject) return MATCH_NOMATCH;
6090 GETCHARINC(c, eptr);
6091 if (!match_xclass(c, data)) return MATCH_NOMATCH;
6092 }
6093 /* Control never gets here */
6094 }
6095
6096 /* If maximizing, find the longest possible run, then work backwards. */
6097
6098 else
6099 {
6100 const uschar *pp = eptr;
6101 for (i = min; i < max; i++)
6102 {
6103 int len = 1;
6104 if (eptr >= md->end_subject) break;
6105 GETCHARLEN(c, eptr, len);
6106 if (!match_xclass(c, data)) break;
6107 eptr += len;
6108 }
6109 while (eptr >= pp)
6110 {
6111 if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
6112 MATCH_NOMATCH) return rrc;
6113 BACKCHAR(eptr)
6114 }
6115 return MATCH_NOMATCH;
6116 }
6117
6118 /* Control never gets here */
6119 }
6120 #endif /* End of XCLASS */
6121
6122 /* Match a run of characters */
6123
6124 case OP_CHARS:
6125 {
6126 register int length = ecode[1];
6127 ecode += 2;
6128
6129 #ifdef DEBUG /* Sigh. Some compilers never learn. */
6130 if (eptr >= md->end_subject)
6131 printf("matching subject <null> against pattern ");
6132 else
6133 {
6134 printf("matching subject ");
6135 pchars(eptr, length, TRUE, md);
6136 printf(" against pattern ");
6137 }
6138 pchars(ecode, length, FALSE, md);
6139 printf("\n");
6140 #endif
6141
6142 if (length > md->end_subject - eptr) return MATCH_NOMATCH;
6143 if ((ims & PCRE_CASELESS) != 0)
6144 {
6145 while (length-- > 0)
6146 if (md->lcc[*ecode++] != md->lcc[*eptr++])
6147 return MATCH_NOMATCH;
6148 }
6149 else
6150 {
6151 while (length-- > 0) if (*ecode++ != *eptr++) return MATCH_NOMATCH;
6152 }
6153 }
6154 break;
6155
6156 /* Match a single character repeatedly; different opcodes share code. */
6157
6158 case OP_EXACT:
6159 min = max = GET2(ecode, 1);
6160 ecode += 3;
6161 goto REPEATCHAR;
6162
6163 case OP_UPTO:
6164 case OP_MINUPTO:
6165 min = 0;
6166 max = GET2(ecode, 1);
6167 minimize = *ecode == OP_MINUPTO;
6168 ecode += 3;
6169 goto REPEATCHAR;
6170
6171 case OP_STAR:
6172 case OP_MINSTAR:
6173 case OP_PLUS:
6174 case OP_MINPLUS:
6175 case OP_QUERY:
6176 case OP_MINQUERY:
6177 c = *ecode++ - OP_STAR;
6178 minimize = (c & 1) != 0;
6179 min = rep_min[c]; /* Pick up values from tables; */
6180 max = rep_max[c]; /* zero for max => infinity */
6181 if (max == 0) max = INT_MAX;
6182
6183 /* Common code for all repeated single-character matches. We can give
6184 up quickly if there are fewer than the minimum number of characters left in
6185 the subject. */
6186
6187 REPEATCHAR:
6188 #ifdef SUPPORT_UTF8
6189 if (md->utf8)
6190 {
6191 int len = 1;
6192 const uschar *charptr = ecode;
6193 GETCHARLEN(c, ecode, len);
6194 if (min * len > md->end_subject - eptr) return MATCH_NOMATCH;
6195 ecode += len;
6196
6197 /* Handle multibyte character matching specially here. There is no
6198 support for any kind of casing for multibyte characters. */
6199
6200 if (len > 1)
6201 {
6202 for (i = 1; i <= min; i++)
6203 {
6204 if (memcmp(eptr, charptr, len) != 0) return MATCH_NOMATCH;
6205 eptr += len;
6206 }
6207
6208 if (min == max) continue;
6209
6210 if (minimize)
6211 {
6212 for (i = min;; i++)
6213 {
6214 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6215 MATCH_NOMATCH) return rrc;
6216 if (i >= max ||
6217 eptr >= md->end_subject ||
6218 memcmp(eptr, charptr, len) != 0)
6219 return MATCH_NOMATCH;
6220 eptr += len;
6221 }
6222 /* Control never gets here */
6223 }
6224 else
6225 {
6226 const uschar *pp = eptr;
6227 for (i = min; i < max; i++)
6228 {
6229 if (eptr > md->end_subject - len ||
6230 memcmp(eptr, charptr, len) != 0)
6231 break;
6232 eptr += len;
6233 }
6234 while (eptr >= pp)
6235 {
6236 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6237 MATCH_NOMATCH) return rrc;
6238 eptr -= len;
6239 }
6240 return MATCH_NOMATCH;
6241 }
6242 /* Control never gets here */
6243 }
6244
6245 /* If the length of a UTF-8 character is 1, we fall through here, and
6246 obey the code as for non-UTF-8 characters below, though in this case the
6247 value of c will always be < 128. */
6248 }
6249 else
6250 #endif
6251
6252 /* When not in UTF-8 mode, load a single-byte character. */
6253 {
6254 if (min > md->end_subject - eptr) return MATCH_NOMATCH;
6255 c = *ecode++;
6256 }
6257
6258 /* The value of c at this point is always less than 256, though we may or
6259 may not be in UTF-8 mode. The code is duplicated for the caseless and
6260 caseful cases, for speed, since matching characters is likely to be quite
6261 common. First, ensure the minimum number of matches are present. If min =
6262 max, continue at the same level without recursing. Otherwise, if
6263 minimizing, keep trying the rest of the expression and advancing one
6264 matching character if failing, up to the maximum. Alternatively, if
6265 maximizing, find the maximum number of characters and work backwards. */
6266
6267 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
6268 max, eptr));
6269
6270 if ((ims & PCRE_CASELESS) != 0)
6271 {
6272 c = md->lcc[c];
6273 for (i = 1; i <= min; i++)
6274 if (c != md->lcc[*eptr++]) return MATCH_NOMATCH;
6275 if (min == max) continue;
6276 if (minimize)
6277 {
6278 for (i = min;; i++)
6279 {
6280 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6281 MATCH_NOMATCH) return rrc;
6282 if (i >= max || eptr >= md->end_subject ||
6283 c != md->lcc[*eptr++])
6284 return MATCH_NOMATCH;
6285 }
6286 /* Control never gets here */
6287 }
6288 else
6289 {
6290 const uschar *pp = eptr;
6291 for (i = min; i < max; i++)
6292 {
6293 if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
6294 eptr++;
6295 }
6296 while (eptr >= pp)
6297 if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
6298 MATCH_NOMATCH) return rrc;
6299 return MATCH_NOMATCH;
6300 }
6301 /* Control never gets here */
6302 }
6303
6304 /* Caseful comparisons (includes all multi-byte characters) */
6305
6306 else
6307 {
6308 for (i = 1; i <= min; i++) if (c != *eptr++) return MATCH_NOMATCH;
6309 if (min == max) continue;
6310 if (minimize)
6311 {
6312 for (i = min;; i++)
6313 {
6314 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6315 MATCH_NOMATCH) return rrc;
6316 if (i >= max || eptr >= md->end_subject || c != *eptr++)
6317 return MATCH_NOMATCH;
6318 }
6319 /* Control never gets here */
6320 }
6321 else
6322 {
6323 const uschar *pp = eptr;
6324 for (i = min; i < max; i++)
6325 {
6326 if (eptr >= md->end_subject || c != *eptr) break;
6327 eptr++;
6328 }
6329 while (eptr >= pp)
6330 if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
6331 MATCH_NOMATCH) return rrc;
6332 return MATCH_NOMATCH;
6333 }
6334 }
6335 /* Control never gets here */
6336
6337 /* Match a negated single one-byte character. The character we are
6338 checking can be multibyte. */
6339
6340 case OP_NOT:
6341 if (eptr >= md->end_subject) return MATCH_NOMATCH;
6342 ecode++;
6343 GETCHARINCTEST(c, eptr);
6344 if ((ims & PCRE_CASELESS) != 0)
6345 {
6346 #ifdef SUPPORT_UTF8
6347 if (c < 256)
6348 #endif
6349 c = md->lcc[c];
6350 if (md->lcc[*ecode++] == c) return MATCH_NOMATCH;
6351 }
6352 else
6353 {
6354 if (*ecode++ == c) return MATCH_NOMATCH;
6355 }
6356 break;
6357
6358 /* Match a negated single one-byte character repeatedly. This is almost a
6359 repeat of the code for a repeated single character, but I haven't found a
6360 nice way of commoning these up that doesn't require a test of the
6361 positive/negative option for each character match. Maybe that wouldn't add
6362 very much to the time taken, but character matching *is* what this is all
6363 about... */
6364
6365 case OP_NOTEXACT:
6366 min = max = GET2(ecode, 1);
6367 ecode += 3;
6368 goto REPEATNOTCHAR;
6369
6370 case OP_NOTUPTO:
6371 case OP_NOTMINUPTO:
6372 min = 0;
6373 max = GET2(ecode, 1);
6374 minimize = *ecode == OP_NOTMINUPTO;
6375 ecode += 3;
6376 goto REPEATNOTCHAR;
6377
6378 case OP_NOTSTAR:
6379 case OP_NOTMINSTAR:
6380 case OP_NOTPLUS:
6381 case OP_NOTMINPLUS:
6382 case OP_NOTQUERY:
6383 case OP_NOTMINQUERY:
6384 c = *ecode++ - OP_NOTSTAR;
6385 minimize = (c & 1) != 0;
6386 min = rep_min[c]; /* Pick up values from tables; */
6387 max = rep_max[c]; /* zero for max => infinity */
6388 if (max == 0) max = INT_MAX;
6389
6390 /* Common code for all repeated negated single-character (less than 255)
6391 matches. We can give up quickly if there are fewer than the minimum number
6392 of bytes left in the subject. */
6393
6394 REPEATNOTCHAR:
6395 if (min > md->end_subject - eptr) return MATCH_NOMATCH;
6396 c = *ecode++;
6397
6398 /* The code is duplicated for the caseless and caseful cases, for speed,
6399 since matching characters is likely to be quite common. First, ensure the
6400 minimum number of matches are present. If min = max, continue at the same
6401 level without recursing. Otherwise, if minimizing, keep trying the rest of
6402 the expression and advancing one matching character if failing, up to the
6403 maximum. Alternatively, if maximizing, find the maximum number of
6404 characters and work backwards. */
6405
6406 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
6407 max, eptr));
6408
6409 if ((ims & PCRE_CASELESS) != 0)
6410 {
6411 c = md->lcc[c];
6412
6413 #ifdef SUPPORT_UTF8
6414 /* UTF-8 mode */
6415 if (md->utf8)
6416 {
6417 register int d;
6418 for (i = 1; i <= min; i++)
6419 {
6420 GETCHARINC(d, eptr);
6421 if (d < 256) d = md->lcc[d];
6422 if (c == d) return MATCH_NOMATCH;
6423 }
6424 }
6425 else
6426 #endif
6427
6428 /* Not UTF-8 mode */
6429 {
6430 for (i = 1; i <= min; i++)
6431 if (c == md->lcc[*eptr++]) return MATCH_NOMATCH;
6432 }
6433
6434 if (min == max) continue;
6435
6436 if (minimize)
6437 {
6438 #ifdef SUPPORT_UTF8
6439 /* UTF-8 mode */
6440 if (md->utf8)
6441 {
6442 register int d;
6443 for (i = min;; i++)
6444 {
6445 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6446 MATCH_NOMATCH) return rrc;
6447 GETCHARINC(d, eptr);
6448 if (d < 256) d = md->lcc[d];
6449 if (i >= max || eptr >= md->end_subject || c == d)
6450 return MATCH_NOMATCH;
6451 }
6452 }
6453 else
6454 #endif
6455 /* Not UTF-8 mode */
6456 {
6457 for (i = min;; i++)
6458 {
6459 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6460 MATCH_NOMATCH) return rrc;
6461 if (i >= max || eptr >= md->end_subject || c == md->lcc[*eptr++])
6462 return MATCH_NOMATCH;
6463 }
6464 }
6465 /* Control never gets here */
6466 }
6467
6468 /* Maximize case */
6469
6470 else
6471 {
6472 const uschar *pp = eptr;
6473
6474 #ifdef SUPPORT_UTF8
6475 /* UTF-8 mode */
6476 if (md->utf8)
6477 {
6478 register int d;
6479 for (i = min; i < max; i++)
6480 {
6481 int len = 1;
6482 if (eptr >= md->end_subject) break;
6483 GETCHARLEN(d, eptr, len);
6484 if (d < 256) d = md->lcc[d];
6485 if (c == d) break;
6486 eptr += len;
6487 }
6488 while (eptr >= pp)
6489 {
6490 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6491 MATCH_NOMATCH) return rrc;
6492 eptr--;
6493 BACKCHAR(eptr);
6494 }
6495 }
6496 else
6497 #endif
6498 /* Not UTF-8 mode */
6499 {
6500 for (i = min; i < max; i++)
6501 {
6502 if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
6503 eptr++;
6504 }
6505 while (eptr >= pp)
6506 {
6507 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6508 MATCH_NOMATCH) return rrc;
6509 eptr--;
6510 }
6511 }
6512
6513 return MATCH_NOMATCH;
6514 }
6515 /* Control never gets here */
6516 }
6517
6518 /* Caseful comparisons */
6519
6520 else
6521 {
6522 #ifdef SUPPORT_UTF8
6523 /* UTF-8 mode */
6524 if (md->utf8)
6525 {
6526 register int d;
6527 for (i = 1; i <= min; i++)
6528 {
6529 GETCHARINC(d, eptr);
6530 if (c == d) return MATCH_NOMATCH;
6531 }
6532 }
6533 else
6534 #endif
6535 /* Not UTF-8 mode */
6536 {
6537 for (i = 1; i <= min; i++)
6538 if (c == *eptr++) return MATCH_NOMATCH;
6539 }
6540
6541 if (min == max) continue;
6542
6543 if (minimize)
6544 {
6545 #ifdef SUPPORT_UTF8
6546 /* UTF-8 mode */
6547 if (md->utf8)
6548 {
6549 register int d;
6550 for (i = min;; i++)
6551 {
6552 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6553 MATCH_NOMATCH) return rrc;
6554 GETCHARINC(d, eptr);
6555 if (i >= max || eptr >= md->end_subject || c == d)
6556 return MATCH_NOMATCH;
6557 }
6558 }
6559 else
6560 #endif
6561 /* Not UTF-8 mode */
6562 {
6563 for (i = min;; i++)
6564 {
6565 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6566 MATCH_NOMATCH) return rrc;
6567 if (i >= max || eptr >= md->end_subject || c == *eptr++)
6568 return MATCH_NOMATCH;
6569 }
6570 }
6571 /* Control never gets here */
6572 }
6573
6574 /* Maximize case */
6575
6576 else
6577 {
6578 const uschar *pp = eptr;
6579
6580 #ifdef SUPPORT_UTF8
6581 /* UTF-8 mode */
6582 if (md->utf8)
6583 {
6584 register int d;
6585 for (i = min; i < max; i++)
6586 {
6587 int len = 1;
6588 if (eptr >= md->end_subject) break;
6589 GETCHARLEN(d, eptr, len);
6590 if (c == d) break;
6591 eptr += len;
6592 }
6593 while (eptr >= pp)
6594 {
6595 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6596 MATCH_NOMATCH) return rrc;
6597 eptr--;
6598 BACKCHAR(eptr);
6599 }
6600 }
6601 else
6602 #endif
6603 /* Not UTF-8 mode */
6604 {
6605 for (i = min; i < max; i++)
6606 {
6607 if (eptr >= md->end_subject || c == *eptr) break;
6608 eptr++;
6609 }
6610 while (eptr >= pp)
6611 {
6612 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6613 MATCH_NOMATCH) return rrc;
6614 eptr--;
6615 }
6616 }
6617
6618 return MATCH_NOMATCH;
6619 }
6620 }
6621 /* Control never gets here */
6622
6623 /* Match a single character type repeatedly; several different opcodes
6624 share code. This is very similar to the code for single characters, but we
6625 repeat it in the interests of efficiency. */
6626
6627 case OP_TYPEEXACT:
6628 min = max = GET2(ecode, 1);
6629 minimize = TRUE;
6630 ecode += 3;
6631 goto REPEATTYPE;
6632
6633 case OP_TYPEUPTO:
6634 case OP_TYPEMINUPTO:
6635 min = 0;
6636 max = GET2(ecode, 1);
6637 minimize = *ecode == OP_TYPEMINUPTO;
6638 ecode += 3;
6639 goto REPEATTYPE;
6640
6641 case OP_TYPESTAR:
6642 case OP_TYPEMINSTAR:
6643 case OP_TYPEPLUS:
6644 case OP_TYPEMINPLUS:
6645 case OP_TYPEQUERY:
6646 case OP_TYPEMINQUERY:
6647 c = *ecode++ - OP_TYPESTAR;
6648 minimize = (c & 1) != 0;
6649 min = rep_min[c]; /* Pick up values from tables; */
6650 max = rep_max[c]; /* zero for max => infinity */
6651 if (max == 0) max = INT_MAX;
6652
6653 /* Common code for all repeated single character type matches. Note that
6654 in UTF-8 mode, '.' and the negated types can match a multibyte character,
6655 but for the positive types (e.g. OP_DIGIT) valid characters are one byte. */
6656
6657 REPEATTYPE:
6658 ctype = *ecode++; /* Code for the character type */
6659
6660 /* First, ensure the minimum number of matches are present. Use inline
6661 code for maximizing the speed, and do the type test once at the start
6662 (i.e. keep it out of the loop). Also we can test that there are at least
6663 the minimum number of bytes before we start. This isn't as effective in
6664 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
6665 is tidier. */
6666
6667 if (min > md->end_subject - eptr) return MATCH_NOMATCH;
6668 if (min > 0)
6669 {
6670 #ifdef SUPPORT_UTF8
6671 if (md->utf8) switch(ctype)
6672 {
6673 case OP_ANY:
6674 for (i = 1; i <= min; i++)
6675 {
6676 if (eptr >= md->end_subject ||
6677 (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
6678 return MATCH_NOMATCH;
6679 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
6680 }
6681 break;
6682
6683 case OP_ANYBYTE:
6684 eptr += min;
6685 break;
6686
6687 case OP_NOT_DIGIT:
6688 for (i = 1; i <= min; i++)
6689 {
6690 if (eptr >= md->end_subject) return MATCH_NOMATCH;
6691 GETCHARINC(c, eptr);
6692 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
6693 return MATCH_NOMATCH;
6694 }
6695 break;
6696
6697 case OP_DIGIT:
6698 for (i = 1; i <= min; i++)
6699 {
6700 if (eptr >= md->end_subject ||
6701 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
6702 return MATCH_NOMATCH;
6703 /* No need to skip more bytes - we know it's a 1-byte character */
6704 }
6705 break;
6706
6707 case OP_NOT_WHITESPACE:
6708 for (i = 1; i <= min; i++)
6709 {
6710 if (eptr >= md->end_subject ||
6711 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
6712 return MATCH_NOMATCH;
6713 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
6714 }
6715 break;
6716
6717 case OP_WHITESPACE:
6718 for (i = 1; i <= min; i++)
6719 {
6720 if (eptr >= md->end_subject ||
6721 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
6722 return MATCH_NOMATCH;
6723 /* No need to skip more bytes - we know it's a 1-byte character */
6724 }
6725 break;
6726
6727 case OP_NOT_WORDCHAR:
6728 for (i = 1; i <= min; i++)
6729 {
6730 if (eptr >= md->end_subject ||
6731 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
6732 return MATCH_NOMATCH;
6733 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
6734 }
6735 break;
6736
6737 case OP_WORDCHAR:
6738 for (i = 1; i <= min; i++)