/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 514 - (show annotations)
Mon May 3 12:54:22 2010 UTC (5 years, 3 months ago) by ph10
File MIME type: text/plain
File size: 225209 byte(s)
Error occurred while calculating annotation data.
Add support for \N.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2010 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57 also used by pcretest. PCRE_DEBUG is not defined when building a production
58 library. */
59
60 #ifdef PCRE_DEBUG
61 #include "pcre_printint.src"
62 #endif
63
64
/* Macro for setting individual bits in class bitmaps: it sets bit number b in
the byte array a. The arguments and the whole expansion are parenthesized so
that expression arguments such as SETBIT(map, c + 1) select the right byte and
the right bit. Note that b is evaluated twice, so it must not have side
effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
68
69 /* Maximum length value to check against when making sure that the integer that
70 holds the compiled pattern length does not overflow. We make it a bit less than
71 INT_MAX to allow for adding in group terminating bytes, so that we don't have
72 to check them every time. */
73
74 #define OFLOW_MAX (INT_MAX - 20)
75
76
77 /*************************************************
78 * Code parameters and static tables *
79 *************************************************/
80
81 /* This value specifies the size of stack workspace that is used during the
82 first pre-compile phase that determines how much memory is required. The regex
83 is partly compiled into this space, but the compiled parts are discarded as
84 soon as they can be, so that hopefully there will never be an overrun. The code
85 does, however, check for an overrun. The largest amount I've seen used is 218,
86 so this number is very generous.
87
88 The same workspace is used during the second, actual compile phase for
89 remembering forward references to groups so that they can be filled in at the
90 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
91 is 4 there is plenty of room. */
92
93 #define COMPILE_WORK_SIZE (4096)
94
95 /* The overrun tests check for a slightly smaller size so that they detect the
96 overrun before it actually does run off the end of the data block. */
97
98 #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
99
100
101 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
102 are simple data values; negative values are for special things like \d and so
103 on. Zero means further processing is needed (for things like \x), or the escape
104 is invalid. */
105
106 #ifndef EBCDIC
107
108 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
109 in UTF-8 mode. */
110
111 static const short int escapes[] = {
112 0, 0,
113 0, 0,
114 0, 0,
115 0, 0,
116 0, 0,
117 CHAR_COLON, CHAR_SEMICOLON,
118 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
119 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
120 CHAR_COMMERCIAL_AT, -ESC_A,
121 -ESC_B, -ESC_C,
122 -ESC_D, -ESC_E,
123 0, -ESC_G,
124 -ESC_H, 0,
125 0, -ESC_K,
126 0, 0,
127 -ESC_N, 0,
128 -ESC_P, -ESC_Q,
129 -ESC_R, -ESC_S,
130 0, 0,
131 -ESC_V, -ESC_W,
132 -ESC_X, 0,
133 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
134 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
135 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
136 CHAR_GRAVE_ACCENT, 7,
137 -ESC_b, 0,
138 -ESC_d, ESC_e,
139 ESC_f, 0,
140 -ESC_h, 0,
141 0, -ESC_k,
142 0, 0,
143 ESC_n, 0,
144 -ESC_p, 0,
145 ESC_r, -ESC_s,
146 ESC_tee, 0,
147 -ESC_v, -ESC_w,
148 0, 0,
149 -ESC_z
150 };
151
152 #else
153
154 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
155
156 static const short int escapes[] = {
157 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
158 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
159 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
160 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
161 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
162 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
163 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
164 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
165 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
166 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
167 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
168 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
169 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
170 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
171 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
172 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
173 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
174 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
175 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
176 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
177 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
178 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
179 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
180 };
181 #endif
182
183
184 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
185 searched linearly. Put all the names into a single string, in order to reduce
186 the number of relocations when a shared library is dynamically linked. The
187 string is built from string macros so that it works in UTF-8 mode on EBCDIC
188 platforms. */
189
190 typedef struct verbitem {
191 int len; /* Length of verb name */
192 int op; /* Op when no arg, or -1 if arg mandatory */
193 int op_arg; /* Op when arg present, or -1 if not allowed */
194 } verbitem;
195
196 static const char verbnames[] =
197 "\0" /* Empty name is a shorthand for MARK */
198 STRING_MARK0
199 STRING_ACCEPT0
200 STRING_COMMIT0
201 STRING_F0
202 STRING_FAIL0
203 STRING_PRUNE0
204 STRING_SKIP0
205 STRING_THEN;
206
207 static const verbitem verbs[] = {
208 { 0, -1, OP_MARK },
209 { 4, -1, OP_MARK },
210 { 6, OP_ACCEPT, -1 },
211 { 6, OP_COMMIT, -1 },
212 { 1, OP_FAIL, -1 },
213 { 4, OP_FAIL, -1 },
214 { 5, OP_PRUNE, OP_PRUNE_ARG },
215 { 4, OP_SKIP, OP_SKIP_ARG },
216 { 4, OP_THEN, OP_THEN_ARG }
217 };
218
219 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
220
221
222 /* Tables of names of POSIX character classes and their lengths. The names are
223 now all in a single string, to reduce the number of relocations when a shared
224 library is dynamically loaded. The list of lengths is terminated by a zero
225 length entry. The first three must be alpha, lower, upper, as this is assumed
226 for handling case independence. */
227
228 static const char posix_names[] =
229 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
230 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
231 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
232 STRING_word0 STRING_xdigit;
233
234 static const uschar posix_name_lengths[] = {
235 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
236
237 /* Table of class bit maps for each POSIX class. Each class is formed from a
238 base map, with an optional addition or removal of another map. Then, for some
239 classes, there is some additional tweaking: for [:blank:] the vertical space
240 characters are removed, and for [:alpha:] and [:alnum:] the underscore
241 character is removed. The triples in the table consist of the base map offset,
242 second map offset or -1 if no second map, and a non-negative value for map
243 addition or a negative value for map subtraction (if there are two maps). The
244 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
245 remove vertical space characters, 2 => remove underscore. */
246
247 static const int posix_class_maps[] = {
248 cbit_word, cbit_digit, -2, /* alpha */
249 cbit_lower, -1, 0, /* lower */
250 cbit_upper, -1, 0, /* upper */
251 cbit_word, -1, 2, /* alnum - word without underscore */
252 cbit_print, cbit_cntrl, 0, /* ascii */
253 cbit_space, -1, 1, /* blank - a GNU extension */
254 cbit_cntrl, -1, 0, /* cntrl */
255 cbit_digit, -1, 0, /* digit */
256 cbit_graph, -1, 0, /* graph */
257 cbit_print, -1, 0, /* print */
258 cbit_punct, -1, 0, /* punct */
259 cbit_space, -1, 0, /* space */
260 cbit_word, -1, 0, /* word - a Perl extension */
261 cbit_xdigit,-1, 0 /* xdigit */
262 };
263
264
/* STRING() turns its argument into a string literal exactly as written;
XSTRING() macro-expands its argument first and then turns the result into a
string literal. */

#define STRING(a)   #a
#define XSTRING(s)  STRING(s)
267
268 /* The texts of compile-time error messages. These are "char *" because they
269 are passed to the outside world. Do not ever re-use any error number, because
270 they are documented. Always add a new error instead. Messages marked DEAD below
271 are no longer used. This used to be a table of strings, but in order to reduce
272 the number of relocations needed when a shared library is loaded dynamically,
273 it is now one long string. We cannot use a table of offsets, because the
274 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
275 simply count through to the one we want - this isn't a performance issue
276 because these strings are used only when there is a compilation error.
277
278 Each substring ends with \0 to insert a null character. This includes the final
279 substring, so that the whole string ends with \0\0, which can be detected when
280 counting through. */
281
282 static const char error_texts[] =
283 "no error\0"
284 "\\ at end of pattern\0"
285 "\\c at end of pattern\0"
286 "unrecognized character follows \\\0"
287 "numbers out of order in {} quantifier\0"
288 /* 5 */
289 "number too big in {} quantifier\0"
290 "missing terminating ] for character class\0"
291 "invalid escape sequence in character class\0"
292 "range out of order in character class\0"
293 "nothing to repeat\0"
294 /* 10 */
295 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
296 "internal error: unexpected repeat\0"
297 "unrecognized character after (? or (?-\0"
298 "POSIX named classes are supported only within a class\0"
299 "missing )\0"
300 /* 15 */
301 "reference to non-existent subpattern\0"
302 "erroffset passed as NULL\0"
303 "unknown option bit(s) set\0"
304 "missing ) after comment\0"
305 "parentheses nested too deeply\0" /** DEAD **/
306 /* 20 */
307 "regular expression is too large\0"
308 "failed to get memory\0"
309 "unmatched parentheses\0"
310 "internal error: code overflow\0"
311 "unrecognized character after (?<\0"
312 /* 25 */
313 "lookbehind assertion is not fixed length\0"
314 "malformed number or name after (?(\0"
315 "conditional group contains more than two branches\0"
316 "assertion expected after (?(\0"
317 "(?R or (?[+-]digits must be followed by )\0"
318 /* 30 */
319 "unknown POSIX class name\0"
320 "POSIX collating elements are not supported\0"
321 "this version of PCRE is not compiled with PCRE_UTF8 support\0"
322 "spare error\0" /** DEAD **/
323 "character value in \\x{...} sequence is too large\0"
324 /* 35 */
325 "invalid condition (?(0)\0"
326 "\\C not allowed in lookbehind assertion\0"
327 "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
328 "number after (?C is > 255\0"
329 "closing ) for (?C expected\0"
330 /* 40 */
331 "recursive call could loop indefinitely\0"
332 "unrecognized character after (?P\0"
333 "syntax error in subpattern name (missing terminator)\0"
334 "two named subpatterns have the same name\0"
335 "invalid UTF-8 string\0"
336 /* 45 */
337 "support for \\P, \\p, and \\X has not been compiled\0"
338 "malformed \\P or \\p sequence\0"
339 "unknown property name after \\P or \\p\0"
340 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
341 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
342 /* 50 */
343 "repeated subpattern is too long\0" /** DEAD **/
344 "octal value is greater than \\377 (not in UTF-8 mode)\0"
345 "internal error: overran compiling workspace\0"
346 "internal error: previously-checked referenced subpattern not found\0"
347 "DEFINE group contains more than one branch\0"
348 /* 55 */
349 "repeating a DEFINE group is not allowed\0"
350 "inconsistent NEWLINE options\0"
351 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
352 "a numbered reference must not be zero\0"
353 "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
354 /* 60 */
355 "(*VERB) not recognized\0"
356 "number is too big\0"
357 "subpattern name expected\0"
358 "digit expected after (?+\0"
359 "] is an invalid data character in JavaScript compatibility mode\0"
360 /* 65 */
361 "different names for subpatterns of the same number are not allowed\0"
362 "(*MARK) must have an argument\0"
363 ;
364
365 /* Table to identify digits and hex digits. This is used when compiling
366 patterns. Note that the tables in chartables are dependent on the locale, and
367 may mark arbitrary characters as digits - but the PCRE compiling code expects
368 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
369 a private table here. It costs 256 bytes, but it is a lot faster than doing
370 character value tests (at least in some simple cases I timed), and in some
371 applications one wants PCRE to compile efficiently as well as match
372 efficiently.
373
374 For convenience, we use the same bit definitions as in chartables:
375
376 0x04 decimal digit
377 0x08 hexadecimal digit
378
379 Then we can use ctype_digit and ctype_xdigit in the code. */
380
381 #ifndef EBCDIC
382
/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
UTF-8 mode. There is one entry per character value, sixteen to a row; the
comment at the end of each row gives the value of its first entry. An entry of
0x0c marks a decimal digit (which is also a hexadecimal digit), and 0x08 marks
a character that is a hexadecimal digit only. */

static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x00 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x20 */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x30: 0 - 9 */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x40: A - F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x50 */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x60: a - f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xA0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xB0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xC0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xD0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xE0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00  /* 0xF0 */
  };
420
421 #else
422
/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode.
There is one entry per character value, sixteen to a row; the comment at the
end of each row gives the value of its first entry. An entry of 0x0c marks a
decimal digit (which is also a hexadecimal digit), and 0x08 marks a character
that is a hexadecimal digit only. */

static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x00 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x70 */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x80: a - f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xA0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xB0 */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xC0: A - F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xD0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xE0 */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00  /* 0xF0: 0 - 9 */
  };
459
/* A partial duplicate of the character type table, for EBCDIC systems not
running in UTF-8 mode. There is one entry per character value, sixteen to a
row; the comment at the end of each row gives the value of its first entry. */

static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 0x00 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x10 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x30 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 0x40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 0x50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 0x60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x70 */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x80 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x90 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xA0 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 0xB0 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xC0 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xD0 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xE0 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00  /* 0xF0 */
  };
493 #endif
494
495
496 /* Definition to allow mutual recursion */
497
498 static BOOL
499 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
500 int *, int *, branch_chain *, compile_data *, int *);
501
502
503
504 /*************************************************
505 * Find an error text *
506 *************************************************/
507
508 /* The error texts are now all in one long string, to save on relocations. As
509 some of the text is of unknown length, we can't use a table of offsets.
510 Instead, just count through the strings. This is not a performance issue
511 because it happens only when there has been a compilation error.
512
513 Argument: the error number
514 Returns: pointer to the error string
515 */
516
517 static const char *
518 find_error_text(int n)
519 {
520 const char *s = error_texts;
521 for (; n > 0; n--)
522 {
523 while (*s++ != 0) {};
524 if (*s == 0) return "Error text not found (please report)";
525 }
526 return s;
527 }
528
529
530 /*************************************************
531 * Handle escapes *
532 *************************************************/
533
534 /* This function is called when a \ has been encountered. It either returns a
535 positive value for a simple escape such as \n, or a negative value which
536 encodes one of the more complicated things such as \d. A backreference to group
537 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
538 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
539 ptr is pointing at the \. On exit, it is on the final character of the escape
540 sequence.
541
542 Arguments:
543 ptrptr points to the pattern position pointer
544 errorcodeptr points to the errorcode variable
545 bracount number of previous extracting brackets
546 options the options bits
547 isclass TRUE if inside a character class
548
549 Returns: zero or positive => a data character
550 negative => a special escape sequence
551 on error, errorcodeptr is set
552 */
553
554 static int
555 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
556 int options, BOOL isclass)
557 {
558 BOOL utf8 = (options & PCRE_UTF8) != 0;
559 const uschar *ptr = *ptrptr + 1;
560 int c, i;
561
562 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
563 ptr--; /* Set pointer back to the last byte */
564
565 /* If backslash is at the end of the pattern, it's an error. */
566
567 if (c == 0) *errorcodeptr = ERR1;
568
569 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
570 in a table. A non-zero result is something that can be returned immediately.
571 Otherwise further processing may be required. */
572
573 #ifndef EBCDIC /* ASCII/UTF-8 coding */
574 else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
575 else if ((i = escapes[c - CHAR_0]) != 0) c = i;
576
577 #else /* EBCDIC coding */
578 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
579 else if ((i = escapes[c - 0x48]) != 0) c = i;
580 #endif
581
582 /* Escapes that need further processing, or are illegal. */
583
584 else
585 {
586 const uschar *oldptr;
587 BOOL braced, negated;
588
589 switch (c)
590 {
591 /* A number of Perl escapes are not handled by PCRE. We give an explicit
592 error. */
593
594 case CHAR_l:
595 case CHAR_L:
596 case CHAR_u:
597 case CHAR_U:
598 *errorcodeptr = ERR37;
599 break;
600
601 /* \g must be followed by one of a number of specific things:
602
603 (1) A number, either plain or braced. If positive, it is an absolute
604 backreference. If negative, it is a relative backreference. This is a Perl
605 5.10 feature.
606
607 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
608 is part of Perl's movement towards a unified syntax for back references. As
609 this is synonymous with \k{name}, we fudge it up by pretending it really
610 was \k.
611
612 (3) For Oniguruma compatibility we also support \g followed by a name or a
613 number either in angle brackets or in single quotes. However, these are
614 (possibly recursive) subroutine calls, _not_ backreferences. Just return
615 the -ESC_g code (cf \k). */
616
617 case CHAR_g:
618 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
619 {
620 c = -ESC_g;
621 break;
622 }
623
624 /* Handle the Perl-compatible cases */
625
626 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
627 {
628 const uschar *p;
629 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
630 if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
631 if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
632 {
633 c = -ESC_k;
634 break;
635 }
636 braced = TRUE;
637 ptr++;
638 }
639 else braced = FALSE;
640
641 if (ptr[1] == CHAR_MINUS)
642 {
643 negated = TRUE;
644 ptr++;
645 }
646 else negated = FALSE;
647
648 c = 0;
649 while ((digitab[ptr[1]] & ctype_digit) != 0)
650 c = c * 10 + *(++ptr) - CHAR_0;
651
652 if (c < 0) /* Integer overflow */
653 {
654 *errorcodeptr = ERR61;
655 break;
656 }
657
658 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
659 {
660 *errorcodeptr = ERR57;
661 break;
662 }
663
664 if (c == 0)
665 {
666 *errorcodeptr = ERR58;
667 break;
668 }
669
670 if (negated)
671 {
672 if (c > bracount)
673 {
674 *errorcodeptr = ERR15;
675 break;
676 }
677 c = bracount - (c - 1);
678 }
679
680 c = -(ESC_REF + c);
681 break;
682
683 /* The handling of escape sequences consisting of a string of digits
684 starting with one that is not zero is not straightforward. By experiment,
685 the way Perl works seems to be as follows:
686
687 Outside a character class, the digits are read as a decimal number. If the
688 number is less than 10, or if there are that many previous extracting
689 left brackets, then it is a back reference. Otherwise, up to three octal
690 digits are read to form an escaped byte. Thus \123 is likely to be octal
691 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
692 value is greater than 377, the least significant 8 bits are taken. Inside a
693 character class, \ followed by a digit is always an octal number. */
694
695 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
696 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
697
698 if (!isclass)
699 {
700 oldptr = ptr;
701 c -= CHAR_0;
702 while ((digitab[ptr[1]] & ctype_digit) != 0)
703 c = c * 10 + *(++ptr) - CHAR_0;
704 if (c < 0) /* Integer overflow */
705 {
706 *errorcodeptr = ERR61;
707 break;
708 }
709 if (c < 10 || c <= bracount)
710 {
711 c = -(ESC_REF + c);
712 break;
713 }
714 ptr = oldptr; /* Put the pointer back and fall through */
715 }
716
717 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
718 generates a binary zero byte and treats the digit as a following literal.
719 Thus we have to pull back the pointer by one. */
720
721 if ((c = *ptr) >= CHAR_8)
722 {
723 ptr--;
724 c = 0;
725 break;
726 }
727
728 /* \0 always starts an octal number, but we may drop through to here with a
729 larger first octal digit. The original code used just to take the least
730 significant 8 bits of octal numbers (I think this is what early Perls used
731 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
732 than 3 octal digits. */
733
734 case CHAR_0:
735 c -= CHAR_0;
736 while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
737 c = c * 8 + *(++ptr) - CHAR_0;
738 if (!utf8 && c > 255) *errorcodeptr = ERR51;
739 break;
740
741 /* \x is complicated. \x{ddd} is a character number which can be greater
742 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
743 treated as a data character. */
744
745 case CHAR_x:
746 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
747 {
748 const uschar *pt = ptr + 2;
749 int count = 0;
750
751 c = 0;
752 while ((digitab[*pt] & ctype_xdigit) != 0)
753 {
754 register int cc = *pt++;
755 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
756 count++;
757
758 #ifndef EBCDIC /* ASCII/UTF-8 coding */
759 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
760 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
761 #else /* EBCDIC coding */
762 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
763 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
764 #endif
765 }
766
767 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
768 {
769 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
770 ptr = pt;
771 break;
772 }
773
774 /* If the sequence of hex digits does not end with '}', then we don't
775 recognize this construct; fall through to the normal \x handling. */
776 }
777
778 /* Read just a single-byte hex-defined char */
779
780 c = 0;
781 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
782 {
783 int cc; /* Some compilers don't like */
784 cc = *(++ptr); /* ++ in initializers */
785 #ifndef EBCDIC /* ASCII/UTF-8 coding */
786 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
787 c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
788 #else /* EBCDIC coding */
789 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
790 c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
791 #endif
792 }
793 break;
794
795 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
796 This coding is ASCII-specific, but then the whole concept of \cx is
797 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
798
799 case CHAR_c:
800 c = *(++ptr);
801 if (c == 0)
802 {
803 *errorcodeptr = ERR2;
804 break;
805 }
806
807 #ifndef EBCDIC /* ASCII/UTF-8 coding */
808 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
809 c ^= 0x40;
810 #else /* EBCDIC coding */
811 if (c >= CHAR_a && c <= CHAR_z) c += 64;
812 c ^= 0xC0;
813 #endif
814 break;
815
816 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
817 other alphanumeric following \ is an error if PCRE_EXTRA was set;
818 otherwise, for Perl compatibility, it is a literal. This code looks a bit
819 odd, but there used to be some cases other than the default, and there may
820 be again in future, so I haven't "optimized" it. */
821
822 default:
823 if ((options & PCRE_EXTRA) != 0) switch(c)
824 {
825 default:
826 *errorcodeptr = ERR3;
827 break;
828 }
829 break;
830 }
831 }
832
833 /* Perl supports \N{name} for character names, as well as plain \N for "not
834 newline". PCRE does not support \N{name}. */
835
836 if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
837 *errorcodeptr = ERR37;
838
839 *ptrptr = ptr;
840 return c;
841 }
842
843
844
845 #ifdef SUPPORT_UCP
846 /*************************************************
847 * Handle \P and \p *
848 *************************************************/
849
850 /* This function is called after \P or \p has been encountered, provided that
851 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
852 pointing at the P or p. On exit, it is pointing at the final character of the
853 escape sequence.
854
855 Arguments:
856 ptrptr points to the pattern position pointer
857 negptr points to a boolean that is set TRUE for negation else FALSE
858 dptr points to an int that is set to the detailed property value
859 errorcodeptr points to the error code variable
860
861 Returns: type value from ucp_type_table, or -1 for an invalid type
862 */
863
864 static int
865 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
866 {
867 int c, i, bot, top;
868 const uschar *ptr = *ptrptr;
869 char name[32];
870
871 c = *(++ptr);
872 if (c == 0) goto ERROR_RETURN;
873
874 *negptr = FALSE;
875
876 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
877 negation. */
878
879 if (c == CHAR_LEFT_CURLY_BRACKET)
880 {
881 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
882 {
883 *negptr = TRUE;
884 ptr++;
885 }
886 for (i = 0; i < (int)sizeof(name) - 1; i++)
887 {
888 c = *(++ptr);
889 if (c == 0) goto ERROR_RETURN;
890 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
891 name[i] = c;
892 }
893 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
894 name[i] = 0;
895 }
896
897 /* Otherwise there is just one following character */
898
899 else
900 {
901 name[0] = c;
902 name[1] = 0;
903 }
904
905 *ptrptr = ptr;
906
907 /* Search for a recognized property name using binary chop */
908
909 bot = 0;
910 top = _pcre_utt_size;
911
912 while (bot < top)
913 {
914 i = (bot + top) >> 1;
915 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
916 if (c == 0)
917 {
918 *dptr = _pcre_utt[i].value;
919 return _pcre_utt[i].type;
920 }
921 if (c > 0) bot = i + 1; else top = i;
922 }
923
924 *errorcodeptr = ERR47;
925 *ptrptr = ptr;
926 return -1;
927
928 ERROR_RETURN:
929 *errorcodeptr = ERR46;
930 *ptrptr = ptr;
931 return -1;
932 }
933 #endif
934
935
936
937
938 /*************************************************
939 * Check for counted repeat *
940 *************************************************/
941
942 /* This function is called when a '{' is encountered in a place where it might
943 start a quantifier. It looks ahead to see if it really is a quantifier or not.
944 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
945 where the ddds are digits.
946
947 Arguments:
948 p pointer to the first char after '{'
949
950 Returns: TRUE or FALSE
951 */
952
953 static BOOL
954 is_counted_repeat(const uschar *p)
955 {
956 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
957 while ((digitab[*p] & ctype_digit) != 0) p++;
958 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
959
960 if (*p++ != CHAR_COMMA) return FALSE;
961 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
962
963 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
964 while ((digitab[*p] & ctype_digit) != 0) p++;
965
966 return (*p == CHAR_RIGHT_CURLY_BRACKET);
967 }
968
969
970
971 /*************************************************
972 * Read repeat counts *
973 *************************************************/
974
975 /* Read an item of the form {n,m} and return the values. This is called only
976 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
977 so the syntax is guaranteed to be correct, but we need to check the values.
978
979 Arguments:
980 p pointer to first char after '{'
981 minp pointer to int for min
982 maxp pointer to int for max
983 returned as -1 if no max
984 errorcodeptr points to error code variable
985
986 Returns: pointer to '}' on success;
987 current ptr on error, with errorcodeptr set non-zero
988 */
989
990 static const uschar *
991 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
992 {
993 int min = 0;
994 int max = -1;
995
996 /* Read the minimum value and do a paranoid check: a negative value indicates
997 an integer overflow. */
998
999 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
1000 if (min < 0 || min > 65535)
1001 {
1002 *errorcodeptr = ERR5;
1003 return p;
1004 }
1005
1006 /* Read the maximum value if there is one, and again do a paranoid on its size.
1007 Also, max must not be less than min. */
1008
1009 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1010 {
1011 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1012 {
1013 max = 0;
1014 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
1015 if (max < 0 || max > 65535)
1016 {
1017 *errorcodeptr = ERR5;
1018 return p;
1019 }
1020 if (max < min)
1021 {
1022 *errorcodeptr = ERR4;
1023 return p;
1024 }
1025 }
1026 }
1027
1028 /* Fill in the required variables, and pass back the pointer to the terminating
1029 '}'. */
1030
1031 *minp = min;
1032 *maxp = max;
1033 return p;
1034 }
1035
1036
1037
1038 /*************************************************
1039 * Subroutine for finding forward reference *
1040 *************************************************/
1041
1042 /* This recursive function is called only from find_parens() below. The
1043 top-level call starts at the beginning of the pattern. All other calls must
1044 start at a parenthesis. It scans along a pattern's text looking for capturing
1045 subpatterns, and counting them. If it finds a named pattern that matches the
1046 name it is given, it returns its number. Alternatively, if the name is NULL, it
1047 returns when it reaches a given numbered subpattern. We know that if (?P< is
1048 encountered, the name will be terminated by '>' because that is checked in the
1049 first pass. Recursion is used to keep track of subpatterns that reset the
1050 capturing group numbers - the (?| feature.
1051
1052 Arguments:
1053 ptrptr address of the current character pointer (updated)
1054 cd compile background data
1055 name name to seek, or NULL if seeking a numbered subpattern
1056 lorn name length, or subpattern number if name is NULL
1057 xmode TRUE if we are in /x mode
1058 count pointer to the current capturing subpattern number (updated)
1059
1060 Returns: the number of the named subpattern, or -1 if not found
1061 */
1062
1063 static int
1064 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1065 BOOL xmode, int *count)
1066 {
1067 uschar *ptr = *ptrptr;
1068 int start_count = *count;
1069 int hwm_count = start_count;
1070 BOOL dup_parens = FALSE;
1071
1072 /* If the first character is a parenthesis, check on the type of group we are
1073 dealing with. The very first call may not start with a parenthesis. */
1074
1075 if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1076 {
1077 if (ptr[1] == CHAR_QUESTION_MARK &&
1078 ptr[2] == CHAR_VERTICAL_LINE)
1079 {
1080 ptr += 3;
1081 dup_parens = TRUE;
1082 }
1083
1084 /* Handle a normal, unnamed capturing parenthesis */
1085
1086 else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1087 {
1088 *count += 1;
1089 if (name == NULL && *count == lorn) return *count;
1090 ptr++;
1091 }
1092
1093 /* Handle a condition. If it is an assertion, just carry on so that it
1094 is processed as normal. If not, skip to the closing parenthesis of the
1095 condition (there can't be any nested parens. */
1096
1097 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1098 {
1099 ptr += 2;
1100 if (ptr[1] != CHAR_QUESTION_MARK)
1101 {
1102 while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1103 if (*ptr != 0) ptr++;
1104 }
1105 }
1106
1107 /* We have either (? or (* and not a condition */
1108
1109 else
1110 {
1111 ptr += 2;
1112 if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1113
1114 /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1115
1116 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1117 ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1118 {
1119 int term;
1120 const uschar *thisname;
1121 *count += 1;
1122 if (name == NULL && *count == lorn) return *count;
1123 term = *ptr++;
1124 if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1125 thisname = ptr;
1126 while (*ptr != term) ptr++;
1127 if (name != NULL && lorn == ptr - thisname &&
1128 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1129 return *count;
1130 term++;
1131 }
1132 }
1133 }
1134
1135 /* Past any initial parenthesis handling, scan for parentheses or vertical
1136 bars. */
1137
1138 for (; *ptr != 0; ptr++)
1139 {
1140 /* Skip over backslashed characters and also entire \Q...\E */
1141
1142 if (*ptr == CHAR_BACKSLASH)
1143 {
1144 if (*(++ptr) == 0) goto FAIL_EXIT;
1145 if (*ptr == CHAR_Q) for (;;)
1146 {
1147 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1148 if (*ptr == 0) goto FAIL_EXIT;
1149 if (*(++ptr) == CHAR_E) break;
1150 }
1151 continue;
1152 }
1153
1154 /* Skip over character classes; this logic must be similar to the way they
1155 are handled for real. If the first character is '^', skip it. Also, if the
1156 first few characters (either before or after ^) are \Q\E or \E we skip them
1157 too. This makes for compatibility with Perl. Note the use of STR macros to
1158 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1159
1160 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1161 {
1162 BOOL negate_class = FALSE;
1163 for (;;)
1164 {
1165 if (ptr[1] == CHAR_BACKSLASH)
1166 {
1167 if (ptr[2] == CHAR_E)
1168 ptr+= 2;
1169 else if (strncmp((const char *)ptr+2,
1170 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1171 ptr += 4;
1172 else
1173 break;
1174 }
1175 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1176 {
1177 negate_class = TRUE;
1178 ptr++;
1179 }
1180 else break;
1181 }
1182
1183 /* If the next character is ']', it is a data character that must be
1184 skipped, except in JavaScript compatibility mode. */
1185
1186 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1187 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1188 ptr++;
1189
1190 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1191 {
1192 if (*ptr == 0) return -1;
1193 if (*ptr == CHAR_BACKSLASH)
1194 {
1195 if (*(++ptr) == 0) goto FAIL_EXIT;
1196 if (*ptr == CHAR_Q) for (;;)
1197 {
1198 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1199 if (*ptr == 0) goto FAIL_EXIT;
1200 if (*(++ptr) == CHAR_E) break;
1201 }
1202 continue;
1203 }
1204 }
1205 continue;
1206 }
1207
1208 /* Skip comments in /x mode */
1209
1210 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1211 {
1212 while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1213 if (*ptr == 0) goto FAIL_EXIT;
1214 continue;
1215 }
1216
1217 /* Check for the special metacharacters */
1218
1219 if (*ptr == CHAR_LEFT_PARENTHESIS)
1220 {
1221 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1222 if (rc > 0) return rc;
1223 if (*ptr == 0) goto FAIL_EXIT;
1224 }
1225
1226 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1227 {
1228 if (dup_parens && *count < hwm_count) *count = hwm_count;
1229 *ptrptr = ptr;
1230 return -1;
1231 }
1232
1233 else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1234 {
1235 if (*count > hwm_count) hwm_count = *count;
1236 *count = start_count;
1237 }
1238 }
1239
1240 FAIL_EXIT:
1241 *ptrptr = ptr;
1242 return -1;
1243 }
1244
1245
1246
1247
1248 /*************************************************
1249 * Find forward referenced subpattern *
1250 *************************************************/
1251
1252 /* This function scans along a pattern's text looking for capturing
1253 subpatterns, and counting them. If it finds a named pattern that matches the
1254 name it is given, it returns its number. Alternatively, if the name is NULL, it
1255 returns when it reaches a given numbered subpattern. This is used for forward
1256 references to subpatterns. We used to be able to start this scan from the
1257 current compiling point, using the current count value from cd->bracount, and
1258 do it all in a single loop, but the addition of the possibility of duplicate
1259 subpattern numbers means that we have to scan from the very start, in order to
1260 take account of such duplicates, and to use a recursive function to keep track
1261 of the different types of group.
1262
1263 Arguments:
1264 cd compile background data
1265 name name to seek, or NULL if seeking a numbered subpattern
1266 lorn name length, or subpattern number if name is NULL
1267 xmode TRUE if we are in /x mode
1268
1269 Returns: the number of the found subpattern, or -1 if not found
1270 */
1271
1272 static int
1273 find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1274 {
1275 uschar *ptr = (uschar *)cd->start_pattern;
1276 int count = 0;
1277 int rc;
1278
1279 /* If the pattern does not start with an opening parenthesis, the first call
1280 to find_parens_sub() will scan right to the end (if necessary). However, if it
1281 does start with a parenthesis, find_parens_sub() will return when it hits the
1282 matching closing parens. That is why we have to have a loop. */
1283
1284 for (;;)
1285 {
1286 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1287 if (rc > 0 || *ptr++ == 0) break;
1288 }
1289
1290 return rc;
1291 }
1292
1293
1294
1295
1296 /*************************************************
1297 * Find first significant op code *
1298 *************************************************/
1299
1300 /* This is called by several functions that scan a compiled expression looking
1301 for a fixed first character, or an anchoring op code etc. It skips over things
1302 that do not influence this. For some calls, a change of option is important.
1303 For some calls, it makes sense to skip negative forward and all backward
1304 assertions, and also the \b assertion; for others it does not.
1305
1306 Arguments:
1307 code pointer to the start of the group
1308 options pointer to external options
1309 optbit the option bit whose changing is significant, or
1310 zero if none are
1311 skipassert TRUE if certain assertions are to be skipped
1312
1313 Returns: pointer to the first significant opcode
1314 */
1315
1316 static const uschar*
1317 first_significant_code(const uschar *code, int *options, int optbit,
1318 BOOL skipassert)
1319 {
1320 for (;;)
1321 {
1322 switch ((int)*code)
1323 {
1324 case OP_OPT:
1325 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1326 *options = (int)code[1];
1327 code += 2;
1328 break;
1329
1330 case OP_ASSERT_NOT:
1331 case OP_ASSERTBACK:
1332 case OP_ASSERTBACK_NOT:
1333 if (!skipassert) return code;
1334 do code += GET(code, 1); while (*code == OP_ALT);
1335 code += _pcre_OP_lengths[*code];
1336 break;
1337
1338 case OP_WORD_BOUNDARY:
1339 case OP_NOT_WORD_BOUNDARY:
1340 if (!skipassert) return code;
1341 /* Fall through */
1342
1343 case OP_CALLOUT:
1344 case OP_CREF:
1345 case OP_NCREF:
1346 case OP_RREF:
1347 case OP_NRREF:
1348 case OP_DEF:
1349 code += _pcre_OP_lengths[*code];
1350 break;
1351
1352 default:
1353 return code;
1354 }
1355 }
1356 /* Control never reaches here */
1357 }
1358
1359
1360
1361
1362 /*************************************************
1363 * Find the fixed length of a branch *
1364 *************************************************/
1365
1366 /* Scan a branch and compute the fixed length of subject that will match it,
1367 if the length is fixed. This is needed for dealing with backward assertions.
1368 In UTF8 mode, the result is in characters rather than bytes. The branch is
1369 temporarily terminated with OP_END when this function is called.
1370
1371 This function is called when a backward assertion is encountered, so that if it
1372 fails, the error message can point to the correct place in the pattern.
1373 However, we cannot do this when the assertion contains subroutine calls,
1374 because they can be forward references. We solve this by remembering this case
1375 and doing the check at the end; a flag specifies which mode we are running in.
1376
1377 Arguments:
1378 code points to the start of the pattern (the bracket)
1379 options the compiling options
1380 atend TRUE if called when the pattern is complete
1381 cd the "compile data" structure
1382
1383 Returns: the fixed length,
1384 or -1 if there is no fixed length,
1385 or -2 if \C was encountered
1386 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1387 */
1388
1389 static int
1390 find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1391 {
1392 int length = -1;
1393
1394 register int branchlength = 0;
1395 register uschar *cc = code + 1 + LINK_SIZE;
1396
1397 /* Scan along the opcodes for this branch. If we get to the end of the
1398 branch, check the length against that of the other branches. */
1399
1400 for (;;)
1401 {
1402 int d;
1403 uschar *ce, *cs;
1404 register int op = *cc;
1405 switch (op)
1406 {
1407 case OP_CBRA:
1408 case OP_BRA:
1409 case OP_ONCE:
1410 case OP_COND:
1411 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1412 if (d < 0) return d;
1413 branchlength += d;
1414 do cc += GET(cc, 1); while (*cc == OP_ALT);
1415 cc += 1 + LINK_SIZE;
1416 break;
1417
1418 /* Reached end of a branch; if it's a ket it is the end of a nested
1419 call. If it's ALT it is an alternation in a nested call. If it is
1420 END it's the end of the outer call. All can be handled by the same code. */
1421
1422 case OP_ALT:
1423 case OP_KET:
1424 case OP_KETRMAX:
1425 case OP_KETRMIN:
1426 case OP_END:
1427 if (length < 0) length = branchlength;
1428 else if (length != branchlength) return -1;
1429 if (*cc != OP_ALT) return length;
1430 cc += 1 + LINK_SIZE;
1431 branchlength = 0;
1432 break;
1433
1434 /* A true recursion implies not fixed length, but a subroutine call may
1435 be OK. If the subroutine is a forward reference, we can't deal with
1436 it until the end of the pattern, so return -3. */
1437
1438 case OP_RECURSE:
1439 if (!atend) return -3;
1440 cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1441 do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1442 if (cc > cs && cc < ce) return -1; /* Recursion */
1443 d = find_fixedlength(cs + 2, options, atend, cd);
1444 if (d < 0) return d;
1445 branchlength += d;
1446 cc += 1 + LINK_SIZE;
1447 break;
1448
1449 /* Skip over assertive subpatterns */
1450
1451 case OP_ASSERT:
1452 case OP_ASSERT_NOT:
1453 case OP_ASSERTBACK:
1454 case OP_ASSERTBACK_NOT:
1455 do cc += GET(cc, 1); while (*cc == OP_ALT);
1456 /* Fall through */
1457
1458 /* Skip over things that don't match chars */
1459
1460 case OP_REVERSE:
1461 case OP_CREF:
1462 case OP_NCREF:
1463 case OP_RREF:
1464 case OP_NRREF:
1465 case OP_DEF:
1466 case OP_OPT:
1467 case OP_CALLOUT:
1468 case OP_SOD:
1469 case OP_SOM:
1470 case OP_SET_SOM:
1471 case OP_EOD:
1472 case OP_EODN:
1473 case OP_CIRC:
1474 case OP_DOLL:
1475 case OP_NOT_WORD_BOUNDARY:
1476 case OP_WORD_BOUNDARY:
1477 cc += _pcre_OP_lengths[*cc];
1478 break;
1479
1480 /* Handle literal characters */
1481
1482 case OP_CHAR:
1483 case OP_CHARNC:
1484 case OP_NOT:
1485 branchlength++;
1486 cc += 2;
1487 #ifdef SUPPORT_UTF8
1488 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1489 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1490 #endif
1491 break;
1492
1493 /* Handle exact repetitions. The count is already in characters, but we
1494 need to skip over a multibyte character in UTF8 mode. */
1495
1496 case OP_EXACT:
1497 branchlength += GET2(cc,1);
1498 cc += 4;
1499 #ifdef SUPPORT_UTF8
1500 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1501 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1502 #endif
1503 break;
1504
1505 case OP_TYPEEXACT:
1506 branchlength += GET2(cc,1);
1507 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1508 cc += 4;
1509 break;
1510
1511 /* Handle single-char matchers */
1512
1513 case OP_PROP:
1514 case OP_NOTPROP:
1515 cc += 2;
1516 /* Fall through */
1517
1518 case OP_NOT_DIGIT:
1519 case OP_DIGIT:
1520 case OP_NOT_WHITESPACE:
1521 case OP_WHITESPACE:
1522 case OP_NOT_WORDCHAR:
1523 case OP_WORDCHAR:
1524 case OP_ANY:
1525 case OP_ALLANY:
1526 branchlength++;
1527 cc++;
1528 break;
1529
1530 /* The single-byte matcher isn't allowed */
1531
1532 case OP_ANYBYTE:
1533 return -2;
1534
1535 /* Check a class for variable quantification */
1536
1537 #ifdef SUPPORT_UTF8
1538 case OP_XCLASS:
1539 cc += GET(cc, 1) - 33;
1540 /* Fall through */
1541 #endif
1542
1543 case OP_CLASS:
1544 case OP_NCLASS:
1545 cc += 33;
1546
1547 switch (*cc)
1548 {
1549 case OP_CRSTAR:
1550 case OP_CRMINSTAR:
1551 case OP_CRQUERY:
1552 case OP_CRMINQUERY:
1553 return -1;
1554
1555 case OP_CRRANGE:
1556 case OP_CRMINRANGE:
1557 if (GET2(cc,1) != GET2(cc,3)) return -1;
1558 branchlength += GET2(cc,1);
1559 cc += 5;
1560 break;
1561
1562 default:
1563 branchlength++;
1564 }
1565 break;
1566
1567 /* Anything else is variable length */
1568
1569 default:
1570 return -1;
1571 }
1572 }
1573 /* Control never gets here */
1574 }
1575
1576
1577
1578
1579 /*************************************************
1580 * Scan compiled regex for specific bracket *
1581 *************************************************/
1582
1583 /* This little function scans through a compiled pattern until it finds a
1584 capturing bracket with the given number, or, if the number is negative, an
1585 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1586 so that it can be called from pcre_study() when finding the minimum matching
1587 length.
1588
1589 Arguments:
1590 code points to start of expression
1591 utf8 TRUE in UTF-8 mode
1592 number the required bracket number or negative to find a lookbehind
1593
1594 Returns: pointer to the opcode for the bracket, or NULL if not found
1595 */
1596
1597 const uschar *
1598 _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1599 {
1600 for (;;)
1601 {
1602 register int c = *code;
1603 if (c == OP_END) return NULL;
1604
1605 /* XCLASS is used for classes that cannot be represented just by a bit
1606 map. This includes negated single high-valued characters. The length in
1607 the table is zero; the actual length is stored in the compiled code. */
1608
1609 if (c == OP_XCLASS) code += GET(code, 1);
1610
1611 /* Handle recursion */
1612
1613 else if (c == OP_REVERSE)
1614 {
1615 if (number < 0) return (uschar *)code;
1616 code += _pcre_OP_lengths[c];
1617 }
1618
1619 /* Handle capturing bracket */
1620
1621 else if (c == OP_CBRA)
1622 {
1623 int n = GET2(code, 1+LINK_SIZE);
1624 if (n == number) return (uschar *)code;
1625 code += _pcre_OP_lengths[c];
1626 }
1627
1628 /* Otherwise, we can get the item's length from the table, except that for
1629 repeated character types, we have to test for \p and \P, which have an extra
1630 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1631 must add in its length. */
1632
1633 else
1634 {
1635 switch(c)
1636 {
1637 case OP_TYPESTAR:
1638 case OP_TYPEMINSTAR:
1639 case OP_TYPEPLUS:
1640 case OP_TYPEMINPLUS:
1641 case OP_TYPEQUERY:
1642 case OP_TYPEMINQUERY:
1643 case OP_TYPEPOSSTAR:
1644 case OP_TYPEPOSPLUS:
1645 case OP_TYPEPOSQUERY:
1646 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1647 break;
1648
1649 case OP_TYPEUPTO:
1650 case OP_TYPEMINUPTO:
1651 case OP_TYPEEXACT:
1652 case OP_TYPEPOSUPTO:
1653 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1654 break;
1655
1656 case OP_MARK:
1657 case OP_PRUNE_ARG:
1658 case OP_SKIP_ARG:
1659 case OP_THEN_ARG:
1660 code += code[1];
1661 break;
1662 }
1663
1664 /* Add in the fixed length from the table */
1665
1666 code += _pcre_OP_lengths[c];
1667
1668 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1669 a multi-byte character. The length in the table is a minimum, so we have to
1670 arrange to skip the extra bytes. */
1671
1672 #ifdef SUPPORT_UTF8
1673 if (utf8) switch(c)
1674 {
1675 case OP_CHAR:
1676 case OP_CHARNC:
1677 case OP_EXACT:
1678 case OP_UPTO:
1679 case OP_MINUPTO:
1680 case OP_POSUPTO:
1681 case OP_STAR:
1682 case OP_MINSTAR:
1683 case OP_POSSTAR:
1684 case OP_PLUS:
1685 case OP_MINPLUS:
1686 case OP_POSPLUS:
1687 case OP_QUERY:
1688 case OP_MINQUERY:
1689 case OP_POSQUERY:
1690 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1691 break;
1692 }
1693 #else
1694 (void)(utf8); /* Keep compiler happy by referencing function argument */
1695 #endif
1696 }
1697 }
1698 }
1699
1700
1701
1702 /*************************************************
1703 * Scan compiled regex for recursion reference *
1704 *************************************************/
1705
1706 /* This little function scans through a compiled pattern until it finds an
1707 instance of OP_RECURSE.
1708
1709 Arguments:
1710 code points to start of expression
1711 utf8 TRUE in UTF-8 mode
1712
1713 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1714 */
1715
1716 static const uschar *
1717 find_recurse(const uschar *code, BOOL utf8)
1718 {
1719 for (;;)
1720 {
1721 register int c = *code;
1722 if (c == OP_END) return NULL;
1723 if (c == OP_RECURSE) return code;
1724
1725 /* XCLASS is used for classes that cannot be represented just by a bit
1726 map. This includes negated single high-valued characters. The length in
1727 the table is zero; the actual length is stored in the compiled code. */
1728
1729 if (c == OP_XCLASS) code += GET(code, 1);
1730
1731 /* Otherwise, we can get the item's length from the table, except that for
1732 repeated character types, we have to test for \p and \P, which have an extra
1733 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1734 must add in its length. */
1735
1736 else
1737 {
1738 switch(c)
1739 {
1740 case OP_TYPESTAR:
1741 case OP_TYPEMINSTAR:
1742 case OP_TYPEPLUS:
1743 case OP_TYPEMINPLUS:
1744 case OP_TYPEQUERY:
1745 case OP_TYPEMINQUERY:
1746 case OP_TYPEPOSSTAR:
1747 case OP_TYPEPOSPLUS:
1748 case OP_TYPEPOSQUERY:
1749 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1750 break;
1751
1752 case OP_TYPEPOSUPTO:
1753 case OP_TYPEUPTO:
1754 case OP_TYPEMINUPTO:
1755 case OP_TYPEEXACT:
1756 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1757 break;
1758
1759 case OP_MARK:
1760 case OP_PRUNE_ARG:
1761 case OP_SKIP_ARG:
1762 case OP_THEN_ARG:
1763 code += code[1];
1764 break;
1765 }
1766
1767 /* Add in the fixed length from the table */
1768
1769 code += _pcre_OP_lengths[c];
1770
1771 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1772 by a multi-byte character. The length in the table is a minimum, so we have
1773 to arrange to skip the extra bytes. */
1774
1775 #ifdef SUPPORT_UTF8
1776 if (utf8) switch(c)
1777 {
1778 case OP_CHAR:
1779 case OP_CHARNC:
1780 case OP_EXACT:
1781 case OP_UPTO:
1782 case OP_MINUPTO:
1783 case OP_POSUPTO:
1784 case OP_STAR:
1785 case OP_MINSTAR:
1786 case OP_POSSTAR:
1787 case OP_PLUS:
1788 case OP_MINPLUS:
1789 case OP_POSPLUS:
1790 case OP_QUERY:
1791 case OP_MINQUERY:
1792 case OP_POSQUERY:
1793 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1794 break;
1795 }
1796 #else
1797 (void)(utf8); /* Keep compiler happy by referencing function argument */
1798 #endif
1799 }
1800 }
1801 }
1802
1803
1804
1805 /*************************************************
1806 * Scan compiled branch for non-emptiness *
1807 *************************************************/
1808
1809 /* This function scans through a branch of a compiled pattern to see whether it
1810 can match the empty string or not. It is called from could_be_empty()
1811 below and from compile_branch() when checking for an unlimited repeat of a
1812 group that can match nothing. Note that first_significant_code() skips over
1813 backward and negative forward assertions when its final argument is TRUE. If we
1814 hit an unclosed bracket, we return "empty" - this means we've struck an inner
1815 bracket whose current branch will already have been scanned.
1816
1817 Arguments:
1818 code points to start of search
1819 endcode points to where to stop
1820 utf8 TRUE if in UTF8 mode
1821 cd contains pointers to tables etc.
1822
1823 Returns: TRUE if what is matched could be empty
1824 */
1825
1826 static BOOL
1827 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
1828 compile_data *cd)
1829 {
1830 register int c;
1831 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1832 code < endcode;
1833 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1834 {
1835 const uschar *ccode;
1836
1837 c = *code;
1838
1839 /* Skip over forward assertions; the other assertions are skipped by
1840 first_significant_code() with a TRUE final argument. */
1841
1842 if (c == OP_ASSERT)
1843 {
1844 do code += GET(code, 1); while (*code == OP_ALT);
1845 c = *code;
1846 continue;
1847 }
1848
1849 /* Groups with zero repeats can of course be empty; skip them. */
1850
1851 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1852 {
1853 code += _pcre_OP_lengths[c];
1854 do code += GET(code, 1); while (*code == OP_ALT);
1855 c = *code;
1856 continue;
1857 }
1858
1859 /* For a recursion/subroutine call, if its end has been reached, which
1860 implies a subroutine call, we can scan it. */
1861
1862 if (c == OP_RECURSE)
1863 {
1864 BOOL empty_branch = FALSE;
1865 const uschar *scode = cd->start_code + GET(code, 1);
1866 if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
1867 do
1868 {
1869 if (could_be_empty_branch(scode, endcode, utf8, cd))
1870 {
1871 empty_branch = TRUE;
1872 break;
1873 }
1874 scode += GET(scode, 1);
1875 }
1876 while (*scode == OP_ALT);
1877 if (!empty_branch) return FALSE; /* All branches are non-empty */
1878 continue;
1879 }
1880
1881 /* For other groups, scan the branches. */
1882
1883 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1884 {
1885 BOOL empty_branch;
1886 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1887
1888 /* If a conditional group has only one branch, there is a second, implied,
1889 empty branch, so just skip over the conditional, because it could be empty.
1890 Otherwise, scan the individual branches of the group. */
1891
1892 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1893 code += GET(code, 1);
1894 else
1895 {
1896 empty_branch = FALSE;
1897 do
1898 {
1899 if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
1900 empty_branch = TRUE;
1901 code += GET(code, 1);
1902 }
1903 while (*code == OP_ALT);
1904 if (!empty_branch) return FALSE; /* All branches are non-empty */
1905 }
1906
1907 c = *code;
1908 continue;
1909 }
1910
1911 /* Handle the other opcodes */
1912
1913 switch (c)
1914 {
1915 /* Check for quantifiers after a class. XCLASS is used for classes that
1916 cannot be represented just by a bit map. This includes negated single
1917 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1918 actual length is stored in the compiled code, so we must update "code"
1919 here. */
1920
1921 #ifdef SUPPORT_UTF8
1922 case OP_XCLASS:
1923 ccode = code += GET(code, 1);
1924 goto CHECK_CLASS_REPEAT;
1925 #endif
1926
1927 case OP_CLASS:
1928 case OP_NCLASS:
1929 ccode = code + 33;
1930
1931 #ifdef SUPPORT_UTF8
1932 CHECK_CLASS_REPEAT:
1933 #endif
1934
1935 switch (*ccode)
1936 {
1937 case OP_CRSTAR: /* These could be empty; continue */
1938 case OP_CRMINSTAR:
1939 case OP_CRQUERY:
1940 case OP_CRMINQUERY:
1941 break;
1942
1943 default: /* Non-repeat => class must match */
1944 case OP_CRPLUS: /* These repeats aren't empty */
1945 case OP_CRMINPLUS:
1946 return FALSE;
1947
1948 case OP_CRRANGE:
1949 case OP_CRMINRANGE:
1950 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1951 break;
1952 }
1953 break;
1954
1955 /* Opcodes that must match a character */
1956
1957 case OP_PROP:
1958 case OP_NOTPROP:
1959 case OP_EXTUNI:
1960 case OP_NOT_DIGIT:
1961 case OP_DIGIT:
1962 case OP_NOT_WHITESPACE:
1963 case OP_WHITESPACE:
1964 case OP_NOT_WORDCHAR:
1965 case OP_WORDCHAR:
1966 case OP_ANY:
1967 case OP_ALLANY:
1968 case OP_ANYBYTE:
1969 case OP_CHAR:
1970 case OP_CHARNC:
1971 case OP_NOT:
1972 case OP_PLUS:
1973 case OP_MINPLUS:
1974 case OP_POSPLUS:
1975 case OP_EXACT:
1976 case OP_NOTPLUS:
1977 case OP_NOTMINPLUS:
1978 case OP_NOTPOSPLUS:
1979 case OP_NOTEXACT:
1980 case OP_TYPEPLUS:
1981 case OP_TYPEMINPLUS:
1982 case OP_TYPEPOSPLUS:
1983 case OP_TYPEEXACT:
1984 return FALSE;
1985
1986 /* These are going to continue, as they may be empty, but we have to
1987 fudge the length for the \p and \P cases. */
1988
1989 case OP_TYPESTAR:
1990 case OP_TYPEMINSTAR:
1991 case OP_TYPEPOSSTAR:
1992 case OP_TYPEQUERY:
1993 case OP_TYPEMINQUERY:
1994 case OP_TYPEPOSQUERY:
1995 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1996 break;
1997
1998 /* Same for these */
1999
2000 case OP_TYPEUPTO:
2001 case OP_TYPEMINUPTO:
2002 case OP_TYPEPOSUPTO:
2003 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
2004 break;
2005
2006 /* End of branch */
2007
2008 case OP_KET:
2009 case OP_KETRMAX:
2010 case OP_KETRMIN:
2011 case OP_ALT:
2012 return TRUE;
2013
2014 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2015 MINUPTO, and POSUPTO may be followed by a multibyte character */
2016
2017 #ifdef SUPPORT_UTF8
2018 case OP_STAR:
2019 case OP_MINSTAR:
2020 case OP_POSSTAR:
2021 case OP_QUERY:
2022 case OP_MINQUERY:
2023 case OP_POSQUERY:
2024 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
2025 break;
2026
2027 case OP_UPTO:
2028 case OP_MINUPTO:
2029 case OP_POSUPTO:
2030 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
2031 break;
2032 #endif
2033
2034 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2035 string. */
2036
2037 case OP_MARK:
2038 case OP_PRUNE_ARG:
2039 case OP_SKIP_ARG:
2040 case OP_THEN_ARG:
2041 code += code[1];
2042 break;
2043
2044 /* None of the remaining opcodes are required to match a character. */
2045
2046 default:
2047 break;
2048 }
2049 }
2050
2051 return TRUE;
2052 }
2053
2054
2055
2056 /*************************************************
2057 * Scan compiled regex for non-emptiness *
2058 *************************************************/
2059
2060 /* This function is called to check for left recursive calls. We want to check
2061 the current branch of the current pattern to see if it could match the empty
2062 string. If it could, we must look outwards for branches at other levels,
2063 stopping when we pass beyond the bracket which is the subject of the recursion.
2064
2065 Arguments:
2066 code points to start of the recursion
2067 endcode points to where to stop (current RECURSE item)
2068 bcptr points to the chain of current (unclosed) branch starts
2069 utf8 TRUE if in UTF-8 mode
2070 cd pointers to tables etc
2071
2072 Returns: TRUE if what is matched could be empty
2073 */
2074
2075 static BOOL
2076 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
2077 BOOL utf8, compile_data *cd)
2078 {
2079 while (bcptr != NULL && bcptr->current_branch >= code)
2080 {
2081 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
2082 return FALSE;
2083 bcptr = bcptr->outer;
2084 }
2085 return TRUE;
2086 }
2087
2088
2089
2090 /*************************************************
2091 * Check for POSIX class syntax *
2092 *************************************************/
2093
2094 /* This function is called when the sequence "[:" or "[." or "[=" is
2095 encountered in a character class. It checks whether this is followed by a
2096 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2097 reach an unescaped ']' without the special preceding character, return FALSE.
2098
2099 Originally, this function only recognized a sequence of letters between the
2100 terminators, but it seems that Perl recognizes any sequence of characters,
2101 though of course unknown POSIX names are subsequently rejected. Perl gives an
2102 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2103 didn't consider this to be a POSIX class. Likewise for [:1234:].
2104
2105 The problem in trying to be exactly like Perl is in the handling of escapes. We
2106 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2107 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2108 below handles the special case of \], but does not try to do any other escape
2109 processing. This makes it different from Perl for cases such as [:l\ower:]
2110 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2111 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
2112 I think.
2113
2114 Arguments:
2115 ptr pointer to the initial [
2116 endptr where to return the end pointer
2117
2118 Returns: TRUE or FALSE
2119 */
2120
2121 static BOOL
2122 check_posix_syntax(const uschar *ptr, const uschar **endptr)
2123 {
2124 int terminator; /* Don't combine these lines; the Solaris cc */
2125 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2126 for (++ptr; *ptr != 0; ptr++)
2127 {
2128 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2129 {
2130 if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2131 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2132 {
2133 *endptr = ptr;
2134 return TRUE;
2135 }
2136 }
2137 }
2138 return FALSE;
2139 }
2140
2141
2142
2143
2144 /*************************************************
2145 * Check POSIX class name *
2146 *************************************************/
2147
2148 /* This function is called to check the name given in a POSIX-style class entry
2149 such as [:alnum:].
2150
2151 Arguments:
2152 ptr points to the first letter
2153 len the length of the name
2154
2155 Returns: a value representing the name, or -1 if unknown
2156 */
2157
2158 static int
2159 check_posix_name(const uschar *ptr, int len)
2160 {
2161 const char *pn = posix_names;
2162 register int yield = 0;
2163 while (posix_name_lengths[yield] != 0)
2164 {
2165 if (len == posix_name_lengths[yield] &&
2166 strncmp((const char *)ptr, pn, len) == 0) return yield;
2167 pn += posix_name_lengths[yield] + 1;
2168 yield++;
2169 }
2170 return -1;
2171 }
2172
2173
2174 /*************************************************
2175 * Adjust OP_RECURSE items in repeated group *
2176 *************************************************/
2177
2178 /* OP_RECURSE items contain an offset from the start of the regex to the group
2179 that is referenced. This means that groups can be replicated for fixed
2180 repetition simply by copying (because the recursion is allowed to refer to
2181 earlier groups that are outside the current group). However, when a group is
2182 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2183 inserted before it, after it has been compiled. This means that any OP_RECURSE
2184 items within it that refer to the group itself or any contained groups have to
2185 have their offsets adjusted. That one of the jobs of this function. Before it
2186 is called, the partially compiled regex must be temporarily terminated with
2187 OP_END.
2188
2189 This function has been extended with the possibility of forward references for
2190 recursions and subroutine calls. It must also check the list of such references
2191 for the group we are dealing with. If it finds that one of the recursions in
2192 the current group is on this list, it adjusts the offset in the list, not the
2193 value in the reference (which is a group number).
2194
2195 Arguments:
2196 group points to the start of the group
2197 adjust the amount by which the group is to be moved
2198 utf8 TRUE in UTF-8 mode
2199 cd contains pointers to tables etc.
2200 save_hwm the hwm forward reference pointer at the start of the group
2201
2202 Returns: nothing
2203 */
2204
2205 static void
2206 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2207 uschar *save_hwm)
2208 {
2209 uschar *ptr = group;
2210
2211 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2212 {
2213 int offset;
2214 uschar *hc;
2215
2216 /* See if this recursion is on the forward reference list. If so, adjust the
2217 reference. */
2218
2219 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2220 {
2221 offset = GET(hc, 0);
2222 if (cd->start_code + offset == ptr + 1)
2223 {
2224 PUT(hc, 0, offset + adjust);
2225 break;
2226 }
2227 }
2228
2229 /* Otherwise, adjust the recursion offset if it's after the start of this
2230 group. */
2231
2232 if (hc >= cd->hwm)
2233 {
2234 offset = GET(ptr, 1);
2235 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2236 }
2237
2238 ptr += 1 + LINK_SIZE;
2239 }
2240 }
2241
2242
2243
2244 /*************************************************
2245 * Insert an automatic callout point *
2246 *************************************************/
2247
2248 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2249 callout points before each pattern item.
2250
2251 Arguments:
2252 code current code pointer
2253 ptr current pattern pointer
2254 cd pointers to tables etc
2255
2256 Returns: new code pointer
2257 */
2258
2259 static uschar *
2260 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2261 {
2262 *code++ = OP_CALLOUT;
2263 *code++ = 255;
2264 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
2265 PUT(code, LINK_SIZE, 0); /* Default length */
2266 return code + 2*LINK_SIZE;
2267 }
2268
2269
2270
2271 /*************************************************
2272 * Complete a callout item *
2273 *************************************************/
2274
2275 /* A callout item contains the length of the next item in the pattern, which
2276 we can't fill in till after we have reached the relevant point. This is used
2277 for both automatic and manual callouts.
2278
2279 Arguments:
2280 previous_callout points to previous callout item
2281 ptr current pattern pointer
2282 cd pointers to tables etc
2283
2284 Returns: nothing
2285 */
2286
2287 static void
2288 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2289 {
2290 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2291 PUT(previous_callout, 2 + LINK_SIZE, length);
2292 }
2293
2294
2295
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the start and end of a class range (UTF-8 mode with UCP support),
find the next run of characters within it whose "other case" values are
different from the characters themselves and are consecutive. Characters that
are their own other case are skipped. Each call yields one such run and
advances the start value so that the caller can simply call again for the next
one.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int ch = *cptr;
unsigned int first_other = 0;
unsigned int expected;

/* Skip characters whose other case is themselves */

while (ch <= d)
  {
  first_other = UCD_OTHERCASE(ch);
  if (first_other != ch) break;
  ch++;
  }

if (ch > d) return FALSE;

/* Extend the run while successive characters map to successive values */

*ocptr = first_other;
expected = first_other + 1;
ch++;

while (ch <= d && UCD_OTHERCASE(ch) == expected)
  {
  ch++;
  expected++;
  }

*odptr = expected - 1;
*cptr = ch;

return TRUE;
}
#endif  /* SUPPORT_UCP */
2341
2342
2343
2344 /*************************************************
2345 * Check if auto-possessifying is possible *
2346 *************************************************/
2347
2348 /* This function is called for unlimited repeats of certain items, to see
2349 whether the next thing could possibly match the repeated item. If not, it makes
2350 sense to automatically possessify the repeated item.
2351
2352 Arguments:
2353 op_code the repeated op code
2354 this data for this item, depends on the opcode
2355 utf8 TRUE in UTF-8 mode
2356 utf8_char used for utf8 character bytes, NULL if not relevant
2357 ptr next character in pattern
2358 options options bits
2359 cd contains pointers to tables etc.
2360
2361 Returns: TRUE if possessifying is wanted
2362 */
2363
2364 static BOOL
2365 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2366 const uschar *ptr, int options, compile_data *cd)
2367 {
2368 int next;
2369
2370 /* Skip whitespace and comments in extended mode */
2371
2372 if ((options & PCRE_EXTENDED) != 0)
2373 {
2374 for (;;)
2375 {
2376 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2377 if (*ptr == CHAR_NUMBER_SIGN)
2378 {
2379 while (*(++ptr) != 0)
2380 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2381 }
2382 else break;
2383 }
2384 }
2385
2386 /* If the next item is one that we can handle, get its value. A non-negative
2387 value is a character, a negative value is an escape value. */
2388
2389 if (*ptr == CHAR_BACKSLASH)
2390 {
2391 int temperrorcode = 0;
2392 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2393 if (temperrorcode != 0) return FALSE;
2394 ptr++; /* Point after the escape sequence */
2395 }
2396
2397 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2398 {
2399 #ifdef SUPPORT_UTF8
2400 if (utf8) { GETCHARINC(next, ptr); } else
2401 #endif
2402 next = *ptr++;
2403 }
2404
2405 else return FALSE;
2406
2407 /* Skip whitespace and comments in extended mode */
2408
2409 if ((options & PCRE_EXTENDED) != 0)
2410 {
2411 for (;;)
2412 {
2413 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2414 if (*ptr == CHAR_NUMBER_SIGN)
2415 {
2416 while (*(++ptr) != 0)
2417 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2418 }
2419 else break;
2420 }
2421 }
2422
2423 /* If the next thing is itself optional, we have to give up. */
2424
2425 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2426 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2427 return FALSE;
2428
2429 /* Now compare the next item with the previous opcode. If the previous is a
2430 positive single character match, "item" either contains the character or, if
2431 "item" is greater than 127 in utf8 mode, the character's bytes are in
2432 utf8_char. */
2433
2434
2435 /* Handle cases when the next item is a character. */
2436
2437 if (next >= 0) switch(op_code)
2438 {
2439 case OP_CHAR:
2440 #ifdef SUPPORT_UTF8
2441 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2442 #else
2443 (void)(utf8_char); /* Keep compiler happy by referencing function argument */
2444 #endif
2445 return item != next;
2446
2447 /* For CHARNC (caseless character) we must check the other case. If we have
2448 Unicode property support, we can use it to test the other case of
2449 high-valued characters. */
2450
2451 case OP_CHARNC:
2452 #ifdef SUPPORT_UTF8
2453 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2454 #endif
2455 if (item == next) return FALSE;
2456 #ifdef SUPPORT_UTF8
2457 if (utf8)
2458 {
2459 unsigned int othercase;
2460 if (next < 128) othercase = cd->fcc[next]; else
2461 #ifdef SUPPORT_UCP
2462 othercase = UCD_OTHERCASE((unsigned int)next);
2463 #else
2464 othercase = NOTACHAR;
2465 #endif
2466 return (unsigned int)item != othercase;
2467 }
2468 else
2469 #endif /* SUPPORT_UTF8 */
2470 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2471
2472 /* For OP_NOT, "item" must be a single-byte character. */
2473
2474 case OP_NOT:
2475 if (item == next) return TRUE;
2476 if ((options & PCRE_CASELESS) == 0) return FALSE;
2477 #ifdef SUPPORT_UTF8
2478 if (utf8)
2479 {
2480 unsigned int othercase;
2481 if (next < 128) othercase = cd->fcc[next]; else
2482 #ifdef SUPPORT_UCP
2483 othercase = UCD_OTHERCASE(next);
2484 #else
2485 othercase = NOTACHAR;
2486 #endif
2487 return (unsigned int)item == othercase;
2488 }
2489 else
2490 #endif /* SUPPORT_UTF8 */
2491 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2492
2493 case OP_DIGIT:
2494 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2495
2496 case OP_NOT_DIGIT:
2497 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2498
2499 case OP_WHITESPACE:
2500 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2501
2502 case OP_NOT_WHITESPACE:
2503 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2504
2505 case OP_WORDCHAR:
2506 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2507
2508 case OP_NOT_WORDCHAR:
2509 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2510
2511 case OP_HSPACE:
2512 case OP_NOT_HSPACE:
2513 switch(next)
2514 {
2515 case 0x09:
2516 case 0x20:
2517 case 0xa0:
2518 case 0x1680:
2519 case 0x180e:
2520 case 0x2000:
2521 case 0x2001:
2522 case 0x2002:
2523 case 0x2003:
2524 case 0x2004:
2525 case 0x2005:
2526 case 0x2006:
2527 case 0x2007:
2528 case 0x2008:
2529 case 0x2009:
2530 case 0x200A:
2531 case 0x202f:
2532 case 0x205f:
2533 case 0x3000:
2534 return op_code != OP_HSPACE;
2535 default:
2536 return op_code == OP_HSPACE;
2537 }
2538
2539 case OP_VSPACE:
2540 case OP_NOT_VSPACE:
2541 switch(next)
2542 {
2543 case 0x0a:
2544 case 0x0b:
2545 case 0x0c:
2546 case 0x0d:
2547 case 0x85:
2548 case 0x2028:
2549 case 0x2029:
2550 return op_code != OP_VSPACE;
2551 default:
2552 return op_code == OP_VSPACE;
2553 }
2554
2555 default:
2556 return FALSE;
2557 }
2558
2559
2560 /* Handle the case when the next item is \d, \s, etc. */
2561
2562 switch(op_code)
2563 {
2564 case OP_CHAR:
2565 case OP_CHARNC:
2566 #ifdef SUPPORT_UTF8
2567 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2568 #endif
2569 switch(-next)
2570 {
2571 case ESC_d:
2572 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2573
2574 case ESC_D:
2575 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2576
2577 case ESC_s:
2578 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2579
2580 case ESC_S:
2581 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2582
2583 case ESC_w:
2584 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2585
2586 case ESC_W:
2587 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2588
2589 case ESC_h:
2590 case ESC_H:
2591 switch(item)
2592 {
2593 case 0x09:
2594 case 0x20:
2595 case 0xa0:
2596 case 0x1680:
2597 case 0x180e:
2598 case 0x2000:
2599 case 0x2001:
2600 case 0x2002:
2601 case 0x2003:
2602 case 0x2004:
2603 case 0x2005:
2604 case 0x2006:
2605 case 0x2007:
2606 case 0x2008:
2607 case 0x2009:
2608 case 0x200A:
2609 case 0x202f:
2610 case 0x205f:
2611 case 0x3000:
2612 return -next != ESC_h;
2613 default:
2614 return -next == ESC_h;
2615 }
2616
2617 case ESC_v:
2618 case ESC_V:
2619 switch(item)
2620 {
2621 case 0x0a:
2622 case 0x0b:
2623 case 0x0c:
2624 case 0x0d:
2625 case 0x85:
2626 case 0x2028:
2627 case 0x2029:
2628 return -next != ESC_v;
2629 default:
2630 return -next == ESC_v;
2631 }
2632
2633 default:
2634 return FALSE;
2635 }
2636
2637 case OP_DIGIT:
2638 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2639 next == -ESC_h || next == -ESC_v;
2640
2641 case OP_NOT_DIGIT:
2642 return next == -ESC_d;
2643
2644 case OP_WHITESPACE:
2645 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2646
2647 case OP_NOT_WHITESPACE:
2648 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2649
2650 case OP_HSPACE:
2651 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2652
2653 case OP_NOT_HSPACE:
2654 return next == -ESC_h;
2655
2656 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2657 case OP_VSPACE:
2658 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2659
2660 case OP_NOT_VSPACE:
2661 return next == -ESC_v;
2662
2663 case OP_WORDCHAR:
2664 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2665
2666 case OP_NOT_WORDCHAR:
2667 return next == -ESC_w || next == -ESC_d;
2668
2669 default:
2670 return FALSE;
2671 }
2672
2673 /* Control does not reach here */
2674 }
2675
2676
2677
2678 /*************************************************
2679 * Compile one branch *
2680 *************************************************/
2681
2682 /* Scan the pattern, compiling it into a vector. If the options are
2683 changed during the branch, the pointer is used to change the external options
2684 bits. This function is used during the pre-compile phase when we are trying
2685 to find out the amount of memory needed, as well as during the real compile
2686 phase. The value of lengthptr distinguishes the two phases.
2687
2688 Arguments:
2689 optionsptr pointer to the option bits
2690 codeptr points to the pointer to the current code point
2691 ptrptr points to the current pattern pointer
2692 errorcodeptr points to error code variable
2693 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2694 reqbyteptr set to the last literal character required, else < 0
2695 bcptr points to current branch chain
2696 cd contains pointers to tables etc.
2697 lengthptr NULL during the real compile phase
2698 points to length accumulator during pre-compile phase
2699
2700 Returns: TRUE on success
2701 FALSE, with *errorcodeptr set non-zero on error
2702 */
2703
2704 static BOOL
2705 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2706 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2707 compile_data *cd, int *lengthptr)
2708 {
2709 int repeat_type, op_type;
2710 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2711 int bravalue = 0;
2712 int greedy_default, greedy_non_default;
2713 int firstbyte, reqbyte;
2714 int zeroreqbyte, zerofirstbyte;
2715 int req_caseopt, reqvary, tempreqvary;
2716 int options = *optionsptr;
2717 int after_manual_callout = 0;
2718 int length_prevgroup = 0;
2719 register int c;
2720 register uschar *code = *codeptr;
2721 uschar *last_code = code;
2722 uschar *orig_code = code;
2723 uschar *tempcode;
2724 BOOL inescq = FALSE;
2725 BOOL groupsetfirstbyte = FALSE;
2726 const uschar *ptr = *ptrptr;
2727 const uschar *tempptr;
2728 uschar *previous = NULL;
2729 uschar *previous_callout = NULL;
2730 uschar *save_hwm = NULL;
2731 uschar classbits[32];
2732
2733 #ifdef SUPPORT_UTF8
2734 BOOL class_utf8;
2735 BOOL utf8 = (options & PCRE_UTF8) != 0;
2736 uschar *class_utf8data;
2737 uschar *class_utf8data_base;
2738 uschar utf8_char[6];
2739 #else
2740 BOOL utf8 = FALSE;
2741 uschar *utf8_char = NULL;
2742 #endif
2743
2744 #ifdef PCRE_DEBUG
2745 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2746 #endif
2747
2748 /* Set up the default and non-default settings for greediness */
2749
2750 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2751 greedy_non_default = greedy_default ^ 1;
2752
2753 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2754 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2755 matches a non-fixed char first char; reqbyte just remains unset if we never
2756 find one.
2757
2758 When we hit a repeat whose minimum is zero, we may have to adjust these values
2759 to take the zero repeat into account. This is implemented by setting them to
2760 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2761 item types that can be repeated set these backoff variables appropriately. */
2762
2763 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2764
2765 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2766 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2767 value > 255. It is added into the firstbyte or reqbyte variables to record the
2768 case status of the value. This is used only for ASCII characters. */
2769
2770 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2771
2772 /* Switch on next character until the end of the branch */
2773
2774 for (;; ptr++)
2775 {
2776 BOOL negate_class;
2777 BOOL should_flip_negation;
2778 BOOL possessive_quantifier;
2779 BOOL is_quantifier;
2780 BOOL is_recurse;
2781 BOOL reset_bracount;
2782 int class_charcount;
2783 int class_lastchar;
2784 int newoptions;
2785 int recno;
2786 int refsign;
2787 int skipbytes;
2788 int subreqbyte;
2789 int subfirstbyte;
2790 int terminator;
2791 int mclength;
2792 uschar mcbuffer[8];
2793
2794 /* Get next byte in the pattern */
2795
2796 c = *ptr;
2797
2798 /* If we are in the pre-compile phase, accumulate the length used for the
2799 previous cycle of this loop. */
2800
2801 if (lengthptr != NULL)
2802 {
2803 #ifdef PCRE_DEBUG
2804 if (code > cd->hwm) cd->hwm = code; /* High water info */
2805 #endif
2806 if (code > cd->start_workspace + WORK_SIZE_CHECK) /* Check for overrun */
2807 {
2808 *errorcodeptr = ERR52;
2809 goto FAILED;
2810 }
2811
2812 /* There is at least one situation where code goes backwards: this is the
2813 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2814 the class is simply eliminated. However, it is created first, so we have to
2815 allow memory for it. Therefore, don't ever reduce the length at this point.
2816 */
2817
2818 if (code < last_code) code = last_code;
2819
2820 /* Paranoid check for integer overflow */
2821
2822 if (OFLOW_MAX - *lengthptr < code - last_code)
2823 {
2824 *errorcodeptr = ERR20;
2825 goto FAILED;
2826 }
2827
2828 *lengthptr += code - last_code;
2829 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2830
2831 /* If "previous" is set and it is not at the start of the work space, move
2832 it back to there, in order to avoid filling up the work space. Otherwise,
2833 if "previous" is NULL, reset the current code pointer to the start. */
2834
2835 if (previous != NULL)
2836 {
2837 if (previous > orig_code)
2838 {
2839 memmove(orig_code, previous, code - previous);
2840 code -= previous - orig_code;
2841 previous = orig_code;
2842 }
2843 }
2844 else code = orig_code;
2845
2846 /* Remember where this code item starts so we can pick up the length
2847 next time round. */
2848
2849 last_code = code;
2850 }
2851
2852 /* In the real compile phase, just check the workspace used by the forward
2853 reference list. */
2854
2855 else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)
2856 {
2857 *errorcodeptr = ERR52;
2858 goto FAILED;
2859 }
2860
2861 /* If in \Q...\E, check for the end; if not, we have a literal */
2862
2863 if (inescq && c != 0)
2864 {
2865 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
2866 {
2867 inescq = FALSE;
2868 ptr++;
2869 continue;
2870 }
2871 else
2872 {
2873 if (previous_callout != NULL)
2874 {
2875 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2876 complete_callout(previous_callout, ptr, cd);
2877 previous_callout = NULL;
2878 }
2879 if ((options & PCRE_AUTO_CALLOUT) != 0)
2880 {
2881 previous_callout = code;
2882 code = auto_callout(code, ptr, cd);
2883 }
2884 goto NORMAL_CHAR;
2885 }
2886 }
2887
2888 /* Fill in length of a previous callout, except when the next thing is
2889 a quantifier. */
2890
2891 is_quantifier =
2892 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2893 (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2894
2895 if (!is_quantifier && previous_callout != NULL &&
2896 after_manual_callout-- <= 0)
2897 {
2898 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2899 complete_callout(previous_callout, ptr, cd);
2900 previous_callout = NULL;
2901 }
2902
2903 /* In extended mode, skip white space and comments */
2904
2905 if ((options & PCRE_EXTENDED) != 0)
2906 {
2907 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2908 if (c == CHAR_NUMBER_SIGN)
2909 {
2910 while (*(++ptr) != 0)
2911 {
2912 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2913 }
2914 if (*ptr != 0) continue;
2915
2916 /* Else fall through to handle end of string */
2917 c = 0;
2918 }
2919 }
2920
2921 /* No auto callout for quantifiers. */
2922
2923 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2924 {
2925 previous_callout = code;
2926 code = auto_callout(code, ptr, cd);
2927 }
2928
2929 switch(c)
2930 {
2931 /* ===================================================================*/
2932 case 0: /* The branch terminates at string end */
2933 case CHAR_VERTICAL_LINE: /* or | or ) */
2934 case CHAR_RIGHT_PARENTHESIS:
2935 *firstbyteptr = firstbyte;
2936 *reqbyteptr = reqbyte;
2937 *codeptr = code;
2938 *ptrptr = ptr;
2939 if (lengthptr != NULL)
2940 {
2941 if (OFLOW_MAX - *lengthptr < code - last_code)
2942 {
2943 *errorcodeptr = ERR20;
2944 goto FAILED;
2945 }
2946 *lengthptr += code - last_code; /* To include callout length */
2947 DPRINTF((">> end branch\n"));
2948 }
2949 return TRUE;
2950
2951
2952 /* ===================================================================*/
2953 /* Handle single-character metacharacters. In multiline mode, ^ disables
2954 the setting of any following char as a first character. */
2955
2956 case CHAR_CIRCUMFLEX_ACCENT:
2957 if ((options & PCRE_MULTILINE) != 0)
2958 {
2959 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2960 }
2961 previous = NULL;
2962 *code++ = OP_CIRC;
2963 break;
2964
2965 case CHAR_DOLLAR_SIGN:
2966 previous = NULL;
2967 *code++ = OP_DOLL;
2968 break;
2969
2970 /* There can never be a first char if '.' is first, whatever happens about
2971 repeats. The value of reqbyte doesn't change either. */
2972
2973 case CHAR_DOT:
2974 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2975 zerofirstbyte = firstbyte;
2976 zeroreqbyte = reqbyte;
2977 previous = code;
2978 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2979 break;
2980
2981
2982 /* ===================================================================*/
2983 /* Character classes. If the included characters are all < 256, we build a
2984 32-byte bitmap of the permitted characters, except in the special case
2985 where there is only one such character. For negated classes, we build the
2986 map as usual, then invert it at the end. However, we use a different opcode
2987 so that data characters > 255 can be handled correctly.
2988
2989 If the class contains characters outside the 0-255 range, a different
2990 opcode is compiled. It may optionally have a bit map for characters < 256,
2991 but those above are are explicitly listed afterwards. A flag byte tells
2992 whether the bitmap is present, and whether this is a negated class or not.
2993
2994 In JavaScript compatibility mode, an isolated ']' causes an error. In
2995 default (Perl) mode, it is treated as a data character. */
2996
2997 case CHAR_RIGHT_SQUARE_BRACKET:
2998 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2999 {
3000 *errorcodeptr = ERR64;
3001 goto FAILED;
3002 }
3003 goto NORMAL_CHAR;
3004
3005 case CHAR_LEFT_SQUARE_BRACKET:
3006 previous = code;
3007
3008 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3009 they are encountered at the top level, so we'll do that too. */
3010
3011 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3012 ptr[1] == CHAR_EQUALS_SIGN) &&
3013 check_posix_syntax(ptr, &tempptr))
3014 {
3015 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3016 goto FAILED;
3017 }
3018
3019 /* If the first character is '^', set the negation flag and skip it. Also,
3020 if the first few characters (either before or after ^) are \Q\E or \E we
3021 skip them too. This makes for compatibility with Perl. */
3022
3023 negate_class = FALSE;
3024 for (;;)
3025 {
3026 c = *(++ptr);
3027 if (c == CHAR_BACKSLASH)
3028 {
3029 if (ptr[1] == CHAR_E)
3030 ptr++;
3031 else if (strncmp((const char *)ptr+1,
3032 STR_Q STR_BACKSLASH STR_E, 3) == 0)
3033 ptr += 3;
3034 else
3035 break;
3036 }
3037 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3038 negate_class = TRUE;
3039 else break;
3040 }
3041
3042 /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3043 an initial ']' is taken as a data character -- the code below handles
3044 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3045 [^] must match any character, so generate OP_ALLANY. */
3046
3047 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3048 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3049 {
3050 *code++ = negate_class? OP_ALLANY : OP_FAIL;
3051 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3052 zerofirstbyte = firstbyte;
3053 break;
3054 }
3055
3056 /* If a class contains a negative special such as \S, we need to flip the
3057 negation flag at the end, so that support for characters > 255 works
3058 correctly (they are all included in the class). */
3059
3060 should_flip_negation = FALSE;
3061
3062 /* Keep a count of chars with values < 256 so that we can optimize the case
3063 of just a single character (as long as it's < 256). However, for higher
3064 valued UTF-8 characters, we don't yet do any optimization. */
3065
3066 class_charcount = 0;
3067 class_lastchar = -1;
3068
3069 /* Initialize the 32-char bit map to all zeros. We build the map in a
3070 temporary bit of memory, in case the class contains only 1 character (less
3071 than 256), because in that case the compiled code doesn't use the bit map.
3072 */
3073
3074 memset(classbits, 0, 32 * sizeof(uschar));
3075
3076 #ifdef SUPPORT_UTF8
3077 class_utf8 = FALSE; /* No chars >= 256 */
3078 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
3079 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
3080 #endif
3081
3082 /* Process characters until ] is reached. By writing this as a "do" it
3083 means that an initial ] is taken as a data character. At the start of the
3084 loop, c contains the first byte of the character. */
3085
3086 if (c != 0) do
3087 {
3088 const uschar *oldptr;
3089
3090 #ifdef SUPPORT_UTF8
3091 if (utf8 && c > 127)
3092 { /* Braces are required because the */
3093 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3094 }
3095
3096 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3097 data and reset the pointer. This is so that very large classes that
3098 contain a zillion UTF-8 characters no longer overwrite the work space
3099 (which is on the stack). */
3100
3101 if (lengthptr != NULL)
3102 {
3103 *lengthptr += class_utf8data - class_utf8data_base;
3104 class_utf8data = class_utf8data_base;
3105 }
3106
3107 #endif
3108
3109 /* Inside \Q...\E everything is literal except \E */
3110
3111 if (inescq)
3112 {
3113 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3114 {
3115 inescq = FALSE; /* Reset literal state */
3116 ptr++; /* Skip the 'E' */
3117 continue; /* Carry on with next */
3118 }
3119 goto CHECK_RANGE; /* Could be range if \E follows */
3120 }
3121
3122 /* Handle POSIX class names. Perl allows a negation extension of the
3123 form [:^name:]. A square bracket that doesn't match the syntax is
3124 treated as a literal. We also recognize the POSIX constructions
3125 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3126 5.6 and 5.8 do. */
3127
3128 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3129 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3130 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3131 {
3132 BOOL local_negate = FALSE;
3133 int posix_class, taboffset, tabopt;
3134 register const uschar *cbits = cd->cbits;
3135 uschar pbits[32];
3136
3137 if (ptr[1] != CHAR_COLON)
3138 {
3139 *errorcodeptr = ERR31;
3140 goto FAILED;
3141 }
3142
3143 ptr += 2;
3144 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3145 {
3146 local_negate = TRUE;
3147 should_flip_negation = TRUE; /* Note negative special */
3148 ptr++;
3149 }
3150
3151 posix_class = check_posix_name(ptr, tempptr - ptr);
3152 if (posix_class < 0)
3153 {
3154 *errorcodeptr = ERR30;
3155 goto FAILED;
3156 }
3157
3158 /* If matching is caseless, upper and lower are converted to
3159 alpha. This relies on the fact that the class table starts with
3160 alpha, lower, upper as the first 3 entries. */
3161
3162 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3163 posix_class = 0;
3164
3165 /* We build the bit map for the POSIX class in a chunk of local store
3166 because we may be adding and subtracting from it, and we don't want to
3167 subtract bits that may be in the main map already. At the end we or the
3168 result into the bit map that is being built. */
3169
3170 posix_class *= 3;
3171
3172 /* Copy in the first table (always present) */
3173
3174 memcpy(pbits, cbits + posix_class_maps[posix_class],
3175 32 * sizeof(uschar));
3176
3177 /* If there is a second table, add or remove it as required. */
3178
3179 taboffset = posix_class_maps[posix_class + 1];
3180 tabopt = posix_class_maps[posix_class + 2];
3181
3182 if (taboffset >= 0)
3183 {
3184 if (tabopt >= 0)
3185 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3186 else
3187 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3188 }
3189
3190 /* Now see if we need to remove any special characters. An option
3191 value of 1 removes vertical space and 2 removes underscore. */
3192
3193 if (tabopt < 0) tabopt = -tabopt;
3194 if (tabopt == 1) pbits[1] &= ~0x3c;
3195 else if (tabopt == 2) pbits[11] &= 0x7f;
3196
3197 /* Add the POSIX table or its complement into the main table that is
3198 being built and we are done. */
3199
3200 if (local_negate)
3201 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3202 else
3203 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3204
3205 ptr = tempptr + 1;
3206 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3207 continue; /* End of POSIX syntax handling */
3208 }
3209
3210 /* Backslash may introduce a single character, or it may introduce one
3211 of the specials, which just set a flag. The sequence \b is a special
3212 case. Inside a class (and only there) it is treated as backspace. We
3213 assume that other escapes have more than one character in them, so set
3214 class_charcount bigger than one. Unrecognized escapes fall through and
3215 are either treated as literal characters (by default), or are faulted if
3216 PCRE_EXTRA is set. */
3217
3218 if (c == CHAR_BACKSLASH)
3219 {
3220 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3221 if (*errorcodeptr != 0) goto FAILED;
3222
3223 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3224 else if (-c == ESC_Q) /* Handle start of quoted string */
3225 {
3226 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3227 {
3228 ptr += 2; /* avoid empty string */
3229 }
3230 else inescq = TRUE;
3231 continue;
3232 }
3233 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3234
3235 if (c < 0)
3236 {
3237 register const uschar *cbits = cd->cbits;
3238 class_charcount += 2; /* Greater than 1 is what matters */
3239
3240 /* Save time by not doing this in the pre-compile phase. */
3241
3242 if (lengthptr == NULL) switch (-c)
3243 {
3244 case ESC_d:
3245 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3246 continue;
3247
3248 case ESC_D:
3249 should_flip_negation = TRUE;
3250 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3251 continue;
3252
3253 case ESC_w:
3254 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3255 continue;
3256
3257 case ESC_W:
3258 should_flip_negation = TRUE;
3259 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3260 continue;
3261
3262 case ESC_s:
3263 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3264 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
3265 continue;
3266
3267 case ESC_S:
3268 should_flip_negation = TRUE;
3269 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3270 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3271 continue;
3272
3273 default: /* Not recognized; fall through */
3274 break; /* Need "default" setting to stop compiler warning. */
3275 }
3276
3277 /* In the pre-compile phase, just do the recognition. */
3278
3279 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
3280 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
3281
3282 /* We need to deal with \H, \h, \V, and \v in both phases because
3283 they use extra memory. */
3284
3285 if (-c == ESC_h)
3286 {
3287 SETBIT(classbits, 0x09); /* HT */
3288 SETBIT(classbits, 0x20); /* SPACE */
3289 SETBIT(classbits, 0xa0); /* NBSP */
3290 #ifdef SUPPORT_UTF8
3291 if (utf8)
3292 {
3293 class_utf8 = TRUE;
3294 *class_utf8data++ = XCL_SINGLE;
3295 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3296 *class_utf8data++ = XCL_SINGLE;
3297 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3298 *class_utf8data++ = XCL_RANGE;
3299 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3300 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3301 *class_utf8data++ = XCL_SINGLE;
3302 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3303 *class_utf8data++ = XCL_SINGLE;
3304 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3305 *class_utf8data++ = XCL_SINGLE;
3306 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3307 }
3308 #endif
3309 continue;
3310 }
3311
3312 if (-c == ESC_H)
3313 {
3314 for (c = 0; c < 32; c++)
3315 {
3316 int x = 0xff;
3317 switch (c)
3318 {
3319 case 0x09/8: x ^= 1 << (0x09%8); break;
3320 case 0x20/8: x ^= 1 << (0x20%8); break;
3321 case 0xa0/8: x ^= 1 << (0xa0%8); break;
3322 default: break;
3323 }
3324 classbits[c] |= x;
3325 }
3326
3327 #ifdef SUPPORT_UTF8
3328 if (utf8)
3329 {
3330 class_utf8 = TRUE;
3331 *class_utf8data++ = XCL_RANGE;
3332 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3333 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3334 *class_utf8data++ = XCL_RANGE;
3335 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3336 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3337 *class_utf8data++ = XCL_RANGE;
3338 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3339 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3340 *class_utf8data++ = XCL_RANGE;
3341 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3342 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3343 *class_utf8data++ = XCL_RANGE;
3344 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3345 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3346 *class_utf8data++ = XCL_RANGE;
3347 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3348 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3349 *class_utf8data++ = XCL_RANGE;
3350 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3351 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3352 }
3353 #endif
3354 continue;
3355 }
3356
3357 if (-c == ESC_v)
3358 {
3359 SETBIT(classbits, 0x0a); /* LF */
3360 SETBIT(classbits, 0x0b); /* VT */
3361 SETBIT(classbits, 0x0c); /* FF */
3362 SETBIT(classbits, 0x0d); /* CR */
3363 SETBIT(classbits, 0x85); /* NEL */
3364 #ifdef SUPPORT_UTF8
3365 if (utf8)
3366 {
3367 class_utf8 = TRUE;
3368 *class_utf8data++ = XCL_RANGE;
3369 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3370 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3371 }
3372 #endif
3373 continue;
3374 }
3375
3376 if (-c == ESC_V)
3377 {
3378 for (c = 0; c < 32; c++)
3379 {
3380 int x = 0xff;
3381 switch (c)
3382 {
3383 case 0x0a/8: x ^= 1 << (0x0a%8);
3384 x ^= 1 << (0x0b%8);
3385 x ^= 1 << (0x0c%8);
3386 x ^= 1 << (0x0d%8);
3387 break;
3388 case 0x85/8: x ^= 1 << (0x85%8); break;
3389 default: break;
3390 }
3391 classbits[c] |= x;
3392 }
3393
3394 #ifdef SUPPORT_UTF8
3395 if (utf8)
3396 {
3397 class_utf8 = TRUE;
3398 *class_utf8data++ = XCL_RANGE;
3399 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3400 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3401 *class_utf8data++ = XCL_RANGE;
3402 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3403 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3404 }
3405 #endif
3406 continue;
3407 }
3408
3409 /* We need to deal with \P and \p in both phases. */
3410
3411 #ifdef SUPPORT_UCP
3412 if (-c == ESC_p || -c == ESC_P)
3413 {
3414 BOOL negated;
3415 int pdata;
3416 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3417 if (ptype < 0) goto FAILED;
3418 class_utf8 = TRUE;
3419 *class_utf8data++ = ((-c == ESC_p) != negated)?
3420 XCL_PROP : XCL_NOTPROP;
3421 *class_utf8data++ = ptype;
3422 *class_utf8data++ = pdata;
3423 class_charcount -= 2; /* Not a < 256 character */
3424 continue;
3425 }
3426 #endif
3427 /* Unrecognized escapes are faulted if PCRE is running in its
3428 strict mode. By default, for compatibility with Perl, they are
3429 treated as literals. */
3430
3431 if ((options & PCRE_EXTRA) != 0)
3432 {
3433 *errorcodeptr = ERR7;
3434 goto FAILED;
3435 }
3436
3437 class_charcount -= 2; /* Undo the default count from above */
3438 c = *ptr; /* Get the final character and fall through */
3439 }
3440
3441 /* Fall through if we have a single character (c >= 0). This may be
3442 greater than 256 in UTF-8 mode. */
3443
3444 } /* End of backslash handling */
3445
3446 /* A single character may be followed by '-' to form a range. However,
3447 Perl does not permit ']' to be the end of the range. A '-' character
3448 at the end is treated as a literal. Perl ignores orphaned \E sequences
3449 entirely. The code for handling \Q and \E is messy. */
3450
3451 CHECK_RANGE:
3452 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3453 {
3454 inescq = FALSE;
3455 ptr += 2;
3456 }
3457
3458 oldptr = ptr;
3459
3460 /* Remember \r or \n */
3461
3462 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3463
3464 /* Check for range */
3465
3466 if (!inescq && ptr[1] == CHAR_MINUS)
3467 {
3468 int d;
3469 ptr += 2;
3470 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3471
3472 /* If we hit \Q (not followed by \E) at this point, go into escaped
3473 mode. */
3474
3475 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3476 {
3477 ptr += 2;
3478 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3479 { ptr += 2; continue; }
3480 inescq = TRUE;
3481 break;
3482 }
3483
3484 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3485 {
3486 ptr = oldptr;
3487 goto LONE_SINGLE_CHARACTER;
3488 }
3489
3490 #ifdef SUPPORT_UTF8
3491 if (utf8)
3492 { /* Braces are required because the */
3493 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3494 }
3495 else
3496 #endif
3497 d = *ptr; /* Not UTF-8 mode */
3498
3499 /* The second part of a range can be a single-character escape, but
3500 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3501 in such circumstances. */
3502
3503 if (!inescq && d == CHAR_BACKSLASH)
3504 {
3505 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3506 if (*errorcodeptr != 0) goto FAILED;
3507
3508 /* \b is backspace; any other special means the '-' was literal */
3509
3510 if (d < 0)
3511 {
3512 if (d == -ESC_b) d = CHAR_BS; else
3513 {
3514 ptr = oldptr;
3515 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3516 }
3517 }
3518 }
3519
3520 /* Check that the two values are in the correct order. Optimize
3521 one-character ranges */
3522
3523 if (d < c)
3524 {
3525 *errorcodeptr = ERR8;
3526 goto FAILED;
3527 }
3528
3529 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3530
3531 /* Remember \r or \n */
3532
3533 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3534
3535 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3536 matching, we have to use an XCLASS with extra data items. Caseless
3537 matching for characters > 127 is available only if UCP support is
3538 available. */
3539
3540 #ifdef SUPPORT_UTF8
3541 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3542 {
3543 class_utf8 = TRUE;
3544
3545 /* With UCP support, we can find the other case equivalents of
3546 the relevant characters. There may be several ranges. Optimize how
3547 they fit with the basic range. */
3548
3549 #ifdef SUPPORT_UCP
3550 if ((options & PCRE_CASELESS) != 0)
3551 {
3552 unsigned int occ, ocd;
3553 unsigned int cc = c;
3554 unsigned int origd = d;
3555 while (get_othercase_range(&cc, origd, &occ, &ocd))
3556 {
3557 if (occ >= (unsigned int)c &&
3558 ocd <= (unsigned int)d)
3559 continue; /* Skip embedded ranges */
3560
3561 if (occ < (unsigned int)c &&
3562 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3563 { /* if there is overlap, */
3564 c = occ; /* noting that if occ < c */
3565 continue; /* we can't have ocd > d */
3566 } /* because a subrange is */
3567 if (ocd > (unsigned int)d &&
3568 occ <= (unsigned int)d + 1) /* always shorter than */
3569 { /* the basic range. */
3570 d = ocd;
3571 continue;
3572 }
3573
3574 if (occ == ocd)
3575 {
3576 *class_utf8data++ = XCL_SINGLE;
3577 }
3578 else
3579 {
3580 *class_utf8data++ = XCL_RANGE;
3581 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3582 }
3583 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3584 }
3585 }
3586 #endif /* SUPPORT_UCP */
3587
3588 /* Now record the original range, possibly modified for UCP caseless
3589 overlapping ranges. */
3590
3591 *class_utf8data++ = XCL_RANGE;
3592 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3593 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3594
3595 /* With UCP support, we are done. Without UCP support, there is no
3596 caseless matching for UTF-8 characters > 127; we can use the bit map
3597 for the smaller ones. */
3598
3599 #ifdef SUPPORT_UCP
3600 continue; /* With next character in the class */
3601 #else
3602 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3603
3604 /* Adjust upper limit and fall through to set up the map */
3605
3606 d = 127;
3607
3608 #endif /* SUPPORT_UCP */
3609 }
3610 #endif /* SUPPORT_UTF8 */
3611
3612 /* We use the bit map for all cases when not in UTF-8 mode; else
3613 ranges that lie entirely within 0-127 when there is UCP support; else
3614 for partial ranges without UCP support. */
3615
3616 class_charcount += d - c + 1;
3617 class_lastchar = d;
3618
3619 /* We can save a bit of time by skipping this in the pre-compile. */
3620
3621 if (lengthptr == NULL) for (; c <= d; c++)
3622 {
3623 classbits[c/8] |= (1 << (c&7));
3624 if ((options & PCRE_CASELESS) != 0)
3625 {
3626 int uc = cd->fcc[c]; /* flip case */
3627 classbits[uc/8] |= (1 << (uc&7));
3628 }
3629 }
3630
3631 continue; /* Go get the next char in the class */
3632 }
3633
3634 /* Handle a lone single character - we can get here for a normal
3635 non-escape char, or after \ that introduces a single character or for an
3636 apparent range that isn't. */
3637
3638 LONE_SINGLE_CHARACTER:
3639
3640 /* Handle a character that cannot go in the bit map */
3641
3642 #ifdef SUPPORT_UTF8
3643 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3644 {
3645 class_utf8 = TRUE;
3646 *class_utf8data++ = XCL_SINGLE;
3647 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3648
3649 #ifdef SUPPORT_UCP
3650 if ((options & PCRE_CASELESS) != 0)
3651 {
3652 unsigned int othercase;
3653 if ((othercase = UCD_OTHERCASE(c)) != c)
3654 {
3655 *class_utf8data++ = XCL_SINGLE;
3656 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3657 }
3658 }
3659 #endif /* SUPPORT_UCP */
3660
3661 }
3662 else
3663 #endif /* SUPPORT_UTF8 */
3664
3665 /* Handle a single-byte character */
3666 {
3667 classbits[c/8] |= (1 << (c&7));
3668 if ((options & PCRE_CASELESS) != 0)
3669 {
3670 c = cd->fcc[c]; /* flip case */
3671 classbits[c/8] |= (1 << (c&7));
3672 }
3673 class_charcount++;
3674 class_lastchar = c;
3675 }
3676 }
3677
3678 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3679
3680 while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3681
3682 if (c == 0) /* Missing terminating ']' */
3683 {
3684 *errorcodeptr = ERR6;
3685 goto FAILED;
3686 }
3687
3688
3689 /* This code has been disabled because it would mean that \s counts as
3690 an explicit \r or \n reference, and that's not really what is wanted. Now
3691 we set the flag only if there is a literal "\r" or "\n" in the class. */
3692
3693 #if 0
3694 /* Remember whether \r or \n are in this class */
3695
3696 if (negate_class)
3697 {
3698 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3699 }
3700 else
3701 {
3702 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3703 }
3704 #endif
3705
3706
3707 /* If class_charcount is 1, we saw precisely one character whose value is
3708 less than 256. As long as there were no characters >= 128 and there was no
3709 use of \p or \P, in other words, no use of any XCLASS features, we can
3710 optimize.
3711
3712 In UTF-8 mode, we can optimize the negative case only if there were no
3713 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3714 operate on single-bytes only. This is an historical hangover. Maybe one day
3715 we can tidy these opcodes to handle multi-byte characters.
3716
3717 The optimization throws away the bit map. We turn the item into a
3718 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3719 that OP_NOT does not support multibyte characters. In the positive case, it
3720 can cause firstbyte to be set. Otherwise, there can be no first char if
3721 this item is first, whatever repeat count may follow. In the case of
3722 reqbyte, save the previous value for reinstating. */
3723
3724 #ifdef SUPPORT_UTF8
3725 if (class_charcount == 1 && !class_utf8 &&
3726 (!utf8 || !negate_class || class_lastchar < 128))
3727 #else
3728 if (class_charcount == 1)
3729 #endif
3730 {
3731 zeroreqbyte = reqbyte;
3732
3733 /* The OP_NOT opcode works on one-byte characters only. */
3734
3735 if (negate_class)
3736 {
3737 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3738 zerofirstbyte = firstbyte;
3739 *code++ = OP_NOT;
3740 *code++ = class_lastchar;
3741 break;
3742 }
3743
3744 /* For a single, positive character, get the value into mcbuffer, and
3745 then we can handle this with the normal one-character code. */
3746
3747 #ifdef SUPPORT_UTF8
3748 if (utf8 && class_lastchar > 127)
3749 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3750 else
3751 #endif
3752 {
3753 mcbuffer[0] = class_lastchar;
3754 mclength = 1;
3755 }
3756 goto ONE_CHAR;
3757 } /* End of 1-char optimization */
3758
3759 /* The general case - not the one-char optimization. If this is the first
3760 thing in the branch, there can be no first char setting, whatever the
3761 repeat count. Any reqbyte setting must remain unchanged after any kind of
3762 repeat. */
3763
3764 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3765 zerofirstbyte = firstbyte;
3766 zeroreqbyte = reqbyte;
3767
3768 /* If there are characters with values > 255, we have to compile an
3769 extended class, with its own opcode, unless there was a negated special
3770 such as \S in the class, because in that case all characters > 255 are in
3771 the class, so any that were explicitly given as well can be ignored. If
3772 (when there are explicit characters > 255 that must be listed) there are no
3773 characters < 256, we can omit the bitmap in the actual compiled code. */
3774
3775 #ifdef SUPPORT_UTF8
3776 if (class_utf8 && !should_flip_negation)
3777 {
3778 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3779 *code++ = OP_XCLASS;
3780 code += LINK_SIZE;
3781 *code = negate_class? XCL_NOT : 0;
3782
3783 /* If the map is required, move up the extra data to make room for it;
3784 otherwise just move the code pointer to the end of the extra data. */
3785
3786 if (class_charcount > 0)
3787 {
3788 *code++ |= XCL_MAP;
3789 memmove(code + 32, code, class_utf8data - code);
3790 memcpy(code, classbits, 32);
3791 code = class_utf8data + 32;
3792 }
3793 else code = class_utf8data;
3794
3795 /* Now fill in the complete length of the item */
3796
3797 PUT(previous, 1, code - previous);
3798 break; /* End of class handling */
3799 }
3800 #endif
3801
3802 /* If there are no characters > 255, set the opcode to OP_CLASS or
3803 OP_NCLASS, depending on whether the whole class was negated and whether
3804 there were negative specials such as \S in the class. Then copy the 32-byte
3805 map into the code vector, negating it if necessary. */
3806
3807 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3808 if (negate_class)
3809 {
3810 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3811 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3812 }
3813 else
3814 {
3815 memcpy(code, classbits, 32);
3816 }
3817 code += 32;
3818 break;
3819
3820
3821 /* ===================================================================*/
3822 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3823 has been tested above. */
3824
3825 case CHAR_LEFT_CURLY_BRACKET:
3826 if (!is_quantifier) goto NORMAL_CHAR;
3827 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3828 if (*errorcodeptr != 0) goto FAILED;
3829 goto REPEAT;
3830
3831 case CHAR_ASTERISK:
3832 repeat_min = 0;
3833 repeat_max = -1;
3834 goto REPEAT;
3835
3836 case CHAR_PLUS:
3837 repeat_min = 1;
3838 repeat_max = -1;
3839 goto REPEAT;
3840
3841 case CHAR_QUESTION_MARK:
3842 repeat_min = 0;
3843 repeat_max = 1;
3844
3845 REPEAT:
3846 if (previous == NULL)
3847 {
3848 *errorcodeptr = ERR9;
3849 goto FAILED;
3850 }
3851
3852 if (repeat_min == 0)
3853 {
3854 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3855 reqbyte = zeroreqbyte; /* Ditto */
3856 }
3857
3858 /* Remember whether this is a variable length repeat */
3859
3860 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3861
3862 op_type = 0; /* Default single-char op codes */
3863 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3864
3865 /* Save start of previous item, in case we have to move it up to make space
3866 for an inserted OP_ONCE for the additional '+' extension. */
3867
3868 tempcode = previous;
3869
3870 /* If the next character is '+', we have a possessive quantifier. This
3871 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3872 If the next character is '?' this is a minimizing repeat, by default,
3873 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3874 repeat type to the non-default. */
3875
3876 if (ptr[1] == CHAR_PLUS)
3877 {
3878 repeat_type = 0; /* Force greedy */
3879 possessive_quantifier = TRUE;
3880 ptr++;
3881 }
3882 else if (ptr[1] == CHAR_QUESTION_MARK)
3883 {
3884 repeat_type = greedy_non_default;
3885 ptr++;
3886 }
3887 else repeat_type = greedy_default;
3888
3889 /* If previous was a character match, abolish the item and generate a
3890 repeat item instead. If a char item has a minimum of more than one, ensure
3891 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3892 the first thing in a branch because the x will have gone into firstbyte
3893 instead. */
3894
3895 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3896 {
3897 /* Deal with UTF-8 characters that take up more than one byte. It's
3898 easier to write this out separately than try to macrify it. Use c to
3899 hold the length of the character in bytes, plus 0x80 to flag that it's a
3900 length rather than a small character. */
3901
3902 #ifdef SUPPORT_UTF8
3903 if (utf8 && (code[-1] & 0x80) != 0)
3904 {
3905 uschar *lastchar = code - 1;
3906 while((*lastchar & 0xc0) == 0x80) lastchar--;
3907 c = code - lastchar; /* Length of UTF-8 character */
3908 memcpy(utf8_char, lastchar, c); /* Save the char */
3909 c |= 0x80; /* Flag c as a length */
3910 }
3911 else
3912 #endif
3913
3914 /* Handle the case of a single byte - either with no UTF8 support, or
3915 with UTF-8 disabled, or for a UTF-8 character < 128. */
3916
3917 {
3918 c = code[-1];
3919 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3920 }
3921
3922 /* If the repetition is unlimited, it pays to see if the next thing on
3923 the line is something that cannot possibly match this character. If so,
3924 automatically possessifying this item gains some performance in the case
3925 where the match fails. */
3926
3927 if (!possessive_quantifier &&
3928 repeat_max < 0 &&
3929 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3930 options, cd))
3931 {
3932 repeat_type = 0; /* Force greedy */
3933 possessive_quantifier = TRUE;
3934 }
3935
3936 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3937 }
3938
3939 /* If previous was a single negated character ([^a] or similar), we use
3940 one of the special opcodes, replacing it. The code is shared with single-
3941 character repeats by setting op_type to add a suitable offset into
3942 repeat_type. We can also test for auto-possessification. OP_NOT is
3943 currently used only for single-byte chars. */
3944
3945 else if (*previous == OP_NOT)
3946 {
3947 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3948 c = previous[1];
3949 if (!possessive_quantifier &&
3950 repeat_max < 0 &&
3951 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3952 {
3953 repeat_type = 0; /* Force greedy */
3954 possessive_quantifier = TRUE;
3955 }
3956 goto OUTPUT_SINGLE_REPEAT;
3957 }
3958
3959 /* If previous was a character type match (\d or similar), abolish it and
3960 create a suitable repeat item. The code is shared with single-character
3961 repeats by setting op_type to add a suitable offset into repeat_type. Note
3962 that the Unicode property types will be present only when SUPPORT_UCP is
3963 defined, but we don't wrap the little bits of code here because it just
3964 makes it horribly messy. */
3965
3966 else if (*previous < OP_EODN)
3967 {
3968 uschar *oldcode;
3969 int prop_type, prop_value;
3970 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3971 c = *previous;
3972
3973 if (!possessive_quantifier &&
3974 repeat_max < 0 &&
3975 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3976 {
3977 repeat_type = 0; /* Force greedy */
3978 possessive_quantifier = TRUE;
3979 }
3980
3981 OUTPUT_SINGLE_REPEAT:
3982 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3983 {
3984 prop_type = previous[1];
3985 prop_value = previous[2];
3986 }
3987 else prop_type = prop_value = -1;
3988
3989 oldcode = code;
3990 code = previous; /* Usually overwrite previous item */
3991
3992 /* If the maximum is zero then the minimum must also be zero; Perl allows
3993 this case, so we do too - by simply omitting the item altogether. */
3994
3995 if (repeat_max == 0) goto END_REPEAT;
3996
3997 /*--------------------------------------------------------------------*/
3998 /* This code is obsolete from release 8.00; the restriction was finally
3999 removed: */
4000
4001 /* All real repeats make it impossible to handle partial matching (maybe
4002 one day we will be able to remove this restriction). */
4003
4004 /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4005 /*--------------------------------------------------------------------*/
4006
4007 /* Combine the op_type with the repeat_type */
4008
4009 repeat_type += op_type;
4010
4011 /* A minimum of zero is handled either as the special case * or ?, or as
4012 an UPTO, with the maximum given. */
4013
4014 if (repeat_min == 0)
4015 {
4016 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
4017 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
4018 else
4019 {
4020 *code++ = OP_UPTO + repeat_type;
4021 PUT2INC(code, 0, repeat_max);
4022 }
4023 }
4024
4025 /* A repeat minimum of 1 is optimized into some special cases. If the
4026 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
4027 left in place and, if the maximum is greater than 1, we use OP_UPTO with
4028 one less than the maximum. */
4029
4030 else if (repeat_min == 1)
4031 {
4032 if (repeat_max == -1)
4033 *code++ = OP_PLUS + repeat_type;
4034 else
4035 {
4036 code = oldcode; /* leave previous item in place */
4037 if (repeat_max == 1) goto END_REPEAT;
4038 *code++ = OP_UPTO + repeat_type;
4039 PUT2INC(code, 0, repeat_max - 1);
4040 }
4041 }
4042
4043 /* The case {n,n} is just an EXACT, while the general case {n,m} is
4044 handled as an EXACT followed by an UPTO. */
4045
4046 else
4047 {
4048 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
4049 PUT2INC(code, 0, repeat_min);
4050
4051 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
4052 we have to insert the character for the previous code. For a repeated
4053 Unicode property match, there are two extra bytes that define the
4054 required property. In UTF-8 mode, long characters have their length in
4055 c, with the 0x80 bit as a flag. */
4056
4057 if (repeat_max < 0)
4058 {
4059 #ifdef SUPPORT_UTF8
4060 if (utf8 && c >= 128)
4061 {
4062 memcpy(code, utf8_char, c & 7);
4063 code += c & 7;
4064 }
4065 else
4066 #endif
4067 {
4068 *code++ = c;
4069 if (prop_type >= 0)
4070 {
4071 *code++ = prop_type;
4072 *code++ = prop_value;
4073 }
4074 }
4075 *code++ = OP_STAR + repeat_type;
4076 }
4077
4078 /* Else insert an UPTO if the max is greater than the min, again
4079 preceded by the character, for the previously inserted code. If the
4080 UPTO is just for 1 instance, we can use QUERY instead. */
4081
4082 else if (repeat_max != repeat_min)
4083 {
4084 #ifdef SUPPORT_UTF8
4085 if (utf8 && c >= 128)
4086 {
4087 memcpy(code, utf8_char, c & 7);
4088 code += c & 7;
4089 }
4090 else
4091 #endif
4092 *code++ = c;
4093 if (prop_type >= 0)
4094 {
4095 *code++ = prop_type;
4096 *code++ = prop_value;
4097 }
4098 repeat_max -= repeat_min;
4099
4100 if (repeat_max == 1)
4101 {
4102 *code++ = OP_QUERY + repeat_type;
4103 }
4104 else
4105 {
4106 *code++ = OP_UPTO + repeat_type;
4107 PUT2INC(code, 0, repeat_max);
4108 }
4109 }
4110 }
4111
4112 /* The character or character type itself comes last in all cases. */
4113
4114 #ifdef SUPPORT_UTF8
4115 if (utf8 && c >= 128)
4116 {
4117 memcpy(code, utf8_char, c & 7);
4118 code += c & 7;
4119 }
4120 else
4121 #endif
4122 *code++ = c;
4123
4124 /* For a repeated Unicode property match, there are two extra bytes that
4125 define the required property. */
4126
4127 #ifdef SUPPORT_UCP
4128 if (prop_type >= 0)
4129 {
4130 *code++ = prop_type;
4131 *code++ = prop_value;
4132 }
4133 #endif
4134 }
4135
4136 /* If previous was a character class or a back reference, we put the repeat
4137 stuff after it, but just skip the item if the repeat was {0,0}. */
4138
4139 else if (*previous == OP_CLASS ||
4140 *previous == OP_NCLASS ||
4141 #ifdef SUPPORT_UTF8
4142 *previous == OP_XCLASS ||
4143 #endif
4144 *previous == OP_REF)
4145 {
4146 if (repeat_max == 0)
4147 {
4148 code = previous;
4149 goto END_REPEAT;
4150 }
4151
4152 /*--------------------------------------------------------------------*/
4153 /* This code is obsolete from release 8.00; the restriction was finally
4154 removed: */
4155
4156 /* All real repeats make it impossible to handle partial matching (maybe
4157 one day we will be able to remove this restriction). */
4158
4159 /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4160 /*--------------------------------------------------------------------*/
4161
4162 if (repeat_min == 0 && repeat_max == -1)
4163 *code++ = OP_CRSTAR + repeat_type;
4164 else if (repeat_min == 1 && repeat_max == -1)
4165 *code++ = OP_CRPLUS + repeat_type;
4166 else if (repeat_min == 0 && repeat_max == 1)
4167 *code++ = OP_CRQUERY + repeat_type;
4168 else
4169 {
4170 *code++ = OP_CRRANGE + repeat_type;
4171 PUT2INC(code, 0, repeat_min);
4172 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
4173 PUT2INC(code, 0, repeat_max);
4174 }
4175 }
4176
4177 /* If previous was a bracket group, we may have to replicate it in certain
4178 cases. */
4179
4180 else if (*previous == OP_BRA || *previous == OP_CBRA ||
4181 *previous == OP_ONCE || *previous == OP_COND)
4182 {
4183 register int i;
4184 int ketoffset = 0;
4185 int len = code - previous;
4186 uschar *bralink = NULL;
4187
4188 /* Repeating a DEFINE group is pointless */
4189
4190 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
4191 {
4192 *errorcodeptr = ERR55;
4193 goto FAILED;
4194 }
4195
4196 /* If the maximum repeat count is unlimited, find the end of the bracket
4197 by scanning through from the start, and compute the offset back to it
4198 from the current code pointer. There may be an OP_OPT setting following
4199 the final KET, so we can't find the end just by going back from the code
4200 pointer. */
4201
4202 if (repeat_max == -1)
4203 {
4204 register uschar *ket = previous;
4205 do ket += GET(ket, 1); while (*ket != OP_KET);
4206 ketoffset = code - ket;
4207 }
4208
4209 /* The case of a zero minimum is special because of the need to stick
4210 OP_BRAZERO in front of it, and because the group appears once in the
4211 data, whereas in other cases it appears the minimum number of times. For
4212 this reason, it is simplest to treat this case separately, as otherwise
4213 the code gets far too messy. There are several special subcases when the
4214 minimum is zero. */
4215
4216 if (repeat_min == 0)
4217 {
4218 /* If the maximum is also zero, we used to just omit the group from the
4219 output altogether, like this:
4220
4221 ** if (repeat_max == 0)
4222 ** {
4223 ** code = previous;
4224 ** goto END_REPEAT;
4225 ** }
4226
4227 However, that fails when a group is referenced as a subroutine from
4228 elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
4229 so that it is skipped on execution. As we don't have a list of which
4230 groups are referenced, we cannot do this selectively.
4231
4232 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
4233 and do no more at this point. However, we do need to adjust any
4234 OP_RECURSE calls inside the group that refer to the group itself or any
4235 internal or forward referenced group, because the offset is from the
4236 start of the whole regex. Temporarily terminate the pattern while doing
4237 this. */
4238
4239 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
4240 {
4241 *code = OP_END;
4242 adjust_recurse(previous, 1, utf8, cd, save_hwm);
4243 memmove(previous+1, previous, len);
4244 code++;
4245 if (repeat_max == 0)
4246 {
4247 *previous++ = OP_SKIPZERO;
4248 goto END_REPEAT;
4249 }
4250 *previous++ = OP_BRAZERO + repeat_type;
4251 }
4252
4253 /* If the maximum is greater than 1 and limited, we have to replicate
4254 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
4255 The first one has to be handled carefully because it's the original
4256 copy, which has to be moved up. The remainder can be handled by code
4257 that is common with the non-zero minimum case below. We have to
4258 adjust the value of repeat_max, since one less copy is required. Once
4259 again, we may have to adjust any OP_RECURSE calls inside the group. */
4260
4261 else
4262 {
4263 int offset;
4264 *code = OP_END;
4265 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
4266 memmove(previous + 2 + LINK_SIZE, previous, len);
4267 code += 2 + LINK_SIZE;
4268 *previous++ = OP_BRAZERO + repeat_type;
4269 *previous++ = OP_BRA;
4270
4271 /* We chain together the bracket offset fields that have to be
4272 filled in later when the ends of the brackets are reached. */
4273
4274 offset = (bralink == NULL)? 0 : previous - bralink;
4275 bralink = previous;
4276 PUTINC(previous, 0, offset);
4277 }
4278
4279 repeat_max--;
4280 }
4281
4282 /* If the minimum is greater than zero, replicate the group as many
4283 times as necessary, and adjust the maximum to the number of subsequent
4284 copies that we need. If we set a first char from the group, and didn't
4285 set a required char, copy the latter from the former. If there are any
4286 forward reference subroutine calls in the group, there will be entries on
4287 the workspace list; replicate these with an appropriate increment. */
4288
4289 else
4290 {
4291 if (repeat_min > 1)
4292 {
4293 /* In the pre-compile phase, we don't actually do the replication. We
4294 just adjust the length as if we had. Do some paranoid checks for
4295 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
4296 integer type when available, otherwise double. */
4297
4298 if (lengthptr != NULL)
4299 {
4300 int delta = (repeat_min - 1)*length_prevgroup;
4301 if ((INT64_OR_DOUBLE)(repeat_min - 1)*
4302 (INT64_OR_DOUBLE)length_prevgroup >
4303 (INT64_OR_DOUBLE)INT_MAX ||
4304 OFLOW_MAX - *lengthptr < delta)
4305 {
4306 *errorcodeptr = ERR20;
4307 goto FAILED;
4308 }
4309 *lengthptr += delta;
4310 }
4311
4312 /* This is compiling for real */
4313
4314 else
4315 {
4316 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
4317 for (i = 1; i < repeat_min; i++)
4318 {
4319 uschar *hc;
4320 uschar *this_hwm = cd->hwm;
4321 memcpy(code, previous, len);
4322 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4323 {
4324 PUT(cd->hwm, 0, GET(hc, 0) + len);
4325 cd->hwm += LINK_SIZE;
4326 }
4327 save_hwm = this_hwm;
4328 code += len;
4329 }
4330 }
4331 }
4332
4333 if (repeat_max > 0) repeat_max -= repeat_min;
4334 }
4335
4336 /* This code is common to both the zero and non-zero minimum cases. If
4337 the maximum is limited, it replicates the group in a nested fashion,
4338 remembering the bracket starts on a stack. In the case of a zero minimum,
4339 the first one was set up above. In all cases the repeat_max now specifies
4340 the number of additional copies needed. Again, we must remember to
4341 replicate entries on the forward reference list. */
4342
4343 if (repeat_max >= 0)
4344 {
4345 /* In the pre-compile phase, we don't actually do the replication. We
4346 just adjust the length as if we had. For each repetition we must add 1
4347 to the length for BRAZERO and for all but the last repetition we must
4348 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
4349 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
4350 a 64-bit integer type when available, otherwise double. */
4351
4352 if (lengthptr != NULL && repeat_max > 0)
4353 {
4354 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4355 2 - 2*LINK_SIZE; /* Last one doesn't nest */
4356 if ((INT64_OR_DOUBLE)repeat_max *
4357 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4358 > (INT64_OR_DOUBLE)INT_MAX ||
4359 OFLOW_MAX - *lengthptr < delta)
4360 {
4361 *errorcodeptr = ERR20;
4362 goto FAILED;
4363 }
4364 *lengthptr += delta;
4365 }
4366
4367 /* This is compiling for real */
4368
4369 else for (i = repeat_max - 1; i >= 0; i--)
4370 {
4371 uschar *hc;
4372 uschar *this_hwm = cd->hwm;
4373
4374 *code++ = OP_BRAZERO + repeat_type;
4375
4376 /* All but the final copy start a new nesting, maintaining the
4377 chain of brackets outstanding. */
4378
4379 if (i != 0)
4380 {
4381 int offset;
4382 *code++ = OP_BRA;
4383 offset = (bralink == NULL)? 0 : code - bralink;
4384 bralink = code;
4385 PUTINC(code, 0, offset);
4386 }
4387
4388 memcpy(code, previous, len);
4389 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4390 {
4391 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4392 cd->hwm += LINK_SIZE;
4393 }
4394 save_hwm = this_hwm;
4395 code += len;
4396 }
4397
4398 /* Now chain through the pending brackets, and fill in their length
4399 fields (which are holding the chain links pro tem). */
4400
4401 while (bralink != NULL)
4402 {
4403 int oldlinkoffset;
4404 int offset = code - bralink + 1;
4405 uschar *bra = code - offset;
4406 oldlinkoffset = GET(bra, 1);
4407 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4408 *code++ = OP_KET;
4409 PUTINC(code, 0, offset);
4410 PUT(bra, 1, offset);
4411 }
4412 }
4413
4414 /* If the maximum is unlimited, set a repeater in the final copy. We
4415 can't just offset backwards from the current code point, because we
4416 don't know if there's been an options resetting after the ket. The
4417 correct offset was computed above.
4418
4419 Then, when we are doing the actual compile phase, check to see whether
4420 this group is a non-atomic one that could match an empty string. If so,
4421 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4422 that runtime checking can be done. [This check is also applied to
4423 atomic groups at runtime, but in a different way.] */
4424
4425 else
4426 {
4427 uschar *ketcode = code - ketoffset;
4428 uschar *bracode = ketcode - GET(ketcode, 1);
4429 *ketcode = OP_KETRMAX + repeat_type;
4430 if (lengthptr == NULL && *bracode != OP_ONCE)
4431 {
4432 uschar *scode = bracode;
4433 do
4434 {
4435 if (could_be_empty_branch(scode, ketcode, utf8, cd))
4436 {
4437 *bracode += OP_SBRA - OP_BRA;
4438 break;
4439 }
4440 scode += GET(scode, 1);
4441 }
4442 while (*scode == OP_ALT);
4443 }
4444 }
4445 }
4446
4447 /* If previous is OP_FAIL, it was generated by an empty class [] in
4448 JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4449 by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4450 error above. We can just ignore the repeat in JS case. */
4451
4452 else if (*previous == OP_FAIL) goto END_REPEAT;
4453
4454 /* Else there's some kind of shambles */
4455
4456 else
4457 {
4458 *errorcodeptr = ERR11;
4459 goto FAILED;
4460 }
4461
4462 /* If the character following a repeat is '+', or if certain optimization
4463 tests above succeeded, possessive_quantifier is TRUE. For some of the
4464 simpler opcodes, there is a special alternative opcode for this. For
4465 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4466 The '+' notation is just syntactic sugar, taken from Sun's Java package,
4467 but the special opcodes can optimize it a bit. The repeated item starts at
4468 tempcode, not at previous, which might be the first part of a string whose
4469 (former) last char we repeated.
4470
4471 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4472 an 'upto' may follow. We skip over an 'exact' item, and then test the
4473 length of what remains before proceeding. */
4474
4475 if (possessive_quantifier)
4476 {
4477 int len;
4478
4479 if (*tempcode == OP_TYPEEXACT)
4480 tempcode += _pcre_OP_lengths[*tempcode] +
4481 ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);
4482
4483 else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
4484 {
4485 tempcode += _pcre_OP_lengths[*tempcode];
4486 #ifdef SUPPORT_UTF8
4487 if (utf8 && tempcode[-1] >= 0xc0)
4488 tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];
4489 #endif
4490 }
4491
4492 len = code - tempcode;
4493 if (len > 0) switch (*tempcode)
4494 {
4495 case OP_STAR: *tempcode = OP_POSSTAR; break;
4496 case OP_PLUS: *tempcode = OP_POSPLUS; break;
4497 case OP_QUERY: *tempcode = OP_POSQUERY; break;
4498 case OP_UPTO: *tempcode = OP_POSUPTO; break;
4499
4500 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4501 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4502 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4503 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4504
4505 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4506 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4507 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4508 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4509
4510 /* Because we are moving code along, we must ensure that any
4511 pending recursive references are updated. */
4512
4513 default:
4514 *code = OP_END;
4515 adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm);
4516 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4517 code += 1 + LINK_SIZE;
4518 len += 1 + LINK_SIZE;
4519 tempcode[0] = OP_ONCE;
4520 *code++ = OP_KET;
4521 PUTINC(code, 0, len);
4522 PUT(tempcode, 1, len);
4523 break;
4524 }
4525 }
4526
4527 /* In all cases we no longer have a previous item. We also set the
4528 "follows varying string" flag for subsequently encountered reqbytes if
4529 it isn't already set and we have just passed a varying length item. */
4530
4531 END_REPEAT:
4532 previous = NULL;
4533 cd->req_varyopt |= reqvary;
4534 break;
4535
4536
4537 /* ===================================================================*/
4538 /* Start of nested parenthesized sub-expression, or comment or lookahead or
4539 lookbehind or option setting or condition or all the other extended
4540 parenthesis forms. */
4541
4542 case CHAR_LEFT_PARENTHESIS:
4543 newoptions = options;
4544 skipbytes = 0;
4545 bravalue = OP_CBRA;
4546 save_hwm = cd->hwm;
4547 reset_bracount = FALSE;
4548
4549 /* First deal with various "verbs" that can be introduced by '*'. */
4550
4551 if (*(++ptr) == CHAR_ASTERISK &&
4552 ((cd->ctypes[ptr[1]] & ctype_letter) != 0 || ptr[1] == ':'))
4553 {
4554 int i, namelen;
4555 int arglen = 0;
4556 const char *vn = verbnames;
4557 const uschar *name = ptr + 1;
4558 const uschar *arg = NULL;
4559 previous = NULL;
4560 while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
4561 namelen = ptr - name;
4562
4563 if (*ptr == CHAR_COLON)
4564 {
4565 arg = ++ptr;
4566 while ((cd->ctypes[*ptr] & (ctype_letter|ctype_digit)) != 0
4567 || *ptr == '_') ptr++;
4568 arglen = ptr - arg;
4569 }
4570
4571 if (*ptr != CHAR_RIGHT_PARENTHESIS)
4572 {
4573 *errorcodeptr = ERR60;
4574 goto FAILED;
4575 }
4576
4577 /* Scan the table of verb names */
4578
4579 for (i = 0; i < verbcount; i++)
4580 {
4581 if (namelen == verbs[i].len &&
4582 strncmp((char *)name, vn, namelen) == 0)
4583 {
4584 /* Check for open captures before ACCEPT */
4585
4586 if (verbs[i].op == OP_ACCEPT)
4587 {
4588 open_capitem *oc;
4589 cd->had_accept = TRUE;
4590 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
4591 {
4592 *code++ = OP_CLOSE;
4593 PUT2INC(code, 0, oc->number);
4594 }
4595 }
4596
4597 /* Handle the cases with/without an argument */
4598
4599 if (arglen == 0)
4600 {
4601 if (verbs[i].op < 0) /* Argument is mandatory */
4602 {
4603 *errorcodeptr = ERR66;
4604 goto FAILED;
4605 }
4606 *code++ = verbs[i].op;
4607 }
4608
4609 else
4610 {
4611 if (verbs[i].op_arg < 0) /* Argument is forbidden */
4612 {
4613 *errorcodeptr = ERR59;
4614 goto FAILED;
4615 }
4616 *code++ = verbs[i].op_arg;
4617 *code++ = arglen;
4618 memcpy(code, arg, arglen);
4619 code += arglen;
4620 *code++ = 0;
4621 }
4622
4623 break; /* Found verb, exit loop */
4624 }
4625
4626 vn += verbs[i].len + 1;
4627 }
4628
4629 if (i < verbcount) continue; /* Successfully handled a verb */
4630 *errorcodeptr = ERR60; /* Verb not recognized */
4631 goto FAILED;
4632 }
4633
4634 /* Deal with the extended parentheses; all are introduced by '?', and the
4635 appearance of any of them means that this is not a capturing group. */
4636
4637 else if (*ptr == CHAR_QUESTION_MARK)
4638 {
4639 int i, set, unset, namelen;
4640 int *optset;
4641 const uschar *name;
4642 uschar *slot;
4643
4644 switch (*(++ptr))
4645 {
4646 case CHAR_NUMBER_SIGN: /* Comment; skip to ket */
4647 ptr++;
4648 while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4649 if (*ptr == 0)
4650 {
4651 *errorcodeptr = ERR18;
4652 goto FAILED;
4653 }
4654 continue;
4655
4656
4657 /* ------------------------------------------------------------ */
4658 case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
4659 reset_bracount = TRUE;
4660 /* Fall through */
4661
4662 /* ------------------------------------------------------------ */
4663 case CHAR_COLON: /* Non-capturing bracket */
4664 bravalue = OP_BRA;
4665 ptr++;
4666 break;
4667
4668
4669 /* ------------------------------------------------------------ */
4670 case CHAR_LEFT_PARENTHESIS:
4671 bravalue = OP_COND; /* Conditional group */
4672
4673 /* A condition can be an assertion, a number (referring to a numbered
4674 group), a name (referring to a named group), or 'R', referring to
4675 recursion. R<digits> and R&name are also permitted for recursion tests.
4676
4677 There are several syntaxes for testing a named group: (?(name)) is used
4678 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4679
4680 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4681 be the recursive thing or the name 'R' (and similarly for 'R' followed
4682 by digits), and (b) a number could be a name that consists of digits.
4683 In both cases, we look for a name first; if not found, we try the other
4684 cases. */
4685
4686 /* For conditions that are assertions, check the syntax, and then exit
4687 the switch. This will take control down to where bracketed groups,
4688 including assertions, are processed. */
4689
4690 if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
4691 ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
4692 break;
4693
4694 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4695 below), and all need to skip 3 bytes at the start of the group. */
4696
4697 code[1+LINK_SIZE] = OP_CREF;
4698 skipbytes = 3;
4699 refsign = -1;
4700
4701 /* Check for a test for recursion in a named group. */
4702
4703 if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
4704 {
4705 terminator = -1;
4706 ptr += 2;
4707 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4708 }
4709
4710 /* Check for a test for a named group's having been set, using the Perl
4711 syntax (?(<name>) or (?('name') */
4712
4713 else if (ptr[1] == CHAR_LESS_THAN_SIGN)
4714 {
4715 terminator = CHAR_GREATER_THAN_SIGN;
4716 ptr++;
4717 }
4718 else if (ptr[1] == CHAR_APOSTROPHE)
4719 {
4720 terminator = CHAR_APOSTROPHE;
4721 ptr++;
4722 }
4723 else
4724 {
4725 terminator = 0;
4726 if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
4727 }
4728
4729 /* We now expect to read a name; anything else is an error */
4730
4731 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4732 {
4733 ptr += 1; /* To get the right offset */
4734 *errorcodeptr = ERR28;
4735 goto FAILED;
4736 }
4737
4738 /* Read the name, but also get it as a number if it's all digits */
4739
4740 recno = 0;
4741 name = ++ptr;
4742 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4743 {
4744 if (recno >= 0)
4745 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4746 recno * 10 + *ptr - CHAR_0 : -1;
4747 ptr++;
4748 }
4749 namelen = ptr - name;
4750
4751 if ((terminator > 0 && *ptr++ != terminator) ||
4752 *ptr++ != CHAR_RIGHT_PARENTHESIS)
4753 {
4754 ptr--; /* Error offset */
4755 *errorcodeptr = ERR26;
4756 goto FAILED;
4757 }
4758
4759 /* Do no further checking in the pre-compile phase. */
4760
4761 if (lengthptr != NULL) break;
4762
4763 /* In the real compile we do the work of looking for the actual
4764 reference. If the string started with "+" or "-" we require the rest to
4765 be digits, in which case recno will be set. */
4766
4767 if (refsign > 0)
4768 {
4769 if (recno <= 0)
4770 {
4771 *errorcodeptr = ERR58;
4772 goto FAILED;
4773 }
4774 recno = (refsign == CHAR_MINUS)?
4775 cd->bracount - recno + 1 : recno +cd->bracount;
4776 if (recno <= 0 || recno > cd->final_bracount)
4777 {
4778 *errorcodeptr = ERR15;
4779 goto FAILED;
4780 }
4781 PUT2(code, 2+LINK_SIZE, recno);
4782 break;
4783 }
4784
4785 /* Otherwise (did not start with "+" or "-"), start by looking for the
4786 name. If we find a name, add one to the opcode to change OP_CREF or
4787 OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,
4788 except they record that the reference was originally to a name. The
4789 information is used to check duplicate names. */
4790
4791 slot = cd->name_table;
4792 for (i = 0; i < cd->names_found; i++)
4793 {
4794 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4795 slot += cd->name_entry_size;
4796 }
4797
4798 /* Found a previous named subpattern */
4799
4800 if (i < cd->names_found)
4801 {
4802 recno = GET2(slot, 0);
4803 PUT2(code, 2+LINK_SIZE, recno);
4804 code[1+LINK_SIZE]++;
4805 }
4806
4807 /* Search the pattern for a forward reference */
4808
4809 else if ((i = find_parens(cd, name, namelen,
4810 (options & PCRE_EXTENDED) != 0)) > 0)
4811 {
4812 PUT2(code, 2+LINK_SIZE, i);
4813 code[1+LINK_SIZE]++;
4814 }
4815
4816 /* If terminator == 0 it means that the name followed directly after
4817 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4818 some further alternatives to try. For the cases where terminator != 0
4819 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4820 now checked all the possibilities, so give an error. */
4821
4822 else if (terminator != 0)
4823 {
4824 *errorcodeptr = ERR15;
4825 goto FAILED;
4826 }
4827
4828 /* Check for (?(R) for recursion. Allow digits after R to specify a
4829 specific group number. */
4830
4831 else if (*name == CHAR_R)
4832 {
4833 recno = 0;
4834 for (i = 1; i < namelen; i++)
4835 {
4836 if ((digitab[name[i]] & ctype_digit) == 0)
4837 {
4838 *errorcodeptr = ERR15;
4839 goto FAILED;
4840 }
4841 recno = recno * 10 + name[i] - CHAR_0;
4842 }
4843 if (recno == 0) recno = RREF_ANY;
4844 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4845 PUT2(code, 2+LINK_SIZE, recno);
4846 }
4847
4848 /* Similarly, check for the (?(DEFINE) "condition", which is always
4849 false. */
4850
4851 else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)
4852 {
4853 code[1+LINK_SIZE] = OP_DEF;
4854 skipbytes = 1;
4855 }
4856
4857 /* Check for the "name" actually being a subpattern number. We are
4858 in the second pass here, so final_bracount is set. */
4859
4860 else if (recno > 0 && recno <= cd->final_bracount)
4861 {
4862 PUT2(code, 2+LINK_SIZE, recno);
4863 }
4864
4865 /* Either an unidentified subpattern, or a reference to (?(0) */
4866
4867 else
4868 {
4869 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4870 goto FAILED;
4871 }
4872 break;
4873
4874
4875 /* ------------------------------------------------------------ */
4876 case CHAR_EQUALS_SIGN: /* Positive lookahead */
4877 bravalue = OP_ASSERT;
4878 ptr++;
4879 break;
4880
4881
4882 /* ------------------------------------------------------------ */
4883 case CHAR_EXCLAMATION_MARK: /* Negative lookahead */
4884 ptr++;
4885 if (*ptr == CHAR_RIGHT_PARENTHESIS) /* Optimize (?!) */
4886 {
4887 *code++ = OP_FAIL;
4888 previous = NULL;
4889 continue;
4890 }
4891 bravalue = OP_ASSERT_NOT;
4892 break;
4893
4894
4895 /* ------------------------------------------------------------ */
4896 case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */
4897 switch (ptr[1])
4898 {
4899 case CHAR_EQUALS_SIGN: /* Positive lookbehind */
4900 bravalue = OP_ASSERTBACK;
4901 ptr += 2;
4902 break;
4903
4904 case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */
4905 bravalue = OP_ASSERTBACK_NOT;
4906 ptr += 2;
4907 break;
4908
4909 default: /* Could be name define, else bad */
4910 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4911 ptr++; /* Correct offset for error */
4912 *errorcodeptr = ERR24;
4913 goto FAILED;
4914 }
4915 break;
4916
4917
4918 /* ------------------------------------------------------------ */
4919 case CHAR_GREATER_THAN_SIGN: /* One-time brackets */
4920 bravalue = OP_ONCE;
4921 ptr++;
4922 break;
4923
4924
4925 /* ------------------------------------------------------------ */
4926 case CHAR_C: /* Callout - may be followed by digits; */
4927 previous_callout = code; /* Save for later completion */
4928 after_manual_callout = 1; /* Skip one item before completing */
4929 *code++ = OP_CALLOUT;
4930 {
4931 int n = 0;
4932 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4933 n = n * 10 + *ptr - CHAR_0;
4934 if (*ptr != CHAR_RIGHT_PARENTHESIS)
4935 {
4936 *errorcodeptr = ERR39;
4937 goto FAILED;
4938 }
4939 if (n > 255)
4940 {
4941 *errorcodeptr = ERR38;
4942 goto FAILED;
4943 }
4944 *code++ = n;
4945 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4946 PUT(code, LINK_SIZE, 0); /* Default length */
4947 code += 2 * LINK_SIZE;
4948 }
4949 previous = NULL;
4950 continue;
4951
4952
4953 /* ------------------------------------------------------------ */
4954 case CHAR_P: /* Python-style named subpattern handling */
4955 if (*(++ptr) == CHAR_EQUALS_SIGN ||
4956 *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */
4957 {
4958 is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
4959 terminator = CHAR_RIGHT_PARENTHESIS;
4960 goto NAMED_REF_OR_RECURSE;
4961 }
4962 else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */
4963 {
4964 *errorcodeptr = ERR41;
4965 goto FAILED;
4966 }
4967 /* Fall through to handle (?P< as (?< is handled */
4968
4969
4970 /* ------------------------------------------------------------ */
4971 DEFINE_NAME: /* Come here from (?< handling */
4972 case CHAR_APOSTROPHE:
4973 {
4974 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
4975 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
4976 name = ++ptr;
4977
4978 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4979 namelen = ptr - name;
4980
4981 /* In the pre-compile phase, just do a syntax check. */
4982
4983 if (lengthptr != NULL)
4984 {
4985 if (*ptr != terminator)
4986 {
4987 *errorcodeptr = ERR42;
4988 goto FAILED;
4989 }
4990 if (cd->names_found >= MAX_NAME_COUNT)
4991 {
4992 *errorcodeptr = ERR49;
4993 goto FAILED;
4994 }
4995 if (namelen + 3 > cd->name_entry_size)
4996 {
4997 cd->name_entry_size = namelen + 3;
4998 if (namelen > MAX_NAME_SIZE)
4999 {
5000 *errorcodeptr = ERR48;
5001 goto FAILED;
5002 }
5003 }
5004 }
5005
5006 /* In the real compile, create the entry in the table, maintaining
5007 alphabetical order. Duplicate names for different numbers are
5008 permitted only if PCRE_DUPNAMES is set. Duplicate names for the same
5009 number are always OK. (An existing number can be re-used if (?|
5010 appears in the pattern.) In either event, a duplicate name results in
5011 a duplicate entry in the table, even if the number is the same. This
5012 is because the number of names, and hence the table size, is computed
5013 in the pre-compile, and it affects various numbers and pointers which
5014 would all have to be modified, and the compiled code moved down, if
5015 duplicates with the same number were omitted from the table. This
5016 doesn't seem worth the hassle. However, *different* names for the
5017 same number are not permitted. */
5018
5019 else
5020 {
5021 BOOL dupname = FALSE;
5022 slot = cd->name_table;
5023
5024 for (i = 0; i < cd->names_found; i++)
5025 {
5026 int crc = memcmp(name, slot+2, namelen);
5027 if (crc == 0)
5028 {
5029 if (slot[2+namelen] == 0)
5030 {
5031 if (GET2(slot, 0) != cd->bracount + 1 &&
5032 (options & PCRE_DUPNAMES) == 0)
5033 {
5034 *errorcodeptr = ERR43;
5035 goto FAILED;
5036 }
5037 else dupname = TRUE;
5038 }
5039 else crc = -1; /* Current name is a substring */
5040 }
5041
5042 /* Make space in the table and break the loop for an earlier
5043 name. For a duplicate or later name, carry on. We do this for
5044 duplicates so that in the simple case (when (?| is not used) they
5045 are in order of their numbers. */
5046
5047 if (crc < 0)
5048 {
5049 memmove(slot + cd->name_entry_size, slot,
5050 (cd->names_found - i) * cd->name_entry_size);
5051 break;
5052 }
5053
5054 /* Continue the loop for a later or duplicate name */
5055
5056 slot += cd->name_entry_size;
5057 }
5058
5059 /* For non-duplicate names, check for a duplicate number before
5060 adding the new name. */
5061
5062 if (!dupname)
5063 {
5064 uschar *cslot = cd->name_table;
5065 for (i = 0; i < cd->names_found; i++)
5066 {
5067 if (cslot != slot)
5068 {
5069 if (GET2(cslot, 0) == cd->bracount + 1)
5070 {
5071 *errorcodeptr = ERR65;
5072 goto FAILED;
5073 }
5074 }
5075 else i--;
5076 cslot += cd->name_entry_size;
5077 }
5078 }
5079
5080 PUT2(slot, 0, cd->bracount + 1);
5081 memcpy(slot + 2, name, namelen);
5082 slot[2+namelen] = 0;
5083 }
5084 }
5085
5086 /* In both pre-compile and compile, count the number of names we've
5087 encountered. */
5088
5089 cd->names_found++;
5090 ptr++; /* Move past > or ' */
5091 goto NUMBERED_GROUP;
5092
5093
5094 /* ------------------------------------------------------------ */
5095 case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */
5096 terminator = CHAR_RIGHT_PARENTHESIS;
5097 is_recurse = TRUE;
5098 /* Fall through */
5099
5100 /* We come here from the Python syntax above that handles both
5101 references (?P=name) and recursion (?P>name), as well as falling
5102 through from the Perl recursion syntax (?&name). We also come here from
5103 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
5104 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
5105
5106 NAMED_REF_OR_RECURSE:
5107 name = ++ptr;
5108 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
5109 namelen = ptr - name;
5110
5111 /* In the pre-compile phase, do a syntax check and set a dummy
5112 reference number. */
5113
5114 if (lengthptr != NULL)
5115 {
5116 if (namelen == 0)
5117 {
5118 *errorcodeptr = ERR62;
5119 goto FAILED;
5120 }
5121 if (*ptr != terminator)
5122 {
5123 *errorcodeptr = ERR42;
5124 goto FAILED;
5125 }
5126 if (namelen > MAX_NAME_SIZE)
5127 {
5128 *errorcodeptr = ERR48;
5129 goto FAILED;
5130 }
5131 recno = 0;
5132 }
5133
5134 /* In the real compile, seek the name in the table. We check the name
5135 first, and then check that we have reached the end of the name in the
5136 table. That way, if the name is longer than any in the table,
5137 the comparison will fail without reading beyond the table entry. */
5138
5139 else
5140 {
5141 slot = cd->name_table;
5142 for (i = 0; i < cd->names_found; i++)
5143 {
5144 if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
5145 slot[2+namelen] == 0)
5146 break;
5147 slot += cd->name_entry_size;
5148 }
5149
5150 if (i < cd->names_found) /* Back reference */
5151 {
5152 recno = GET2(slot, 0);
5153 }
5154 else if ((recno = /* Forward back reference */
5155 find_parens(cd, name, namelen,
5156 (options & PCRE_EXTENDED) != 0)) <= 0)
5157 {
5158 *errorcodeptr = ERR15;
5159 goto FAILED;
5160 }
5161 }
5162
5163 /* In both phases, we can now go to the code that handles numerical
5164 recursion or backreferences. */
5165
5166 if (is_recurse) goto HANDLE_RECURSION;
5167 else goto HANDLE_REFERENCE;
5168
5169
5170 /* ------------------------------------------------------------ */
5171 case CHAR_R: /* Recursion */
5172 ptr++; /* Same as (?0) */
5173 /* Fall through */
5174
5175
5176 /* ------------------------------------------------------------ */
5177 case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */
5178 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
5179 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
5180 {
5181 const uschar *called;
5182 terminator = CHAR_RIGHT_PARENTHESIS;
5183
5184 /* Come here from the \g<...> and \g'...' code (Oniguruma
5185 compatibility). However, the syntax has been checked to ensure that
5186 the ... are a (signed) number, so that neither ERR63 nor ERR29 will
5187 be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
5188 ever be taken. */
5189
5190 HANDLE_NUMERICAL_RECURSION:
5191
5192 if ((refsign = *ptr) == CHAR_PLUS)
5193 {
5194 ptr++;
5195 if ((digitab[*ptr] & ctype_digit) == 0)
5196 {
5197 *errorcodeptr = ERR63;
5198 goto FAILED;
5199 }
5200 }
5201 else if (refsign == CHAR_MINUS)
5202 {
5203 if ((digitab[ptr[1]] & ctype_digit) == 0)
5204 goto OTHER_CHAR_AFTER_QUERY;
5205 ptr++;
5206 }
5207
5208 recno = 0;
5209 while((digitab[*ptr] & ctype_digit) != 0)
5210 recno = recno * 10 + *ptr++ - CHAR_0;
5211
5212 if (*ptr != terminator)
5213 {
5214 *errorcodeptr = ERR29;
5215 goto FAILED;
5216 }
5217
5218 if (refsign == CHAR_MINUS)
5219 {
5220 if (recno == 0)
5221 {
5222 *errorcodeptr = ERR58;
5223 goto FAILED;
5224 }
5225 recno = cd->bracount - recno + 1;
5226 if (recno <= 0)
5227 {
5228 *errorcodeptr = ERR15;
5229 goto FAILED;
5230 }
5231 }
5232 else if (refsign == CHAR_PLUS)
5233 {
5234 if (recno == 0)
5235 {
5236 *errorcodeptr = ERR58;
5237 goto FAILED;
5238 }
5239 recno += cd->bracount;
5240 }
5241
5242 /* Come here from code above that handles a named recursion */
5243
5244 HANDLE_RECURSION:
5245
5246 previous = code;
5247 called = cd->start_code;
5248
5249 /* When we are actually compiling, find the bracket that is being
5250 referenced. Temporarily end the regex in case it doesn't exist before
5251 this point. If we end up with a forward reference, first check that
5252 the bracket does occur later so we can give the error (and position)
5253 now. Then remember this forward reference in the workspace so it can
5254 be filled in at the end. */
5255
5256 if (lengthptr == NULL)
5257 {
5258 *code = OP_END;
5259 if (recno != 0)
5260 called = _pcre_find_bracket(cd->start_code, utf8, recno);
5261
5262 /* Forward reference */
5263
5264 if (called == NULL)
5265 {
5266 if (find_parens(cd, NULL, recno,
5267 (options & PCRE_EXTENDED) != 0) < 0)
5268 {
5269 *errorcodeptr = ERR15;
5270 goto FAILED;
5271 }
5272
5273 /* Fudge the value of "called" so that when it is inserted as an
5274 offset below, what is actually inserted is the reference number
5275 of the group. */
5276
5277 called = cd->start_code + recno;
5278 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
5279 }
5280
5281 /* If not a forward reference, and the subpattern is still open,
5282 this is a recursive call. We check to see if this is a left
5283 recursion that could loop for ever, and diagnose that case. */
5284
5285 else if (GET(called, 1) == 0 &&
5286 could_be_empty(called, code, bcptr, utf8, cd))
5287 {
5288 *errorcodeptr = ERR40;
5289 goto FAILED;
5290 }
5291 }
5292
5293 /* Insert the recursion/subroutine item, automatically wrapped inside
5294 "once" brackets. Set up a "previous group" length so that a
5295 subsequent quantifier will work. */
5296
5297 *code = OP_ONCE;
5298 PUT(code, 1, 2 + 2*LINK_SIZE);
5299 code += 1 + LINK_SIZE;
5300
5301 *code = OP_RECURSE;
5302 PUT(code, 1, called - cd->start_code);
5303 code += 1 + LINK_SIZE;
5304
5305 *code = OP_KET;
5306 PUT(code, 1, 2 + 2*LINK_SIZE);
5307 code += 1 + LINK_SIZE;
5308
5309 length_prevgroup = 3 + 3*LINK_SIZE;
5310 }
5311
5312 /* Can't determine a first byte now */
5313
5314 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5315 continue;
5316
5317
5318 /* ------------------------------------------------------------ */
5319 default: /* Other characters: check option setting */
5320 OTHER_CHAR_AFTER_QUERY:
5321 set = unset = 0;
5322 optset = &set;
5323
5324 while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
5325 {
5326 switch (*ptr++)
5327 {
5328 case CHAR_MINUS: optset = &unset; break;
5329
5330 case CHAR_J: /* Record that it changed in the external options */
5331 *optset |= PCRE_DUPNAMES;
5332 cd->external_flags |= PCRE_JCHANGED;
5333 break;
5334
5335 case CHAR_i: *optset |= PCRE_CASELESS; break;
5336 case CHAR_m: *optset |= PCRE_MULTILINE; break;
5337 case CHAR_s: *optset |= PCRE_DOTALL; break;
5338 case CHAR_x: *optset |= PCRE_EXTENDED; break;
5339 case CHAR_U: *optset |= PCRE_UNGREEDY; break;
5340 case CHAR_X: *optset |= PCRE_EXTRA; break;
5341
5342 default: *errorcodeptr = ERR12;
5343 ptr--; /* Correct the offset */
5344 goto FAILED;
5345 }
5346 }
5347
5348 /* Set up the changed option bits, but don't change anything yet. */
5349
5350 newoptions = (options | set) & (~unset);
5351
5352 /* If the options ended with ')' this is not the start of a nested
5353 group with option changes, so the options change at this level. If this
5354 item is right at the start of the pattern, the options can be
5355 abstracted and made external in the pre-compile phase, and ignored in
5356 the compile phase. This can be helpful when matching -- for instance in
5357 caseless checking of required bytes.
5358
5359 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
5360 definitely *not* at the start of the pattern because something has been
5361 compiled. In the pre-compile phase, however, the code pointer can have
5362 that value after the start, because it gets reset as code is discarded
5363 during the pre-compile. However, this can happen only at top level - if
5364 we are within parentheses, the starting BRA will still be present. At
5365 any parenthesis level, the length value can be used to test if anything
5366 has been compiled at that level. Thus, a test for both these conditions
5367 is necessary to ensure we correctly detect the start of the pattern in
5368 both phases.
5369
5370 If we are not at the pattern start, compile code to change the ims
5371 options if this setting actually changes any of them, and reset the
5372 greedy defaults and the case value for firstbyte and reqbyte. */
5373
5374 if (*ptr == CHAR_RIGHT_PARENTHESIS)
5375 {
5376 if (code == cd->start_code + 1 + LINK_SIZE &&
5377 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
5378 {
5379 cd->external_options = newoptions;
5380 }
5381 else
5382 {
5383 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
5384 {
5385 *code++ = OP_OPT;
5386 *code++ = newoptions & PCRE_IMS;
5387 }
5388 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
5389 greedy_non_default = greedy_default ^ 1;
5390 req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
5391 }
5392
5393 /* Change options at this level, and pass them back for use
5394 in subsequent branches. When not at the start of the pattern, this
5395 information is also necessary so that a resetting item can be
5396 compiled at the end of a group (if we are in a group). */
5397
5398 *optionsptr = options = newoptions;
5399 previous = NULL; /* This item can't be repeated */
5400 continue; /* It is complete */
5401 }
5402
5403 /* If the options ended with ':' we are heading into a nested group
5404 with possible change of options. Such groups are non-capturing and are
5405 not assertions of any kind. All we need to do is skip over the ':';
5406 the newoptions value is handled below. */
5407
5408 bravalue = OP_BRA;
5409 ptr++;
5410 } /* End of switch for character following (? */
5411 } /* End of (? handling */
5412
5413 /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
5414 is set, all unadorned brackets become non-capturing and behave like (?:...)
5415 brackets. */
5416
5417 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
5418 {
5419 bravalue = OP_BRA;
5420 }
5421
5422 /* Else we have a capturing group. */
5423
5424 else
5425 {
5426 NUMBERED_GROUP:
5427 cd->bracount += 1;
5428 PUT2(code, 1+LINK_SIZE, cd->bracount);
5429 skipbytes = 2;
5430 }
5431
5432 /* Process nested bracketed regex. Assertions may not be repeated, but
5433 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
5434 non-register variable in order to be able to pass its address because some
5435 compilers complain otherwise. Pass in a new setting for the ims options if
5436 they have changed. */
5437
5438 previous = (bravalue >= OP_ONCE)? code : NULL;
5439 *code = bravalue;
5440 tempcode = code;
5441 tempreqvary = cd->req_varyopt; /* Save value before bracket */
5442 length_prevgroup = 0; /* Initialize for pre-compile phase */
5443
5444 if (!compile_regex(
5445 newoptions, /* The complete new option state */
5446 options & PCRE_IMS, /* The previous ims option state */
5447 &tempcode, /* Where to put code (updated) */
5448 &ptr, /* Input pointer (updated) */
5449 errorcodeptr, /* Where to put an error message */
5450 (bravalue == OP_ASSERTBACK ||
5451 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
5452 reset_bracount, /* True if (?| group */
5453 skipbytes, /* Skip over bracket number */
5454 &subfirstbyte, /* For possible first char */
5455 &subreqbyte, /* For possible last char */
5456 bcptr, /* Current branch chain */
5457 cd, /* Tables block */
5458 (lengthptr == NULL)? NULL : /* Actual compile phase */
5459 &length_prevgroup /* Pre-compile phase */
5460 ))
5461 goto FAILED;
5462
5463 /* At the end of compiling, code is still pointing to the start of the
5464 group, while tempcode has been updated to point past the end of the group
5465 and any option resetting that may follow it. The pattern pointer (ptr)
5466 is on the bracket. */
5467
5468 /* If this is a conditional bracket, check that there are no more than
5469 two branches in the group, or just one if it's a DEFINE group. We do this
5470 in the real compile phase, not in the pre-pass, where the whole group may
5471 not be available. */
5472
5473 if (bravalue == OP_COND && lengthptr == NULL)
5474 {
5475 uschar *tc = code;
5476 int condcount = 0;
5477
5478 do {
5479 condcount++;
5480 tc += GET(tc,1);
5481 }
5482 while (*tc != OP_KET);
5483
5484 /* A DEFINE group is never obeyed inline (the "condition" is always
5485 false). It must have only one branch. */
5486
5487 if (code[LINK_SIZE+1] == OP_DEF)
5488 {
5489 if (condcount > 1)
5490 {
5491 *errorcodeptr = ERR54;
5492 goto FAILED;
5493 }
5494 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
5495 }
5496
5497 /* A "normal" conditional group. If there is just one branch, we must not
5498 make use of its firstbyte or reqbyte, because this is equivalent to an
5499 empty second branch. */
5500
5501 else
5502 {
5503 if (condcount > 2)
5504 {
5505 *errorcodeptr = ERR27;
5506 goto FAILED;
5507 }
5508 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
5509 }
5510 }
5511
5512 /* Error if hit end of pattern */
5513
5514 if (*ptr != CHAR_RIGHT_PARENTHESIS)
5515 {
5516 *errorcodeptr = ERR14;
5517 goto FAILED;
5518 }
5519
5520 /* In the pre-compile phase, update the length by the length of the group,
5521 less the brackets at either end. Then reduce the compiled code to just a
5522 set of non-capturing brackets so that it doesn't use much memory if it is
5523 duplicated by a quantifier.*/
5524
5525 if (lengthptr != NULL)
5526 {
5527 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
5528 {
5529 *errorcodeptr = ERR20;
5530 goto FAILED;
5531 }
5532 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5533 *code++ = OP_BRA;
5534 PUTINC(code, 0, 1 + LINK_SIZE);
5535 *code++ = OP_KET;
5536 PUTINC(code, 0, 1 + LINK_SIZE);
5537 break; /* No need to waste time with special character handling */
5538 }
5539
5540 /* Otherwise update the main code pointer to the end of the group. */
5541
5542 code = tempcode;
5543
5544 /* For a DEFINE group, required and first character settings are not
5545 relevant. */
5546
5547 if (bravalue == OP_DEF) break;
5548
5549 /* Handle updating of the required and first characters for other types of
5550 group. Update for normal brackets of all kinds, and conditions with two
5551 branches (see code above). If the bracket is followed by a quantifier with
5552 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5553 zerofirstbyte outside the main loop so that they can be accessed for the
5554 back off. */
5555
5556 zeroreqbyte = reqbyte;
5557 zerofirstbyte = firstbyte;
5558 groupsetfirstbyte = FALSE;
5559
5560 if (bravalue >= OP_ONCE)
5561 {
5562 /* If we have not yet set a firstbyte in this branch, take it from the
5563 subpattern, remembering that it was set here so that a repeat of more
5564 than one can replicate it as reqbyte if necessary. If the subpattern has
5565 no firstbyte, set "none" for the whole branch. In both cases, a zero
5566 repeat forces firstbyte to "none". */
5567
5568 if (firstbyte == REQ_UNSET)
5569 {
5570 if (subfirstbyte >= 0)
5571 {
5572 firstbyte = subfirstbyte;
5573 groupsetfirstbyte = TRUE;
5574 }
5575 else firstbyte = REQ_NONE;
5576 zerofirstbyte = REQ_NONE;
5577 }
5578
5579 /* If firstbyte was previously set, convert the subpattern's firstbyte
5580 into reqbyte if there wasn't one, using the vary flag that was in
5581 existence beforehand. */
5582
5583 else if (subfirstbyte >= 0 && subreqbyte < 0)
5584 subreqbyte = subfirstbyte | tempreqvary;
5585
5586 /* If the subpattern set a required byte (or set a first byte that isn't
5587 really the first byte - see above), set it. */
5588
5589 if (subreqbyte >= 0) reqbyte = subreqbyte;
5590 }
5591
5592 /* For a forward assertion, we take the reqbyte, if set. This can be
5593 helpful if the pattern that follows the assertion doesn't set a different
5594 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5595 for an assertion, however because it leads to incorrect effect for patterns
5596 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5597 of a firstbyte. This is overcome by a scan at the end if there's no
5598 firstbyte, looking for an asserted first char. */
5599
5600 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5601 break; /* End of processing '(' */
5602
5603
5604 /* ===================================================================*/
5605 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5606 are arranged to be the negation of the corresponding OP_values. For the
5607 back references, the values are ESC_REF plus the reference number. Only
5608 back references and those types that consume a character may be repeated.
5609 We can test for values between ESC_b and ESC_Z for the latter; this may
5610 have to change if any new ones are ever created. */
5611
5612 case CHAR_BACKSLASH:
5613 tempptr = ptr;
5614 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5615 if (*errorcodeptr != 0) goto FAILED;
5616
5617 if (c < 0)
5618 {
5619 if (-c == ESC_Q) /* Handle start of quoted string */
5620 {
5621 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5622 ptr += 2; /* avoid empty string */
5623 else inescq = TRUE;
5624 continue;
5625 }
5626
5627 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
5628
5629 /* For metasequences that actually match a character, we disable the
5630 setting of a first character if it hasn't already been set. */
5631
5632 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5633 firstbyte = REQ_NONE;
5634
5635 /* Set values to reset to if this is followed by a zero repeat. */
5636
5637 zerofirstbyte = firstbyte;
5638 zeroreqbyte = reqbyte;
5639
5640 /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5641 is a subroutine call by number (Oniguruma syntax). In fact, the value
5642 -ESC_g is returned only for these cases. So we don't need to check for <
5643 or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5644 -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5645 that is a synonym for a named back reference). */
5646
5647 if (-c == ESC_g)
5648 {
5649 const uschar *p;
5650 save_hwm = cd->hwm; /* Normally this is set when '(' is read */
5651 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5652 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5653
5654 /* These two statements stop the compiler from warning about possibly
5655 unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5656 fact, because we actually check for a number below, the paths that
5657 would actually be in error are never taken. */
5658
5659 skipbytes = 0;
5660 reset_bracount = FALSE;
5661
5662 /* Test for a name */
5663
5664 if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
5665 {
5666 BOOL isnumber = TRUE;
5667 for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5668 {
5669 if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5670 if ((cd->ctypes[*p] & ctype_word) == 0) break;
5671 }
5672 if (*p != terminator)
5673 {
5674 *errorcodeptr = ERR57;
5675 break;
5676 }
5677 if (isnumber)
5678 {
5679 ptr++;
5680 goto HANDLE_NUMERICAL_RECURSION;
5681 }
5682 is_recurse = TRUE;
5683 goto NAMED_REF_OR_RECURSE;
5684 }
5685
5686 /* Test a signed number in angle brackets or quotes. */
5687
5688 p = ptr + 2;
5689 while ((digitab[*p] & ctype_digit) != 0) p++;
5690 if (*p != terminator)
5691 {
5692 *errorcodeptr = ERR57;
5693 break;
5694 }
5695 ptr++;
5696 goto HANDLE_NUMERICAL_RECURSION;
5697 }
5698
5699 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5700 We also support \k{name} (.NET syntax) */
5701
5702 if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||
5703 ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))
5704 {
5705 is_recurse = FALSE;
5706 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5707 CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
5708 CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
5709 goto NAMED_REF_OR_RECURSE;
5710 }
5711
5712 /* Back references are handled specially; must disable firstbyte if
5713 not set to cope with cases like (?=(\w+))\1: which would otherwise set
5714 ':' later. */
5715
5716 if (-c >= ESC_REF)
5717 {
5718 open_capitem *oc;
5719 recno = -c - ESC_REF;
5720
5721 HANDLE_REFERENCE: /* Come here from named backref handling */
5722 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5723 previous = code;
5724 *code++ = OP_REF;
5725 PUT2INC(code, 0, recno);
5726 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5727 if (recno > cd->top_backref) cd->top_backref = recno;
5728
5729 /* Check to see if this back reference is recursive, that is, it
5730 is inside the group that it references. A flag is set so that the
5731 group can be made atomic. */
5732
5733 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
5734 {
5735 if (oc->number == recno)
5736 {
5737 oc->flag = TRUE;
5738 break;
5739 }
5740 }
5741 }
5742
5743 /* So are Unicode property matches, if supported. */
5744
5745 #ifdef SUPPORT_UCP
5746 else if (-c == ESC_P || -c == ESC_p)
5747 {
5748 BOOL negated;
5749 int pdata;
5750 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5751 if (ptype < 0) goto FAILED;
5752 previous = code;
5753 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5754 *code++ = ptype;
5755 *code++ = pdata;
5756 }
5757 #else
5758
5759 /* If Unicode properties are not supported, \X, \P, and \p are not
5760 allowed. */
5761
5762 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5763 {
5764 *errorcodeptr = ERR45;
5765 goto FAILED;
5766 }
5767 #endif
5768
5769 /* For the rest (including \X when Unicode properties are supported), we
5770 can obtain the OP value by negating the escape value. */
5771
5772 else
5773 {
5774 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5775 *code++ = -c;
5776 }
5777 continue;
5778 }
5779
5780 /* We have a data character whose value is in c. In UTF-8 mode it may have
5781 a value > 127. We set its representation in the length/buffer, and then
5782 handle it as a data character. */
5783
5784 #ifdef SUPPORT_UTF8
5785 if (utf8 && c > 127)
5786 mclength = _pcre_ord2utf8(c, mcbuffer);
5787 else
5788 #endif
5789
5790 {
5791 mcbuffer[0] = c;
5792 mclength = 1;
5793 }
5794 goto ONE_CHAR;
5795
5796
5797 /* ===================================================================*/
5798 /* Handle a literal character. It is guaranteed not to be whitespace or #
5799 when the extended flag is set. If we are in UTF-8 mode, it may be a
5800 multi-byte literal character. */
5801
5802 default:
5803 NORMAL_CHAR:
5804 mclength = 1;
5805 mcbuffer[0] = c;
5806
5807 #ifdef SUPPORT_UTF8
5808 if (utf8 && c >= 0xc0)
5809 {
5810 while ((ptr[1] & 0xc0) == 0x80)
5811 mcbuffer[mclength++] = *(++ptr);
5812 }
5813 #endif
5814
5815 /* At this point we have the character's bytes in mcbuffer, and the length
5816 in mclength. When not in UTF-8 mode, the length is always 1. */
5817
5818 ONE_CHAR:
5819 previous = code;
5820 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5821 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5822
5823 /* Remember if \r or \n were seen */
5824
5825 if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
5826 cd->external_flags |= PCRE_HASCRORLF;
5827
5828 /* Set the first and required bytes appropriately. If no previous first
5829 byte, set it from this character, but revert to none on a zero repeat.
5830 Otherwise, leave the firstbyte value alone, and don't change it on a zero
5831 repeat. */
5832
5833 if (firstbyte == REQ_UNSET)
5834 {
5835 zerofirstbyte = REQ_NONE;
5836 zeroreqbyte = reqbyte;
5837
5838 /* If the character is more than one byte long, we can set firstbyte
5839 only if it is not to be matched caselessly. */
5840
5841 if (mclength == 1 || req_caseopt == 0)
5842 {
5843 firstbyte = mcbuffer[0] | req_caseopt;
5844 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5845 }
5846 else firstbyte = reqbyte = REQ_NONE;
5847 }
5848
5849 /* firstbyte was previously set; we can set reqbyte only if the length is
5850 1 or the matching is caseful. */
5851
5852 else
5853 {
5854 zerofirstbyte = firstbyte;
5855 zeroreqbyte = reqbyte;
5856 if (mclength == 1 || req_caseopt == 0)
5857 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5858 }
5859
5860 break; /* End of literal character handling */
5861 }
5862 } /* end of big loop */
5863
5864
5865 /* Control never reaches here by falling through, only by a goto for all the
5866 error states. Pass back the position in the pattern so that it can be displayed
5867 to the user for diagnosing the error. */
5868
5869 FAILED:
5870 *ptrptr = ptr;
5871 return FALSE;
5872 }
5873
5874
5875
5876
5877 /*************************************************
5878 * Compile sequence of alternatives *
5879 *************************************************/
5880
5881 /* On entry, ptr is pointing past the bracket character, but on return it
5882 points to the closing bracket, or vertical bar, or end of string. The code
5883 variable is pointing at the byte into which the BRA operator has been stored.
5884 If the ims options are changed at the start (for a (?ims: group) or during any
5885 branch, we need to insert an OP_OPT item at the start of every following branch
5886 to ensure they get set correctly at run time, and also pass the new options
5887 into every subsequent branch compile.
5888
5889 This function is used during the pre-compile phase when we are trying to find
5890 out the amount of memory needed, as well as during the real compile phase. The
5891 value of lengthptr distinguishes the two phases.
5892
5893 Arguments:
5894 options option bits, including any changes for this subpattern
5895 oldims previous settings of ims option bits
5896 codeptr -> the address of the current code pointer
5897 ptrptr -> the address of the current pattern pointer
5898 errorcodeptr -> pointer to error code variable
5899 lookbehind TRUE if this is a lookbehind assertion
5900 reset_bracount TRUE to reset the count for each branch
5901 skipbytes skip this many bytes at start (for brackets and OP_COND)
5902 firstbyteptr place to put the first required character, or a negative number
5903 reqbyteptr place to put the last required character, or a negative number
5904 bcptr pointer to the chain of currently open branches
5905 cd points to the data block with tables pointers etc.
5906 lengthptr NULL during the real compile phase
5907 points to length accumulator during pre-compile phase
5908
5909 Returns: TRUE on success
5910 */
5911
5912 static BOOL
5913 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5914 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5915 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5916 int *lengthptr)
5917 {
5918 const uschar *ptr = *ptrptr;
5919 uschar *code = *codeptr;
5920 uschar *last_branch = code;
5921 uschar *start_bracket = code;
5922 uschar *reverse_count = NULL;
5923 open_capitem capitem;
5924 int capnumber = 0;
5925 int firstbyte, reqbyte;
5926 int branchfirstbyte, branchreqbyte;
5927 int length;
5928 int orig_bracount;
5929 int max_bracount;
5930 int old_external_options = cd->external_options;
5931 branch_chain bc;
5932
5933 bc.outer = bcptr;
5934 bc.current_branch = code;
5935
5936 firstbyte = reqbyte = REQ_UNSET;
5937
5938 /* Accumulate the length for use in the pre-compile phase. Start with the
5939 length of the BRA and KET and any extra bytes that are required at the
5940 beginning. We accumulate in a local variable to save frequent testing of
5941 lengthptr for NULL. We cannot do this by looking at the value of code at the
5942 start and end of each alternative, because compiled items are discarded during
5943 the pre-compile phase so that the work space is not exceeded. */
5944
5945 length = 2 + 2*LINK_SIZE + skipbytes;
5946
5947 /* WARNING: If the above line is changed for any reason, you must also change
5948 the code that abstracts option settings at the start of the pattern and makes
5949 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5950 pre-compile phase to find out whether anything has yet been compiled or not. */
5951
5952 /* If this is a capturing subpattern, add to the chain of open capturing items
5953 so that we can detect them if (*ACCEPT) is encountered. This is also used to
5954 detect groups that contain recursive back references to themselves. */
5955
5956 if (*code == OP_CBRA)
5957 {
5958 capnumber = GET2(code, 1 + LINK_SIZE);
5959 capitem.number = capnumber;
5960 capitem.next = cd->open_caps;
5961 capitem.flag = FALSE;
5962 cd->open_caps = &capitem;
5963 }
5964
5965 /* Offset is set zero to mark that this bracket is still open */
5966
5967 PUT(code, 1, 0);
5968 code += 1 + LINK_SIZE + skipbytes;
5969
5970 /* Loop for each alternative branch */
5971
5972 orig_bracount = max_bracount = cd->bracount;
5973 for (;;)
5974 {
5975 /* For a (?| group, reset the capturing bracket count so that each branch
5976 uses the same numbers. */
5977
5978 if (reset_bracount) cd->bracount = orig_bracount;
5979
5980 /* Handle a change of ims options at the start of the branch */
5981
5982 if ((options & PCRE_IMS) != oldims)
5983 {
5984 *code++ = OP_OPT;
5985 *code++ = options & PCRE_IMS;
5986 length += 2;
5987 }
5988
5989 /* Set up dummy OP_REVERSE if lookbehind assertion */
5990
5991 if (lookbehind)
5992 {
5993 *code++ = OP_REVERSE;
5994 reverse_count = code;
5995 PUTINC(code, 0, 0);
5996 length += 1 + LINK_SIZE;
5997 }
5998
5999 /* Now compile the branch; in the pre-compile phase its length gets added
6000 into the length. */
6001
6002 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
6003 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
6004 {
6005 *ptrptr = ptr;
6006 return FALSE;
6007 }
6008
6009 /* If the external options have changed during this branch, it means that we
6010 are at the top level, and a leading option setting has been encountered. We
6011 need to re-set the original option values to take account of this so that,
6012 during the pre-compile phase, we know to allow for a re-set at the start of
6013 subsequent branches. */
6014
6015 if (old_external_options != cd->external_options)
6016 oldims = cd->external_options & PCRE_IMS;
6017
6018 /* Keep the highest bracket count in case (?| was used and some branch
6019 has fewer than the rest. */
6020
6021 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
6022
6023 /* In the real compile phase, there is some post-processing to be done. */
6024
6025 if (lengthptr == NULL)
6026 {
6027 /* If this is the first branch, the firstbyte and reqbyte values for the
6028 branch become the values for the regex. */
6029
6030 if (*last_branch != OP_ALT)
6031 {
6032 firstbyte = branchfirstbyte;
6033 reqbyte = branchreqbyte;
6034 }
6035
6036 /* If this is not the first branch, the first char and reqbyte have to
6037 match the values from all the previous branches, except that if the
6038 previous value for reqbyte didn't have REQ_VARY set, it can still match,
6039 and we set REQ_VARY for the regex. */
6040
6041 else
6042 {
6043 /* If we previously had a firstbyte, but it doesn't match the new branch,
6044 we have to abandon the firstbyte for the regex, but if there was
6045 previously no reqbyte, it takes on the value of the old firstbyte. */
6046
6047 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
6048 {
6049 if (reqbyte < 0) reqbyte = firstbyte;
6050 firstbyte = REQ_NONE;
6051 }
6052
6053 /* If we (now or from before) have no firstbyte, a firstbyte from the
6054 branch becomes a reqbyte if there isn't a branch reqbyte. */
6055
6056 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
6057 branchreqbyte = branchfirstbyte;
6058
6059 /* Now ensure that the reqbytes match */
6060
6061 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
6062 reqbyte = REQ_NONE;
6063 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
6064 }
6065
6066 /* If lookbehind, check that this branch matches a fixed-length string, and
6067 put the length into the OP_REVERSE item. Temporarily mark the end of the
6068 branch with OP_END. If the branch contains OP_RECURSE, the result is -3
6069 because there may be forward references that we can't check here. Set a
6070 flag to cause another lookbehind check at the end. Why not do it all at the
6071 end? Because common, erroneous checks are picked up here and the offset of
6072 the problem can be shown. */
6073
6074 if (lookbehind)
6075 {
6076 int fixed_length;
6077 *code = OP_END;
6078 fixed_length = find_fixedlength(last_branch, options, FALSE, cd);
6079 DPRINTF(("fixed length = %d\n", fixed_length));
6080 if (fixed_length == -3)
6081 {
6082 cd->check_lookbehind = TRUE;
6083 }
6084 else if (fixed_length < 0)
6085 {
6086 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
6087 *ptrptr = ptr;
6088 return FALSE;
6089 }
6090 else { PUT(reverse_count, 0, fixed_length); }
6091 }
6092 }
6093
6094 /* Reached end of expression, either ')' or end of pattern. In the real
6095 compile phase, go back through the alternative branches and reverse the chain
6096 of offsets, with the field in the BRA item now becoming an offset to the
6097 first alternative. If there are no alternatives, it points to the end of the
6098 group. The length in the terminating ket is always the length of the whole
6099 bracketed item. If any of the ims options were changed inside the group,
6100 compile a resetting op-code following, except at the very end of the pattern.
6101 Return leaving the pointer at the terminating char. */
6102
6103 if (*ptr != CHAR_VERTICAL_LINE)
6104 {
6105 if (lengthptr == NULL)
6106 {
6107 int branch_length = code - last_branch;
6108 do
6109 {
6110 int prev_length = GET(last_branch, 1);
6111 PUT(last_branch, 1, branch_length);
6112 branch_length = prev_length;
6113 last_branch -= branch_length;
6114 }
6115 while (branch_length > 0);
6116 }
6117
6118 /* Fill in the ket */
6119
6120 *code = OP_KET;
6121 PUT(code, 1, code - start_bracket);
6122 code += 1 + LINK_SIZE;
6123
6124 /* If it was a capturing subpattern, check to see if it contained any
6125 recursive back references. If so, we must wrap it in atomic brackets.
6126 In any event, remove the block from the chain. */
6127
6128 if (capnumber > 0)
6129 {
6130 if (cd->open_caps->flag)
6131 {
6132 memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
6133 code - start_bracket);
6134 *start_bracket = OP_ONCE;
6135 code += 1 + LINK_SIZE;
6136 PUT(start_bracket, 1, code - start_bracket);
6137 *code = OP_KET;
6138 PUT(code, 1, code - start_bracket);
6139 code += 1 + LINK_SIZE;
6140 length += 2 + 2*LINK_SIZE;
6141 }
6142 cd->open_caps = cd->open_caps->next;
6143 }
6144
6145 /* Reset options if needed. */
6146
6147 if ((options & PCRE_IMS) != oldims && *ptr == CHAR_RIGHT_PARENTHESIS)
6148 {
6149 *code++ = OP_OPT;
6150 *code++ = oldims;
6151 length += 2;
6152 }
6153
6154 /* Retain the highest bracket number, in case resetting was used. */
6155
6156 cd->bracount = max_bracount;
6157
6158 /* Set values to pass back */
6159
6160 *codeptr = code;
6161 *ptrptr = ptr;
6162 *firstbyteptr = firstbyte;
6163 *reqbyteptr = reqbyte;
6164 if (lengthptr != NULL)
6165 {
6166 if (OFLOW_MAX - *lengthptr < length)
6167 {
6168 *errorcodeptr = ERR20;
6169 return FALSE;
6170 }
6171 *lengthptr += length;
6172 }
6173 return TRUE;
6174 }
6175
6176 /* Another branch follows. In the pre-compile phase, we can move the code
6177 pointer back to where it was for the start of the first branch. (That is,
6178 pretend that each branch is the only one.)
6179
6180 In the real compile phase, insert an ALT node. Its length field points back
6181 to the previous branch while the bracket remains open. At the end the chain
6182 is reversed. It's done like this so that the start of the bracket has a
6183 zero offset until it is closed, making it possible to detect recursion. */
6184
6185 if (lengthptr != NULL)
6186 {
6187 code = *codeptr + 1 + LINK_SIZE + skipbytes;
6188 length += 1 + LINK_SIZE;
6189 }
6190 else
6191 {
6192 *code = OP_ALT;
6193 PUT(code, 1, code - last_branch);
6194 bc.current_branch = last_branch = code;
6195 code += 1 + LINK_SIZE;
6196 }
6197
6198 ptr++;
6199 }
6200 /* Control never reaches here */
6201 }
6202
6203
6204
6205
6206 /*************************************************
6207 * Check for anchored expression *
6208 *************************************************/
6209
6210 /* Try to find out if this is an anchored regular expression. Consider each
6211 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
6212 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
6213 it's anchored. However, if this is a multiline pattern, then only OP_SOD
6214 counts, since OP_CIRC can match in the middle.
6215
6216 We can also consider a regex to be anchored if OP_SOM starts all its branches.
6217 This is the code for \G, which means "match at start of match position, taking
6218 into account the match offset".
6219
6220 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
6221 because that will try the rest of the pattern at all possible matching points,
6222 so there is no point trying again.... er ....
6223
6224 .... except when the .* appears inside capturing parentheses, and there is a
6225 subsequent back reference to those parentheses. We haven't enough information
6226 to catch that case precisely.
6227
6228 At first, the best we could do was to detect when .* was in capturing brackets
6229 and the highest back reference was greater than or equal to that level.
6230 However, by keeping a bitmap of the first 31 back references, we can catch some
6231 of the more common cases more precisely.
6232
6233 Arguments:
6234 code points to start of expression (the bracket)
6235 options points to the options setting
6236 bracket_map a bitmap of which brackets we are inside while testing; this
6237 handles up to substring 31; after that we just have to take
6238 the less precise approach
6239 backref_map the back reference bitmap
6240
6241 Returns: TRUE or FALSE
6242 */
6243
6244 static BOOL
6245 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
6246 unsigned int backref_map)
6247 {
6248 do {
6249 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6250 options, PCRE_MULTILINE, FALSE);
6251 register int op = *scode;
6252
6253 /* Non-capturing brackets */
6254
6255 if (op == OP_BRA)
6256 {
6257 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
6258 }
6259
6260 /* Capturing brackets */
6261
6262 else if (op == OP_CBRA)
6263 {
6264 int n = GET2(scode, 1+LINK_SIZE);
6265 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
6266 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
6267 }
6268
6269 /* Other brackets */
6270
6271 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
6272 {
6273 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
6274 }
6275
6276 /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
6277 it isn't in brackets that are or may be referenced. */
6278
6279 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
6280 op == OP_TYPEPOSSTAR))
6281 {
6282 if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
6283 return FALSE;
6284 }
6285
6286 /* Check for explicit anchoring */
6287
6288 else if (op != OP_SOD && op != OP_SOM &&
6289 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
6290 return FALSE;
6291 code += GET(code, 1);
6292 }
6293 while (*code == OP_ALT); /* Loop for each alternative */
6294 return TRUE;
6295 }
6296
6297
6298
6299 /*************************************************
6300 * Check for starting with ^ or .* *
6301 *************************************************/
6302
6303 /* This is called to find out if every branch starts with ^ or .* so that
6304 "first char" processing can be done to speed things up in multiline
6305 matching and for non-DOTALL patterns that start with .* (which must start at
6306 the beginning or after \n). As in the case of is_anchored() (see above), we
6307 have to take account of back references to capturing brackets that contain .*
6308 because in that case we can't make the assumption.
6309
6310 Arguments:
6311 code points to start of expression (the bracket)
6312 bracket_map a bitmap of which brackets we are inside while testing; this
6313 handles up to substring 31; after that we just have to take
6314 the less precise approach
6315 backref_map the back reference bitmap
6316
6317 Returns: TRUE or FALSE
6318 */
6319
6320 static BOOL
6321 is_startline(const uschar *code, unsigned int bracket_map,
6322 unsigned int backref_map)
6323 {
6324 do {
6325 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6326 NULL, 0, FALSE);
6327 register int op = *scode;
6328
6329 /* If we are at the start of a conditional assertion group, *both* the
6330 conditional assertion *and* what follows the condition must satisfy the test
6331 for start of line. Other kinds of condition fail. Note that there may be an
6332 auto-callout at the start of a condition. */
6333
6334 if (op == OP_COND)
6335 {
6336 scode += 1 + LINK_SIZE;
6337 if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT];
6338 switch (*scode)
6339 {
6340 case OP_CREF:
6341 case OP_NCREF:
6342 case OP_RREF:
6343 case OP_NRREF:
6344 case OP_DEF:
6345 return FALSE;
6346
6347 default: /* Assertion */
6348 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6349 do scode += GET(scode, 1); while (*scode == OP_ALT);
6350 scode += 1 + LINK_SIZE;
6351 break;
6352 }
6353 scode = first_significant_code(scode, NULL, 0, FALSE);
6354 op = *scode;
6355 }
6356
6357 /* Non-capturing brackets */
6358
6359 if (op == OP_BRA)
6360 {
6361 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6362 }
6363
6364 /* Capturing brackets */
6365
6366 else if (op == OP_CBRA)
6367 {
6368 int n = GET2(scode, 1+LINK_SIZE);
6369 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
6370 if (!is_startline(scode, new_map, backref_map)) return FALSE;
6371 }
6372
6373 /* Other brackets */
6374
6375 else if (op == OP_ASSERT || op == OP_ONCE)
6376 {
6377 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6378 }
6379
6380 /* .* means "start at start or after \n" if it isn't in brackets that
6381 may be referenced. */
6382
6383 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
6384 {
6385 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
6386 }
6387
6388 /* Check for explicit circumflex */
6389
6390 else if (op != OP_CIRC) return FALSE;
6391
6392 /* Move on to the next alternative */
6393
6394 code += GET(code, 1);
6395 }
6396 while (*code == OP_ALT); /* Loop for each alternative */
6397 return TRUE;
6398 }
6399
6400
6401
6402 /*************************************************
6403 * Check for asserted fixed first char *
6404 *************************************************/
6405
6406 /* During compilation, the "first char" settings from forward assertions are
6407 discarded, because they can cause conflicts with actual literals that follow.
6408 However, if we end up without a first char setting for an unanchored pattern,
6409 it is worth scanning the regex to see if there is an initial asserted first
6410 char. If all branches start with the same asserted char, or with a bracket all
6411 of whose alternatives start with the same asserted char (recurse ad lib), then
6412 we return that char, otherwise -1.
6413
6414 Arguments:
6415 code points to start of expression (the bracket)
6416 options pointer to the options (used to check casing changes)
6417 inassert TRUE if in an assertion
6418
6419 Returns: -1 or the fixed first char
6420 */
6421
6422 static int
6423 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
6424 {
6425 register int c = -1;
6426 do {
6427 int d;
6428 const uschar *scode =
6429 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
6430 register int op = *scode;
6431
6432 switch(op)
6433 {
6434 default:
6435 return -1;
6436
6437 case OP_BRA:
6438 case OP_CBRA:
6439 case OP_ASSERT:
6440 case OP_ONCE:
6441 case OP_COND:
6442 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
6443 return -1;
6444 if (c < 0) c = d; else if (c != d) return -1;
6445 break;
6446
6447 case OP_EXACT: /* Fall through */
6448 scode += 2;
6449
6450 case OP_CHAR:
6451 case OP_CHARNC:
6452 case OP_PLUS:
6453 case OP_MINPLUS:
6454 case OP_POSPLUS:
6455 if (!inassert) return -1;
6456 if (c < 0)
6457 {
6458 c = scode[1];
6459 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
6460 }
6461 else if (c != scode[1]) return -1;
6462 break;
6463 }
6464
6465 code += GET(code, 1);
6466 }
6467 while (*code == OP_ALT);
6468 return c;
6469 }
6470
6471
6472
6473 /*************************************************
6474 * Compile a Regular Expression *
6475 *************************************************/
6476
6477 /* This function takes a string and returns a pointer to a block of store
6478 holding a compiled version of the expression. The original API for this
6479 function had no error code return variable; it is retained for backwards
6480 compatibility. The new function is given a new name.
6481
6482 Arguments:
6483 pattern the regular expression
6484 options various option bits
6485 errorcodeptr pointer to error code variable (pcre_compile2() only)
6486 can be NULL if you don't want a code value
6487 errorptr pointer to pointer to error text
6488 erroroffset ptr offset in pattern where error was detected
6489 tables pointer to character tables or NULL
6490
6491 Returns: pointer to compiled data block, or NULL on error,
6492 with errorptr and erroroffset set
6493 */
6494
6495 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6496 pcre_compile(const char *pattern, int options, const char **errorptr,
6497 int *erroroffset, const unsigned char *tables)
6498 {
6499 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
6500 }
6501
6502
6503 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6504 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
6505 const char **errorptr, int *erroroffset, const unsigned char *tables)
6506 {
6507 real_pcre *re;
6508 int length = 1; /* For final END opcode */
6509 int firstbyte, reqbyte, newline;
6510 int errorcode = 0;
6511 int skipatstart = 0;
6512 BOOL utf8 = (options & PCRE_UTF8) != 0;
6513 size_t size;
6514 uschar *code;
6515 const uschar *codestart;
6516 const uschar *ptr;
6517 compile_data compile_block;
6518 compile_data *cd = &compile_block;
6519
6520 /* This space is used for "compiling" into during the first phase, when we are
6521 computing the amount of memory that is needed. Compiled items are thrown away
6522 as soon as possible, so that a fairly large buffer should be sufficient for
6523 this purpose. The same space is used in the second phase for remembering where
6524 to fill in forward references to subpatterns. */
6525
6526 uschar cworkspace[COMPILE_WORK_SIZE];
6527
6528 /* Set this early so that early errors get offset 0. */
6529
6530 ptr = (const uschar *)pattern;
6531
6532 /* We can't pass back an error message if errorptr is NULL; I guess the best we
6533 can do is just return NULL, but we can set a code value if there is a code
6534 pointer. */
6535
6536 if (errorptr == NULL)
6537 {
6538 if (errorcodeptr != NULL) *errorcodeptr = 99;
6539 return NULL;
6540 }
6541
6542 *errorptr = NULL;
6543 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
6544
6545 /* However, we can give a message for this error */
6546
6547 if (erroroffset == NULL)
6548 {
6549 errorcode = ERR16;
6550 goto PCRE_EARLY_ERROR_RETURN2;
6551 }
6552
6553 *erroroffset = 0;
6554
6555 /* Set up pointers to the individual character tables */
6556
6557 if (tables == NULL) tables = _pcre_default_tables;
6558 cd->lcc = tables + lcc_offset;
6559 cd->fcc = tables + fcc_offset;
6560 cd->cbits = tables + cbits_offset;
6561 cd->ctypes = tables + ctypes_offset;
6562
6563 /* Check that all undefined public option bits are zero */
6564
6565 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
6566 {
6567 errorcode = ERR17;
6568 goto PCRE_EARLY_ERROR_RETURN;
6569 }
6570
6571 /* Check for global one-time settings at the start of the pattern, and remember
6572 the offset for later. */
6573
6574 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
6575 ptr[skipatstart+1] == CHAR_ASTERISK)
6576 {
6577 int newnl = 0;
6578 int newbsr = 0;
6579
6580 if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)
6581 { skipatstart += 7; options |= PCRE_UTF8; continue; }
6582
6583 if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
6584 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
6585 else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3) == 0)
6586 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
6587 else if (strncmp((char *)(ptr+skipatstart+2), STRING_CRLF_RIGHTPAR, 5) == 0)
6588 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
6589 else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANY_RIGHTPAR, 4) == 0)
6590 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
6591 else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANYCRLF_RIGHTPAR, 8) == 0)
6592 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
6593
6594 else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
6595 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
6596 else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
6597 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
6598
6599 if (newnl != 0)
6600 options = (options & ~PCRE_NEWLINE_BITS) | newnl;
6601 else if (newbsr != 0)
6602 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
6603 else break;
6604 }
6605
6606 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
6607
6608 #ifdef SUPPORT_UTF8
6609 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
6610 (*erroroffset = _pcre_valid_utf8((USPTR)pattern, -1)) >= 0)
6611 {
6612 errorcode = ERR44;
6613 goto PCRE_EARLY_ERROR_RETURN2;
6614 }
6615 #else
6616 if (utf8)
6617 {
6618 errorcode = ERR32;
6619 goto PCRE_EARLY_ERROR_RETURN;
6620 }
6621 #endif
6622
6623 /* Check validity of \R options. */
6624
6625 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6626 {
6627 case 0:
6628 case PCRE_BSR_ANYCRLF:
6629 case PCRE_BSR_UNICODE:
6630 break;
6631 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6632 }
6633
6634 /* Handle different types of newline. The three bits give seven cases. The
6635 current code allows for fixed one- or two-byte sequences, plus "any" and
6636 "anycrlf". */
6637
6638 switch (options & PCRE_NEWLINE_BITS)
6639 {
6640 case 0: newline = NEWLINE; break; /* Build-time default */
6641 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6642 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6643 case PCRE_NEWLINE_CR+
6644 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6645 case PCRE_NEWLINE_ANY: newline = -1; break;
6646 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6647 default: errorc