/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 503 - (show annotations)
Sun Mar 7 17:35:52 2010 UTC (9 years, 6 months ago) by ph10
File MIME type: text/plain
File size: 222952 byte(s)
Error occurred while calculating annotation data.
Fix incorrect compile time error for certain types of recursive patterns.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2010 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57 also used by pcretest. PCRE_DEBUG is not defined when building a production
58 library. */
59
60 #ifdef PCRE_DEBUG
61 #include "pcre_printint.src"
62 #endif
63
64
/* Macro for setting individual bits in class bitmaps. Both arguments and the
whole expansion are parenthesized so that expression arguments, for example
SETBIT(map, c + 1), index and shift by the intended value. Note that b is
evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
68
69 /* Maximum length value to check against when making sure that the integer that
70 holds the compiled pattern length does not overflow. We make it a bit less than
71 INT_MAX to allow for adding in group terminating bytes, so that we don't have
72 to check them every time. */
73
74 #define OFLOW_MAX (INT_MAX - 20)
75
76
77 /*************************************************
78 * Code parameters and static tables *
79 *************************************************/
80
81 /* This value specifies the size of stack workspace that is used during the
82 first pre-compile phase that determines how much memory is required. The regex
83 is partly compiled into this space, but the compiled parts are discarded as
84 soon as they can be, so that hopefully there will never be an overrun. The code
85 does, however, check for an overrun. The largest amount I've seen used is 218,
86 so this number is very generous.
87
88 The same workspace is used during the second, actual compile phase for
89 remembering forward references to groups so that they can be filled in at the
90 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
91 is 4 there is plenty of room. */
92
93 #define COMPILE_WORK_SIZE (4096)
94
95
96 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
97 are simple data values; negative values are for special things like \d and so
98 on. Zero means further processing is needed (for things like \x), or the escape
99 is invalid. */
100
101 #ifndef EBCDIC
102
103 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
104 in UTF-8 mode. */
105
106 static const short int escapes[] = {
107 0, 0,
108 0, 0,
109 0, 0,
110 0, 0,
111 0, 0,
112 CHAR_COLON, CHAR_SEMICOLON,
113 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
114 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
115 CHAR_COMMERCIAL_AT, -ESC_A,
116 -ESC_B, -ESC_C,
117 -ESC_D, -ESC_E,
118 0, -ESC_G,
119 -ESC_H, 0,
120 0, -ESC_K,
121 0, 0,
122 0, 0,
123 -ESC_P, -ESC_Q,
124 -ESC_R, -ESC_S,
125 0, 0,
126 -ESC_V, -ESC_W,
127 -ESC_X, 0,
128 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
129 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
130 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
131 CHAR_GRAVE_ACCENT, 7,
132 -ESC_b, 0,
133 -ESC_d, ESC_e,
134 ESC_f, 0,
135 -ESC_h, 0,
136 0, -ESC_k,
137 0, 0,
138 ESC_n, 0,
139 -ESC_p, 0,
140 ESC_r, -ESC_s,
141 ESC_tee, 0,
142 -ESC_v, -ESC_w,
143 0, 0,
144 -ESC_z
145 };
146
147 #else
148
149 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
150
151 static const short int escapes[] = {
152 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
153 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
154 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
155 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
156 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
157 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
158 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
159 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
160 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
161 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
162 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
163 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
164 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
165 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
166 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
167 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
168 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
169 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
170 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
171 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
172 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
173 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
174 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
175 };
176 #endif
177
178
179 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
180 searched linearly. Put all the names into a single string, in order to reduce
181 the number of relocations when a shared library is dynamically linked. The
182 string is built from string macros so that it works in UTF-8 mode on EBCDIC
183 platforms. */
184
185 typedef struct verbitem {
186 int len;
187 int op;
188 } verbitem;
189
190 static const char verbnames[] =
191 STRING_ACCEPT0
192 STRING_COMMIT0
193 STRING_F0
194 STRING_FAIL0
195 STRING_PRUNE0
196 STRING_SKIP0
197 STRING_THEN;
198
199 static const verbitem verbs[] = {
200 { 6, OP_ACCEPT },
201 { 6, OP_COMMIT },
202 { 1, OP_FAIL },
203 { 4, OP_FAIL },
204 { 5, OP_PRUNE },
205 { 4, OP_SKIP },
206 { 4, OP_THEN }
207 };
208
209 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
210
211
212 /* Tables of names of POSIX character classes and their lengths. The names are
213 now all in a single string, to reduce the number of relocations when a shared
214 library is dynamically loaded. The list of lengths is terminated by a zero
215 length entry. The first three must be alpha, lower, upper, as this is assumed
216 for handling case independence. */
217
218 static const char posix_names[] =
219 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
220 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
221 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
222 STRING_word0 STRING_xdigit;
223
224 static const uschar posix_name_lengths[] = {
225 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
226
227 /* Table of class bit maps for each POSIX class. Each class is formed from a
228 base map, with an optional addition or removal of another map. Then, for some
229 classes, there is some additional tweaking: for [:blank:] the vertical space
230 characters are removed, and for [:alpha:] and [:alnum:] the underscore
231 character is removed. The triples in the table consist of the base map offset,
232 second map offset or -1 if no second map, and a non-negative value for map
233 addition or a negative value for map subtraction (if there are two maps). The
234 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
235 remove vertical space characters, 2 => remove underscore. */
236
237 static const int posix_class_maps[] = {
238 cbit_word, cbit_digit, -2, /* alpha */
239 cbit_lower, -1, 0, /* lower */
240 cbit_upper, -1, 0, /* upper */
241 cbit_word, -1, 2, /* alnum - word without underscore */
242 cbit_print, cbit_cntrl, 0, /* ascii */
243 cbit_space, -1, 1, /* blank - a GNU extension */
244 cbit_cntrl, -1, 0, /* cntrl */
245 cbit_digit, -1, 0, /* digit */
246 cbit_graph, -1, 0, /* graph */
247 cbit_print, -1, 0, /* print */
248 cbit_punct, -1, 0, /* punct */
249 cbit_space, -1, 0, /* space */
250 cbit_word, -1, 0, /* word - a Perl extension */
251 cbit_xdigit,-1, 0 /* xdigit */
252 };
253
254
255 #define STRING(a) # a
256 #define XSTRING(s) STRING(s)
257
258 /* The texts of compile-time error messages. These are "char *" because they
259 are passed to the outside world. Do not ever re-use any error number, because
260 they are documented. Always add a new error instead. Messages marked DEAD below
261 are no longer used. This used to be a table of strings, but in order to reduce
262 the number of relocations needed when a shared library is loaded dynamically,
263 it is now one long string. We cannot use a table of offsets, because the
264 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
265 simply count through to the one we want - this isn't a performance issue
266 because these strings are used only when there is a compilation error.
267
268 Each substring ends with \0 to insert a null character. This includes the final
269 substring, so that the whole string ends with \0\0, which can be detected when
270 counting through. */
271
272 static const char error_texts[] =
273 "no error\0"
274 "\\ at end of pattern\0"
275 "\\c at end of pattern\0"
276 "unrecognized character follows \\\0"
277 "numbers out of order in {} quantifier\0"
278 /* 5 */
279 "number too big in {} quantifier\0"
280 "missing terminating ] for character class\0"
281 "invalid escape sequence in character class\0"
282 "range out of order in character class\0"
283 "nothing to repeat\0"
284 /* 10 */
285 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
286 "internal error: unexpected repeat\0"
287 "unrecognized character after (? or (?-\0"
288 "POSIX named classes are supported only within a class\0"
289 "missing )\0"
290 /* 15 */
291 "reference to non-existent subpattern\0"
292 "erroffset passed as NULL\0"
293 "unknown option bit(s) set\0"
294 "missing ) after comment\0"
295 "parentheses nested too deeply\0" /** DEAD **/
296 /* 20 */
297 "regular expression is too large\0"
298 "failed to get memory\0"
299 "unmatched parentheses\0"
300 "internal error: code overflow\0"
301 "unrecognized character after (?<\0"
302 /* 25 */
303 "lookbehind assertion is not fixed length\0"
304 "malformed number or name after (?(\0"
305 "conditional group contains more than two branches\0"
306 "assertion expected after (?(\0"
307 "(?R or (?[+-]digits must be followed by )\0"
308 /* 30 */
309 "unknown POSIX class name\0"
310 "POSIX collating elements are not supported\0"
311 "this version of PCRE is not compiled with PCRE_UTF8 support\0"
312 "spare error\0" /** DEAD **/
313 "character value in \\x{...} sequence is too large\0"
314 /* 35 */
315 "invalid condition (?(0)\0"
316 "\\C not allowed in lookbehind assertion\0"
317 "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
318 "number after (?C is > 255\0"
319 "closing ) for (?C expected\0"
320 /* 40 */
321 "recursive call could loop indefinitely\0"
322 "unrecognized character after (?P\0"
323 "syntax error in subpattern name (missing terminator)\0"
324 "two named subpatterns have the same name\0"
325 "invalid UTF-8 string\0"
326 /* 45 */
327 "support for \\P, \\p, and \\X has not been compiled\0"
328 "malformed \\P or \\p sequence\0"
329 "unknown property name after \\P or \\p\0"
330 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
331 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
332 /* 50 */
333 "repeated subpattern is too long\0" /** DEAD **/
334 "octal value is greater than \\377 (not in UTF-8 mode)\0"
335 "internal error: overran compiling workspace\0"
336 "internal error: previously-checked referenced subpattern not found\0"
337 "DEFINE group contains more than one branch\0"
338 /* 55 */
339 "repeating a DEFINE group is not allowed\0"
340 "inconsistent NEWLINE options\0"
341 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
342 "a numbered reference must not be zero\0"
343 "(*VERB) with an argument is not supported\0"
344 /* 60 */
345 "(*VERB) not recognized\0"
346 "number is too big\0"
347 "subpattern name expected\0"
348 "digit expected after (?+\0"
349 "] is an invalid data character in JavaScript compatibility mode\0"
350 /* 65 */
351 "different names for subpatterns of the same number are not allowed\0";
352
353 /* Table to identify digits and hex digits. This is used when compiling
354 patterns. Note that the tables in chartables are dependent on the locale, and
355 may mark arbitrary characters as digits - but the PCRE compiling code expects
356 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
357 a private table here. It costs 256 bytes, but it is a lot faster than doing
358 character value tests (at least in some simple cases I timed), and in some
359 applications one wants PCRE to compile efficiently as well as match
360 efficiently.
361
362 For convenience, we use the same bit definitions as in chartables:
363
364 0x04 decimal digit
365 0x08 hexadecimal digit
366
367 Then we can use ctype_digit and ctype_xdigit in the code. */
368
369 #ifndef EBCDIC
370
371 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
372 UTF-8 mode. */
373
374 static const unsigned char digitab[] =
375 {
376 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
377 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
378 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
379 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
380 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
381 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
382 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
383 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
384 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
385 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
386 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
387 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
388 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
389 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
390 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
391 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
392 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
393 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
394 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
395 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
396 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
397 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
398 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
399 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
400 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
401 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
402 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
403 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
404 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
405 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
406 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
407 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
408
409 #else
410
411 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
412
413 static const unsigned char digitab[] =
414 {
415 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
416 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
417 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
418 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
419 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
420 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
421 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
422 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
423 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
424 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
425 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
426 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
427 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
428 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
429 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
430 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
431 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
432 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
433 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
434 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
435 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
436 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
437 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
438 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
439 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
440 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
441 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
442 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
443 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
444 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
445 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
446 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
447
448 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
449 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
450 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
451 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
452 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
453 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
454 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
455 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
456 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
457 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
458 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
459 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
460 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
461 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
462 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
463 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
464 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
465 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
466 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
467 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
468 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
469 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
470 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
471 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
472 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
473 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
474 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
475 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
476 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
477 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
478 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
479 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
480 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
481 #endif
482
483
484 /* Definition to allow mutual recursion */
485
486 static BOOL
487 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
488 int *, int *, branch_chain *, compile_data *, int *);
489
490
491
492 /*************************************************
493 * Find an error text *
494 *************************************************/
495
496 /* The error texts are now all in one long string, to save on relocations. As
497 some of the text is of unknown length, we can't use a table of offsets.
498 Instead, just count through the strings. This is not a performance issue
499 because it happens only when there has been a compilation error.
500
501 Argument: the error number
502 Returns: pointer to the error string
503 */
504
505 static const char *
506 find_error_text(int n)
507 {
508 const char *s = error_texts;
509 for (; n > 0; n--)
510 {
511 while (*s++ != 0) {};
512 if (*s == 0) return "Error text not found (please report)";
513 }
514 return s;
515 }
516
517
518 /*************************************************
519 * Handle escapes *
520 *************************************************/
521
522 /* This function is called when a \ has been encountered. It either returns a
523 positive value for a simple escape such as \n, or a negative value which
524 encodes one of the more complicated things such as \d. A backreference to group
525 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
526 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
527 ptr is pointing at the \. On exit, it is on the final character of the escape
528 sequence.
529
530 Arguments:
531 ptrptr points to the pattern position pointer
532 errorcodeptr points to the errorcode variable
533 bracount number of previous extracting brackets
534 options the options bits
535 isclass TRUE if inside a character class
536
537 Returns: zero or positive => a data character
538 negative => a special escape sequence
539 on error, errorcodeptr is set
540 */
541
542 static int
543 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
544 int options, BOOL isclass)
545 {
546 BOOL utf8 = (options & PCRE_UTF8) != 0;
547 const uschar *ptr = *ptrptr + 1;
548 int c, i;
549
550 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
551 ptr--; /* Set pointer back to the last byte */
552
553 /* If backslash is at the end of the pattern, it's an error. */
554
555 if (c == 0) *errorcodeptr = ERR1;
556
557 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
558 in a table. A non-zero result is something that can be returned immediately.
559 Otherwise further processing may be required. */
560
561 #ifndef EBCDIC /* ASCII/UTF-8 coding */
562 else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
563 else if ((i = escapes[c - CHAR_0]) != 0) c = i;
564
565 #else /* EBCDIC coding */
566 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
567 else if ((i = escapes[c - 0x48]) != 0) c = i;
568 #endif
569
570 /* Escapes that need further processing, or are illegal. */
571
572 else
573 {
574 const uschar *oldptr;
575 BOOL braced, negated;
576
577 switch (c)
578 {
579 /* A number of Perl escapes are not handled by PCRE. We give an explicit
580 error. */
581
582 case CHAR_l:
583 case CHAR_L:
584 case CHAR_N:
585 case CHAR_u:
586 case CHAR_U:
587 *errorcodeptr = ERR37;
588 break;
589
590 /* \g must be followed by one of a number of specific things:
591
592 (1) A number, either plain or braced. If positive, it is an absolute
593 backreference. If negative, it is a relative backreference. This is a Perl
594 5.10 feature.
595
596 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
597 is part of Perl's movement towards a unified syntax for back references. As
598 this is synonymous with \k{name}, we fudge it up by pretending it really
599 was \k.
600
601 (3) For Oniguruma compatibility we also support \g followed by a name or a
602 number either in angle brackets or in single quotes. However, these are
603 (possibly recursive) subroutine calls, _not_ backreferences. Just return
604 the -ESC_g code (cf \k). */
605
606 case CHAR_g:
607 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
608 {
609 c = -ESC_g;
610 break;
611 }
612
613 /* Handle the Perl-compatible cases */
614
615 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
616 {
617 const uschar *p;
618 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
619 if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
620 if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
621 {
622 c = -ESC_k;
623 break;
624 }
625 braced = TRUE;
626 ptr++;
627 }
628 else braced = FALSE;
629
630 if (ptr[1] == CHAR_MINUS)
631 {
632 negated = TRUE;
633 ptr++;
634 }
635 else negated = FALSE;
636
637 c = 0;
638 while ((digitab[ptr[1]] & ctype_digit) != 0)
639 c = c * 10 + *(++ptr) - CHAR_0;
640
641 if (c < 0) /* Integer overflow */
642 {
643 *errorcodeptr = ERR61;
644 break;
645 }
646
647 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
648 {
649 *errorcodeptr = ERR57;
650 break;
651 }
652
653 if (c == 0)
654 {
655 *errorcodeptr = ERR58;
656 break;
657 }
658
659 if (negated)
660 {
661 if (c > bracount)
662 {
663 *errorcodeptr = ERR15;
664 break;
665 }
666 c = bracount - (c - 1);
667 }
668
669 c = -(ESC_REF + c);
670 break;
671
672 /* The handling of escape sequences consisting of a string of digits
673 starting with one that is not zero is not straightforward. By experiment,
674 the way Perl works seems to be as follows:
675
676 Outside a character class, the digits are read as a decimal number. If the
677 number is less than 10, or if there are that many previous extracting
678 left brackets, then it is a back reference. Otherwise, up to three octal
679 digits are read to form an escaped byte. Thus \123 is likely to be octal
680 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
681 value is greater than 377, the least significant 8 bits are taken. Inside a
682 character class, \ followed by a digit is always an octal number. */
683
684 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
685 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
686
687 if (!isclass)
688 {
689 oldptr = ptr;
690 c -= CHAR_0;
691 while ((digitab[ptr[1]] & ctype_digit) != 0)
692 c = c * 10 + *(++ptr) - CHAR_0;
693 if (c < 0) /* Integer overflow */
694 {
695 *errorcodeptr = ERR61;
696 break;
697 }
698 if (c < 10 || c <= bracount)
699 {
700 c = -(ESC_REF + c);
701 break;
702 }
703 ptr = oldptr; /* Put the pointer back and fall through */
704 }
705
706 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
707 generates a binary zero byte and treats the digit as a following literal.
708 Thus we have to pull back the pointer by one. */
709
710 if ((c = *ptr) >= CHAR_8)
711 {
712 ptr--;
713 c = 0;
714 break;
715 }
716
717 /* \0 always starts an octal number, but we may drop through to here with a
718 larger first octal digit. The original code used just to take the least
719 significant 8 bits of octal numbers (I think this is what early Perls used
720 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
721 than 3 octal digits. */
722
723 case CHAR_0:
724 c -= CHAR_0;
725 while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
726 c = c * 8 + *(++ptr) - CHAR_0;
727 if (!utf8 && c > 255) *errorcodeptr = ERR51;
728 break;
729
730 /* \x is complicated. \x{ddd} is a character number which can be greater
731 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
732 treated as a data character. */
733
734 case CHAR_x:
735 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
736 {
737 const uschar *pt = ptr + 2;
738 int count = 0;
739
740 c = 0;
741 while ((digitab[*pt] & ctype_xdigit) != 0)
742 {
743 register int cc = *pt++;
744 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
745 count++;
746
747 #ifndef EBCDIC /* ASCII/UTF-8 coding */
748 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
749 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
750 #else /* EBCDIC coding */
751 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
752 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
753 #endif
754 }
755
756 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
757 {
758 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
759 ptr = pt;
760 break;
761 }
762
763 /* If the sequence of hex digits does not end with '}', then we don't
764 recognize this construct; fall through to the normal \x handling. */
765 }
766
767 /* Read just a single-byte hex-defined char */
768
769 c = 0;
770 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
771 {
772 int cc; /* Some compilers don't like */
773 cc = *(++ptr); /* ++ in initializers */
774 #ifndef EBCDIC /* ASCII/UTF-8 coding */
775 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
776 c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
777 #else /* EBCDIC coding */
778 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
779 c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
780 #endif
781 }
782 break;
783
784 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
785 This coding is ASCII-specific, but then the whole concept of \cx is
786 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
787
788 case CHAR_c:
789 c = *(++ptr);
790 if (c == 0)
791 {
792 *errorcodeptr = ERR2;
793 break;
794 }
795
796 #ifndef EBCDIC /* ASCII/UTF-8 coding */
797 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
798 c ^= 0x40;
799 #else /* EBCDIC coding */
800 if (c >= CHAR_a && c <= CHAR_z) c += 64;
801 c ^= 0xC0;
802 #endif
803 break;
804
805 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
806 other alphanumeric following \ is an error if PCRE_EXTRA was set;
807 otherwise, for Perl compatibility, it is a literal. This code looks a bit
808 odd, but there used to be some cases other than the default, and there may
809 be again in future, so I haven't "optimized" it. */
810
811 default:
812 if ((options & PCRE_EXTRA) != 0) switch(c)
813 {
814 default:
815 *errorcodeptr = ERR3;
816 break;
817 }
818 break;
819 }
820 }
821
822 *ptrptr = ptr;
823 return c;
824 }
825
826
827
828 #ifdef SUPPORT_UCP
829 /*************************************************
830 * Handle \P and \p *
831 *************************************************/
832
833 /* This function is called after \P or \p has been encountered, provided that
834 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
835 pointing at the P or p. On exit, it is pointing at the final character of the
836 escape sequence.
837
838 Arguments:
839 ptrptr points to the pattern position pointer
840 negptr points to a boolean that is set TRUE for negation else FALSE
841 dptr points to an int that is set to the detailed property value
842 errorcodeptr points to the error code variable
843
844 Returns: type value from the _pcre_utt table, or -1 (with the error code set) for an invalid type
845 */
846
847 static int
848 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
849 {
850 int c, i, bot, top;
851 const uschar *ptr = *ptrptr;
852 char name[32];
853
854 c = *(++ptr);
855 if (c == 0) goto ERROR_RETURN;
856
857 *negptr = FALSE;
858
859 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
860 negation. */
861
862 if (c == CHAR_LEFT_CURLY_BRACKET)
863 {
864 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
865 {
866 *negptr = TRUE;
867 ptr++;
868 }
869 for (i = 0; i < (int)sizeof(name) - 1; i++)
870 {
871 c = *(++ptr);
872 if (c == 0) goto ERROR_RETURN;
873 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
874 name[i] = c;
875 }
876 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
877 name[i] = 0;
878 }
879
880 /* Otherwise there is just one following character */
881
882 else
883 {
884 name[0] = c;
885 name[1] = 0;
886 }
887
888 *ptrptr = ptr;
889
890 /* Search for a recognized property name using binary chop */
891
892 bot = 0;
893 top = _pcre_utt_size;
894
895 while (bot < top)
896 {
897 i = (bot + top) >> 1;
898 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
899 if (c == 0)
900 {
901 *dptr = _pcre_utt[i].value;
902 return _pcre_utt[i].type;
903 }
904 if (c > 0) bot = i + 1; else top = i;
905 }
906
907 *errorcodeptr = ERR47;
908 *ptrptr = ptr;
909 return -1;
910
911 ERROR_RETURN:
912 *errorcodeptr = ERR46;
913 *ptrptr = ptr;
914 return -1;
915 }
916 #endif
917
918
919
920
921 /*************************************************
922 * Check for counted repeat *
923 *************************************************/
924
925 /* This function is called when a '{' is encountered in a place where it might
926 start a quantifier. It looks ahead to see if it really is a quantifier or not.
927 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
928 where the ddds are digits.
929
930 Arguments:
931 p pointer to the first char after '{'
932
933 Returns: TRUE or FALSE
934 */
935
936 static BOOL
937 is_counted_repeat(const uschar *p)
938 {
939 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
940 while ((digitab[*p] & ctype_digit) != 0) p++;
941 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
942
943 if (*p++ != CHAR_COMMA) return FALSE;
944 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
945
946 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
947 while ((digitab[*p] & ctype_digit) != 0) p++;
948
949 return (*p == CHAR_RIGHT_CURLY_BRACKET);
950 }
951
952
953
954 /*************************************************
955 * Read repeat counts *
956 *************************************************/
957
958 /* Read an item of the form {n,m} and return the values. This is called only
959 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
960 so the syntax is guaranteed to be correct, but we need to check the values.
961
962 Arguments:
963 p pointer to first char after '{'
964 minp pointer to int for min
965 maxp pointer to int for max
966 returned as -1 if no max
967 errorcodeptr points to error code variable
968
969 Returns: pointer to '}' on success;
970 current ptr on error, with errorcodeptr set non-zero
971 */
972
973 static const uschar *
974 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
975 {
976 int min = 0;
977 int max = -1;
978
979 /* Read the minimum value and do a paranoid check: a negative value indicates
980 an integer overflow. */
981
982 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
983 if (min < 0 || min > 65535)
984 {
985 *errorcodeptr = ERR5;
986 return p;
987 }
988
989 /* Read the maximum value if there is one, and again do a paranoid on its size.
990 Also, max must not be less than min. */
991
992 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
993 {
994 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
995 {
996 max = 0;
997 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
998 if (max < 0 || max > 65535)
999 {
1000 *errorcodeptr = ERR5;
1001 return p;
1002 }
1003 if (max < min)
1004 {
1005 *errorcodeptr = ERR4;
1006 return p;
1007 }
1008 }
1009 }
1010
1011 /* Fill in the required variables, and pass back the pointer to the terminating
1012 '}'. */
1013
1014 *minp = min;
1015 *maxp = max;
1016 return p;
1017 }
1018
1019
1020
1021 /*************************************************
1022 * Subroutine for finding forward reference *
1023 *************************************************/
1024
1025 /* This recursive function is called only from find_parens() below. The
1026 top-level call starts at the beginning of the pattern. All other calls must
1027 start at a parenthesis. It scans along a pattern's text looking for capturing
1028 subpatterns, and counting them. If it finds a named pattern that matches the
1029 name it is given, it returns its number. Alternatively, if the name is NULL, it
1030 returns when it reaches a given numbered subpattern. We know that if (?P< is
1031 encountered, the name will be terminated by '>' because that is checked in the
1032 first pass. Recursion is used to keep track of subpatterns that reset the
1033 capturing group numbers - the (?| feature.
1034
1035 Arguments:
1036 ptrptr address of the current character pointer (updated)
1037 cd compile background data
1038 name name to seek, or NULL if seeking a numbered subpattern
1039 lorn name length, or subpattern number if name is NULL
1040 xmode TRUE if we are in /x mode
1041 count pointer to the current capturing subpattern number (updated)
1042
1043 Returns: the number of the named subpattern, or -1 if not found
1044 */
1045
1046 static int
1047 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1048 BOOL xmode, int *count)
1049 {
1050 uschar *ptr = *ptrptr;
1051 int start_count = *count;
1052 int hwm_count = start_count;
1053 BOOL dup_parens = FALSE;
1054
1055 /* If the first character is a parenthesis, check on the type of group we are
1056 dealing with. The very first call may not start with a parenthesis. */
1057
1058 if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1059 {
1060 if (ptr[1] == CHAR_QUESTION_MARK &&
1061 ptr[2] == CHAR_VERTICAL_LINE)
1062 {
1063 ptr += 3;
1064 dup_parens = TRUE;
1065 }
1066
1067 /* Handle a normal, unnamed capturing parenthesis */
1068
1069 else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1070 {
1071 *count += 1;
1072 if (name == NULL && *count == lorn) return *count;
1073 ptr++;
1074 }
1075
1076 /* Handle a condition. If it is an assertion, just carry on so that it
1077 is processed as normal. If not, skip to the closing parenthesis of the
1078 condition (there can't be any nested parens. */
1079
1080 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1081 {
1082 ptr += 2;
1083 if (ptr[1] != CHAR_QUESTION_MARK)
1084 {
1085 while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1086 if (*ptr != 0) ptr++;
1087 }
1088 }
1089
1090 /* We have either (? or (* and not a condition */
1091
1092 else
1093 {
1094 ptr += 2;
1095 if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1096
1097 /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1098
1099 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1100 ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1101 {
1102 int term;
1103 const uschar *thisname;
1104 *count += 1;
1105 if (name == NULL && *count == lorn) return *count;
1106 term = *ptr++;
1107 if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1108 thisname = ptr;
1109 while (*ptr != term) ptr++;
1110 if (name != NULL && lorn == ptr - thisname &&
1111 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1112 return *count;
1113 term++;
1114 }
1115 }
1116 }
1117
1118 /* Past any initial parenthesis handling, scan for parentheses or vertical
1119 bars. */
1120
1121 for (; *ptr != 0; ptr++)
1122 {
1123 /* Skip over backslashed characters and also entire \Q...\E */
1124
1125 if (*ptr == CHAR_BACKSLASH)
1126 {
1127 if (*(++ptr) == 0) goto FAIL_EXIT;
1128 if (*ptr == CHAR_Q) for (;;)
1129 {
1130 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1131 if (*ptr == 0) goto FAIL_EXIT;
1132 if (*(++ptr) == CHAR_E) break;
1133 }
1134 continue;
1135 }
1136
1137 /* Skip over character classes; this logic must be similar to the way they
1138 are handled for real. If the first character is '^', skip it. Also, if the
1139 first few characters (either before or after ^) are \Q\E or \E we skip them
1140 too. This makes for compatibility with Perl. Note the use of STR macros to
1141 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1142
1143 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1144 {
1145 BOOL negate_class = FALSE;
1146 for (;;)
1147 {
1148 if (ptr[1] == CHAR_BACKSLASH)
1149 {
1150 if (ptr[2] == CHAR_E)
1151 ptr+= 2;
1152 else if (strncmp((const char *)ptr+2,
1153 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1154 ptr += 4;
1155 else
1156 break;
1157 }
1158 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1159 {
1160 negate_class = TRUE;
1161 ptr++;
1162 }
1163 else break;
1164 }
1165
1166 /* If the next character is ']', it is a data character that must be
1167 skipped, except in JavaScript compatibility mode. */
1168
1169 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1170 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1171 ptr++;
1172
1173 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1174 {
1175 if (*ptr == 0) return -1;
1176 if (*ptr == CHAR_BACKSLASH)
1177 {
1178 if (*(++ptr) == 0) goto FAIL_EXIT;
1179 if (*ptr == CHAR_Q) for (;;)
1180 {
1181 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1182 if (*ptr == 0) goto FAIL_EXIT;
1183 if (*(++ptr) == CHAR_E) break;
1184 }
1185 continue;
1186 }
1187 }
1188 continue;
1189 }
1190
1191 /* Skip comments in /x mode */
1192
1193 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1194 {
1195 while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1196 if (*ptr == 0) goto FAIL_EXIT;
1197 continue;
1198 }
1199
1200 /* Check for the special metacharacters */
1201
1202 if (*ptr == CHAR_LEFT_PARENTHESIS)
1203 {
1204 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1205 if (rc > 0) return rc;
1206 if (*ptr == 0) goto FAIL_EXIT;
1207 }
1208
1209 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1210 {
1211 if (dup_parens && *count < hwm_count) *count = hwm_count;
1212 *ptrptr = ptr;
1213 return -1;
1214 }
1215
1216 else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1217 {
1218 if (*count > hwm_count) hwm_count = *count;
1219 *count = start_count;
1220 }
1221 }
1222
1223 FAIL_EXIT:
1224 *ptrptr = ptr;
1225 return -1;
1226 }
1227
1228
1229
1230
1231 /*************************************************
1232 * Find forward referenced subpattern *
1233 *************************************************/
1234
1235 /* This function scans along a pattern's text looking for capturing
1236 subpatterns, and counting them. If it finds a named pattern that matches the
1237 name it is given, it returns its number. Alternatively, if the name is NULL, it
1238 returns when it reaches a given numbered subpattern. This is used for forward
1239 references to subpatterns. We used to be able to start this scan from the
1240 current compiling point, using the current count value from cd->bracount, and
1241 do it all in a single loop, but the addition of the possibility of duplicate
1242 subpattern numbers means that we have to scan from the very start, in order to
1243 take account of such duplicates, and to use a recursive function to keep track
1244 of the different types of group.
1245
1246 Arguments:
1247 cd compile background data
1248 name name to seek, or NULL if seeking a numbered subpattern
1249 lorn name length, or subpattern number if name is NULL
1250 xmode TRUE if we are in /x mode
1251
1252 Returns: the number of the found subpattern, or -1 if not found
1253 */
1254
1255 static int
1256 find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1257 {
1258 uschar *ptr = (uschar *)cd->start_pattern;
1259 int count = 0;
1260 int rc;
1261
1262 /* If the pattern does not start with an opening parenthesis, the first call
1263 to find_parens_sub() will scan right to the end (if necessary). However, if it
1264 does start with a parenthesis, find_parens_sub() will return when it hits the
1265 matching closing parens. That is why we have to have a loop. */
1266
1267 for (;;)
1268 {
1269 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1270 if (rc > 0 || *ptr++ == 0) break;
1271 }
1272
1273 return rc;
1274 }
1275
1276
1277
1278
1279 /*************************************************
1280 * Find first significant op code *
1281 *************************************************/
1282
1283 /* This is called by several functions that scan a compiled expression looking
1284 for a fixed first character, or an anchoring op code etc. It skips over things
1285 that do not influence this. For some calls, a change of option is important.
1286 For some calls, it makes sense to skip negative forward and all backward
1287 assertions, and also the \b assertion; for others it does not.
1288
1289 Arguments:
1290 code pointer to the start of the group
1291 options pointer to external options
1292 optbit the option bit whose changing is significant, or
1293 zero if none are
1294 skipassert TRUE if certain assertions are to be skipped
1295
1296 Returns: pointer to the first significant opcode
1297 */
1298
1299 static const uschar*
1300 first_significant_code(const uschar *code, int *options, int optbit,
1301 BOOL skipassert)
1302 {
1303 for (;;)
1304 {
1305 switch ((int)*code)
1306 {
1307 case OP_OPT:
1308 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1309 *options = (int)code[1];
1310 code += 2;
1311 break;
1312
1313 case OP_ASSERT_NOT:
1314 case OP_ASSERTBACK:
1315 case OP_ASSERTBACK_NOT:
1316 if (!skipassert) return code;
1317 do code += GET(code, 1); while (*code == OP_ALT);
1318 code += _pcre_OP_lengths[*code];
1319 break;
1320
1321 case OP_WORD_BOUNDARY:
1322 case OP_NOT_WORD_BOUNDARY:
1323 if (!skipassert) return code;
1324 /* Fall through */
1325
1326 case OP_CALLOUT:
1327 case OP_CREF:
1328 case OP_NCREF:
1329 case OP_RREF:
1330 case OP_NRREF:
1331 case OP_DEF:
1332 code += _pcre_OP_lengths[*code];
1333 break;
1334
1335 default:
1336 return code;
1337 }
1338 }
1339 /* Control never reaches here */
1340 }
1341
1342
1343
1344
1345 /*************************************************
1346 * Find the fixed length of a branch *
1347 *************************************************/
1348
1349 /* Scan a branch and compute the fixed length of subject that will match it,
1350 if the length is fixed. This is needed for dealing with backward assertions.
1351 In UTF8 mode, the result is in characters rather than bytes. The branch is
1352 temporarily terminated with OP_END when this function is called.
1353
1354 This function is called when a backward assertion is encountered, so that if it
1355 fails, the error message can point to the correct place in the pattern.
1356 However, we cannot do this when the assertion contains subroutine calls,
1357 because they can be forward references. We solve this by remembering this case
1358 and doing the check at the end; a flag specifies which mode we are running in.
1359
1360 Arguments:
1361 code points to the start of the pattern (the bracket)
1362 options the compiling options
1363 atend TRUE if called when the pattern is complete
1364 cd the "compile data" structure
1365
1366 Returns: the fixed length,
1367 or -1 if there is no fixed length,
1368 or -2 if \C was encountered
1369 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1370 */
1371
1372 static int
1373 find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1374 {
1375 int length = -1;
1376
1377 register int branchlength = 0;
1378 register uschar *cc = code + 1 + LINK_SIZE;
1379
1380 /* Scan along the opcodes for this branch. If we get to the end of the
1381 branch, check the length against that of the other branches. */
1382
1383 for (;;)
1384 {
1385 int d;
1386 uschar *ce, *cs;
1387 register int op = *cc;
1388 switch (op)
1389 {
1390 case OP_CBRA:
1391 case OP_BRA:
1392 case OP_ONCE:
1393 case OP_COND:
1394 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1395 if (d < 0) return d;
1396 branchlength += d;
1397 do cc += GET(cc, 1); while (*cc == OP_ALT);
1398 cc += 1 + LINK_SIZE;
1399 break;
1400
1401 /* Reached end of a branch; if it's a ket it is the end of a nested
1402 call. If it's ALT it is an alternation in a nested call. If it is
1403 END it's the end of the outer call. All can be handled by the same code. */
1404
1405 case OP_ALT:
1406 case OP_KET:
1407 case OP_KETRMAX:
1408 case OP_KETRMIN:
1409 case OP_END:
1410 if (length < 0) length = branchlength;
1411 else if (length != branchlength) return -1;
1412 if (*cc != OP_ALT) return length;
1413 cc += 1 + LINK_SIZE;
1414 branchlength = 0;
1415 break;
1416
1417 /* A true recursion implies not fixed length, but a subroutine call may
1418 be OK. If the subroutine is a forward reference, we can't deal with
1419 it until the end of the pattern, so return -3. */
1420
1421 case OP_RECURSE:
1422 if (!atend) return -3;
1423 cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1424 do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1425 if (cc > cs && cc < ce) return -1; /* Recursion */
1426 d = find_fixedlength(cs + 2, options, atend, cd);
1427 if (d < 0) return d;
1428 branchlength += d;
1429 cc += 1 + LINK_SIZE;
1430 break;
1431
1432 /* Skip over assertive subpatterns */
1433
1434 case OP_ASSERT:
1435 case OP_ASSERT_NOT:
1436 case OP_ASSERTBACK:
1437 case OP_ASSERTBACK_NOT:
1438 do cc += GET(cc, 1); while (*cc == OP_ALT);
1439 /* Fall through */
1440
1441 /* Skip over things that don't match chars */
1442
1443 case OP_REVERSE:
1444 case OP_CREF:
1445 case OP_NCREF:
1446 case OP_RREF:
1447 case OP_NRREF:
1448 case OP_DEF:
1449 case OP_OPT:
1450 case OP_CALLOUT:
1451 case OP_SOD:
1452 case OP_SOM:
1453 case OP_SET_SOM:
1454 case OP_EOD:
1455 case OP_EODN:
1456 case OP_CIRC:
1457 case OP_DOLL:
1458 case OP_NOT_WORD_BOUNDARY:
1459 case OP_WORD_BOUNDARY:
1460 cc += _pcre_OP_lengths[*cc];
1461 break;
1462
1463 /* Handle literal characters */
1464
1465 case OP_CHAR:
1466 case OP_CHARNC:
1467 case OP_NOT:
1468 branchlength++;
1469 cc += 2;
1470 #ifdef SUPPORT_UTF8
1471 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1472 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1473 #endif
1474 break;
1475
1476 /* Handle exact repetitions. The count is already in characters, but we
1477 need to skip over a multibyte character in UTF8 mode. */
1478
1479 case OP_EXACT:
1480 branchlength += GET2(cc,1);
1481 cc += 4;
1482 #ifdef SUPPORT_UTF8
1483 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1484 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1485 #endif
1486 break;
1487
1488 case OP_TYPEEXACT:
1489 branchlength += GET2(cc,1);
1490 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1491 cc += 4;
1492 break;
1493
1494 /* Handle single-char matchers */
1495
1496 case OP_PROP:
1497 case OP_NOTPROP:
1498 cc += 2;
1499 /* Fall through */
1500
1501 case OP_NOT_DIGIT:
1502 case OP_DIGIT:
1503 case OP_NOT_WHITESPACE:
1504 case OP_WHITESPACE:
1505 case OP_NOT_WORDCHAR:
1506 case OP_WORDCHAR:
1507 case OP_ANY:
1508 case OP_ALLANY:
1509 branchlength++;
1510 cc++;
1511 break;
1512
1513 /* The single-byte matcher isn't allowed */
1514
1515 case OP_ANYBYTE:
1516 return -2;
1517
1518 /* Check a class for variable quantification */
1519
1520 #ifdef SUPPORT_UTF8
1521 case OP_XCLASS:
1522 cc += GET(cc, 1) - 33;
1523 /* Fall through */
1524 #endif
1525
1526 case OP_CLASS:
1527 case OP_NCLASS:
1528 cc += 33;
1529
1530 switch (*cc)
1531 {
1532 case OP_CRSTAR:
1533 case OP_CRMINSTAR:
1534 case OP_CRQUERY:
1535 case OP_CRMINQUERY:
1536 return -1;
1537
1538 case OP_CRRANGE:
1539 case OP_CRMINRANGE:
1540 if (GET2(cc,1) != GET2(cc,3)) return -1;
1541 branchlength += GET2(cc,1);
1542 cc += 5;
1543 break;
1544
1545 default:
1546 branchlength++;
1547 }
1548 break;
1549
1550 /* Anything else is variable length */
1551
1552 default:
1553 return -1;
1554 }
1555 }
1556 /* Control never gets here */
1557 }
1558
1559
1560
1561
1562 /*************************************************
1563 * Scan compiled regex for specific bracket *
1564 *************************************************/
1565
1566 /* This little function scans through a compiled pattern until it finds a
1567 capturing bracket with the given number, or, if the number is negative, an
1568 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1569 so that it can be called from pcre_study() when finding the minimum matching
1570 length.
1571
1572 Arguments:
1573 code points to start of expression
1574 utf8 TRUE in UTF-8 mode
1575 number the required bracket number or negative to find a lookbehind
1576
1577 Returns: pointer to the opcode for the bracket, or NULL if not found
1578 */
1579
1580 const uschar *
1581 _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1582 {
1583 for (;;)
1584 {
1585 register int c = *code;
1586 if (c == OP_END) return NULL;
1587
1588 /* XCLASS is used for classes that cannot be represented just by a bit
1589 map. This includes negated single high-valued characters. The length in
1590 the table is zero; the actual length is stored in the compiled code. */
1591
1592 if (c == OP_XCLASS) code += GET(code, 1);
1593
1594 /* Handle recursion */
1595
1596 else if (c == OP_REVERSE)
1597 {
1598 if (number < 0) return (uschar *)code;
1599 code += _pcre_OP_lengths[c];
1600 }
1601
1602 /* Handle capturing bracket */
1603
1604 else if (c == OP_CBRA)
1605 {
1606 int n = GET2(code, 1+LINK_SIZE);
1607 if (n == number) return (uschar *)code;
1608 code += _pcre_OP_lengths[c];
1609 }
1610
1611 /* Otherwise, we can get the item's length from the table, except that for
1612 repeated character types, we have to test for \p and \P, which have an extra
1613 two bytes of parameters. */
1614
1615 else
1616 {
1617 switch(c)
1618 {
1619 case OP_TYPESTAR:
1620 case OP_TYPEMINSTAR:
1621 case OP_TYPEPLUS:
1622 case OP_TYPEMINPLUS:
1623 case OP_TYPEQUERY:
1624 case OP_TYPEMINQUERY:
1625 case OP_TYPEPOSSTAR:
1626 case OP_TYPEPOSPLUS:
1627 case OP_TYPEPOSQUERY:
1628 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1629 break;
1630
1631 case OP_TYPEUPTO:
1632 case OP_TYPEMINUPTO:
1633 case OP_TYPEEXACT:
1634 case OP_TYPEPOSUPTO:
1635 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1636 break;
1637 }
1638
1639 /* Add in the fixed length from the table */
1640
1641 code += _pcre_OP_lengths[c];
1642
1643 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1644 a multi-byte character. The length in the table is a minimum, so we have to
1645 arrange to skip the extra bytes. */
1646
1647 #ifdef SUPPORT_UTF8
1648 if (utf8) switch(c)
1649 {
1650 case OP_CHAR:
1651 case OP_CHARNC:
1652 case OP_EXACT:
1653 case OP_UPTO:
1654 case OP_MINUPTO:
1655 case OP_POSUPTO:
1656 case OP_STAR:
1657 case OP_MINSTAR:
1658 case OP_POSSTAR:
1659 case OP_PLUS:
1660 case OP_MINPLUS:
1661 case OP_POSPLUS:
1662 case OP_QUERY:
1663 case OP_MINQUERY:
1664 case OP_POSQUERY:
1665 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1666 break;
1667 }
1668 #else
1669 (void)(utf8); /* Keep compiler happy by referencing function argument */
1670 #endif
1671 }
1672 }
1673 }
1674
1675
1676
1677 /*************************************************
1678 * Scan compiled regex for recursion reference *
1679 *************************************************/
1680
1681 /* This little function scans through a compiled pattern until it finds an
1682 instance of OP_RECURSE.
1683
1684 Arguments:
1685 code points to start of expression
1686 utf8 TRUE in UTF-8 mode
1687
1688 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1689 */
1690
1691 static const uschar *
1692 find_recurse(const uschar *code, BOOL utf8)
1693 {
1694 for (;;)
1695 {
1696 register int c = *code;
1697 if (c == OP_END) return NULL;
1698 if (c == OP_RECURSE) return code;
1699
1700 /* XCLASS is used for classes that cannot be represented just by a bit
1701 map. This includes negated single high-valued characters. The length in
1702 the table is zero; the actual length is stored in the compiled code. */
1703
1704 if (c == OP_XCLASS) code += GET(code, 1);
1705
1706 /* Otherwise, we can get the item's length from the table, except that for
1707 repeated character types, we have to test for \p and \P, which have an extra
1708 two bytes of parameters. */
1709
1710 else
1711 {
1712 switch(c)
1713 {
1714 case OP_TYPESTAR:
1715 case OP_TYPEMINSTAR:
1716 case OP_TYPEPLUS:
1717 case OP_TYPEMINPLUS:
1718 case OP_TYPEQUERY:
1719 case OP_TYPEMINQUERY:
1720 case OP_TYPEPOSSTAR:
1721 case OP_TYPEPOSPLUS:
1722 case OP_TYPEPOSQUERY:
1723 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1724 break;
1725
1726 case OP_TYPEPOSUPTO:
1727 case OP_TYPEUPTO:
1728 case OP_TYPEMINUPTO:
1729 case OP_TYPEEXACT:
1730 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1731 break;
1732 }
1733
1734 /* Add in the fixed length from the table */
1735
1736 code += _pcre_OP_lengths[c];
1737
1738 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1739 by a multi-byte character. The length in the table is a minimum, so we have
1740 to arrange to skip the extra bytes. */
1741
1742 #ifdef SUPPORT_UTF8
1743 if (utf8) switch(c)
1744 {
1745 case OP_CHAR:
1746 case OP_CHARNC:
1747 case OP_EXACT:
1748 case OP_UPTO:
1749 case OP_MINUPTO:
1750 case OP_POSUPTO:
1751 case OP_STAR:
1752 case OP_MINSTAR:
1753 case OP_POSSTAR:
1754 case OP_PLUS:
1755 case OP_MINPLUS:
1756 case OP_POSPLUS:
1757 case OP_QUERY:
1758 case OP_MINQUERY:
1759 case OP_POSQUERY:
1760 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1761 break;
1762 }
1763 #else
1764 (void)(utf8); /* Keep compiler happy by referencing function argument */
1765 #endif
1766 }
1767 }
1768 }
1769
1770
1771
1772 /*************************************************
1773 * Scan compiled branch for non-emptiness *
1774 *************************************************/
1775
1776 /* This function scans through a branch of a compiled pattern to see whether it
1777 can match the empty string or not. It is called from could_be_empty()
1778 below and from compile_branch() when checking for an unlimited repeat of a
1779 group that can match nothing. Note that first_significant_code() skips over
1780 backward and negative forward assertions when its final argument is TRUE. If we
1781 hit an unclosed bracket, we return "empty" - this means we've struck an inner
1782 bracket whose current branch will already have been scanned.
1783
1784 Arguments:
1785 code points to start of search
1786 endcode points to where to stop
1787 utf8 TRUE if in UTF8 mode
1788 cd contains pointers to tables etc.
1789
1790 Returns: TRUE if what is matched could be empty
1791 */
1792
1793 static BOOL
1794 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
1795 compile_data *cd)
1796 {
1797 register int c;
1798 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1799 code < endcode;
1800 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1801 {
1802 const uschar *ccode;
1803
1804 c = *code;
1805
1806 /* Skip over forward assertions; the other assertions are skipped by
1807 first_significant_code() with a TRUE final argument. */
1808
1809 if (c == OP_ASSERT)
1810 {
1811 do code += GET(code, 1); while (*code == OP_ALT);
1812 c = *code;
1813 continue;
1814 }
1815
1816 /* Groups with zero repeats can of course be empty; skip them. */
1817
1818 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1819 {
1820 code += _pcre_OP_lengths[c];
1821 do code += GET(code, 1); while (*code == OP_ALT);
1822 c = *code;
1823 continue;
1824 }
1825
1826 /* For a recursion/subroutine call, if its end has been reached, which
1827 implies a subroutine call, we can scan it. */
1828
1829 if (c == OP_RECURSE)
1830 {
1831 const uschar *scode = cd->start_code + GET(code, 1);
1832 if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
1833 do
1834 {
1835 if (!could_be_empty_branch(scode, endcode, utf8, cd)) return FALSE;
1836 scode += GET(scode, 1);
1837 }
1838 while (*scode == OP_ALT);
1839 continue;
1840 }
1841
1842 /* For other groups, scan the branches. */
1843
1844 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1845 {
1846 BOOL empty_branch;
1847 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1848
1849 /* If a conditional group has only one branch, there is a second, implied,
1850 empty branch, so just skip over the conditional, because it could be empty.
1851 Otherwise, scan the individual branches of the group. */
1852
1853 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1854 code += GET(code, 1);
1855 else
1856 {
1857 empty_branch = FALSE;
1858 do
1859 {
1860 if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
1861 empty_branch = TRUE;
1862 code += GET(code, 1);
1863 }
1864 while (*code == OP_ALT);
1865 if (!empty_branch) return FALSE; /* All branches are non-empty */
1866 }
1867
1868 c = *code;
1869 continue;
1870 }
1871
1872 /* Handle the other opcodes */
1873
1874 switch (c)
1875 {
1876 /* Check for quantifiers after a class. XCLASS is used for classes that
1877 cannot be represented just by a bit map. This includes negated single
1878 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1879 actual length is stored in the compiled code, so we must update "code"
1880 here. */
1881
1882 #ifdef SUPPORT_UTF8
1883 case OP_XCLASS:
1884 ccode = code += GET(code, 1);
1885 goto CHECK_CLASS_REPEAT;
1886 #endif
1887
1888 case OP_CLASS:
1889 case OP_NCLASS:
1890 ccode = code + 33;
1891
1892 #ifdef SUPPORT_UTF8
1893 CHECK_CLASS_REPEAT:
1894 #endif
1895
1896 switch (*ccode)
1897 {
1898 case OP_CRSTAR: /* These could be empty; continue */
1899 case OP_CRMINSTAR:
1900 case OP_CRQUERY:
1901 case OP_CRMINQUERY:
1902 break;
1903
1904 default: /* Non-repeat => class must match */
1905 case OP_CRPLUS: /* These repeats aren't empty */
1906 case OP_CRMINPLUS:
1907 return FALSE;
1908
1909 case OP_CRRANGE:
1910 case OP_CRMINRANGE:
1911 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1912 break;
1913 }
1914 break;
1915
1916 /* Opcodes that must match a character */
1917
1918 case OP_PROP:
1919 case OP_NOTPROP:
1920 case OP_EXTUNI:
1921 case OP_NOT_DIGIT:
1922 case OP_DIGIT:
1923 case OP_NOT_WHITESPACE:
1924 case OP_WHITESPACE:
1925 case OP_NOT_WORDCHAR:
1926 case OP_WORDCHAR:
1927 case OP_ANY:
1928 case OP_ALLANY:
1929 case OP_ANYBYTE:
1930 case OP_CHAR:
1931 case OP_CHARNC:
1932 case OP_NOT:
1933 case OP_PLUS:
1934 case OP_MINPLUS:
1935 case OP_POSPLUS:
1936 case OP_EXACT:
1937 case OP_NOTPLUS:
1938 case OP_NOTMINPLUS:
1939 case OP_NOTPOSPLUS:
1940 case OP_NOTEXACT:
1941 case OP_TYPEPLUS:
1942 case OP_TYPEMINPLUS:
1943 case OP_TYPEPOSPLUS:
1944 case OP_TYPEEXACT:
1945 return FALSE;
1946
1947 /* These are going to continue, as they may be empty, but we have to
1948 fudge the length for the \p and \P cases. */
1949
1950 case OP_TYPESTAR:
1951 case OP_TYPEMINSTAR:
1952 case OP_TYPEPOSSTAR:
1953 case OP_TYPEQUERY:
1954 case OP_TYPEMINQUERY:
1955 case OP_TYPEPOSQUERY:
1956 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1957 break;
1958
1959 /* Same for these */
1960
1961 case OP_TYPEUPTO:
1962 case OP_TYPEMINUPTO:
1963 case OP_TYPEPOSUPTO:
1964 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1965 break;
1966
1967 /* End of branch */
1968
1969 case OP_KET:
1970 case OP_KETRMAX:
1971 case OP_KETRMIN:
1972 case OP_ALT:
1973 return TRUE;
1974
1975 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1976 MINUPTO, and POSUPTO may be followed by a multibyte character */
1977
1978 #ifdef SUPPORT_UTF8
1979 case OP_STAR:
1980 case OP_MINSTAR:
1981 case OP_POSSTAR:
1982 case OP_QUERY:
1983 case OP_MINQUERY:
1984 case OP_POSQUERY:
1985 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
1986 break;
1987
1988 case OP_UPTO:
1989 case OP_MINUPTO:
1990 case OP_POSUPTO:
1991 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
1992 break;
1993 #endif
1994
1995 /* None of the remaining opcodes are required to match a character. */
1996
1997 default:
1998 break;
1999 }
2000 }
2001
2002 return TRUE;
2003 }
2004
2005
2006
2007 /*************************************************
2008 * Scan compiled regex for non-emptiness *
2009 *************************************************/
2010
2011 /* This function is called to check for left recursive calls. We want to check
2012 the current branch of the current pattern to see if it could match the empty
2013 string. If it could, we must look outwards for branches at other levels,
2014 stopping when we pass beyond the bracket which is the subject of the recursion.
2015
2016 Arguments:
2017 code points to start of the recursion
2018 endcode points to where to stop (current RECURSE item)
2019 bcptr points to the chain of current (unclosed) branch starts
2020 utf8 TRUE if in UTF-8 mode
2021 cd pointers to tables etc
2022
2023 Returns: TRUE if what is matched could be empty
2024 */
2025
2026 static BOOL
2027 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
2028 BOOL utf8, compile_data *cd)
2029 {
2030 while (bcptr != NULL && bcptr->current_branch >= code)
2031 {
2032 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
2033 return FALSE;
2034 bcptr = bcptr->outer;
2035 }
2036 return TRUE;
2037 }
2038
2039
2040
2041 /*************************************************
2042 * Check for POSIX class syntax *
2043 *************************************************/
2044
2045 /* This function is called when the sequence "[:" or "[." or "[=" is
2046 encountered in a character class. It checks whether this is followed by a
2047 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2048 reach an unescaped ']' without the special preceding character, return FALSE.
2049
2050 Originally, this function only recognized a sequence of letters between the
2051 terminators, but it seems that Perl recognizes any sequence of characters,
2052 though of course unknown POSIX names are subsequently rejected. Perl gives an
2053 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2054 didn't consider this to be a POSIX class. Likewise for [:1234:].
2055
2056 The problem in trying to be exactly like Perl is in the handling of escapes. We
2057 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2058 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2059 below handles the special case of \], but does not try to do any other escape
2060 processing. This makes it different from Perl for cases such as [:l\ower:]
2061 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2062 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
2063 I think.
2064
2065 Arguments:
2066 ptr pointer to the initial [
2067 endptr where to return the end pointer
2068
2069 Returns: TRUE or FALSE
2070 */
2071
2072 static BOOL
2073 check_posix_syntax(const uschar *ptr, const uschar **endptr)
2074 {
2075 int terminator; /* Don't combine these lines; the Solaris cc */
2076 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2077 for (++ptr; *ptr != 0; ptr++)
2078 {
2079 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2080 {
2081 if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2082 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2083 {
2084 *endptr = ptr;
2085 return TRUE;
2086 }
2087 }
2088 }
2089 return FALSE;
2090 }
2091
2092
2093
2094
2095 /*************************************************
2096 * Check POSIX class name *
2097 *************************************************/
2098
2099 /* This function is called to check the name given in a POSIX-style class entry
2100 such as [:alnum:].
2101
2102 Arguments:
2103 ptr points to the first letter
2104 len the length of the name
2105
2106 Returns: a value representing the name, or -1 if unknown
2107 */
2108
2109 static int
2110 check_posix_name(const uschar *ptr, int len)
2111 {
2112 const char *pn = posix_names;
2113 register int yield = 0;
2114 while (posix_name_lengths[yield] != 0)
2115 {
2116 if (len == posix_name_lengths[yield] &&
2117 strncmp((const char *)ptr, pn, len) == 0) return yield;
2118 pn += posix_name_lengths[yield] + 1;
2119 yield++;
2120 }
2121 return -1;
2122 }
2123
2124
2125 /*************************************************
2126 * Adjust OP_RECURSE items in repeated group *
2127 *************************************************/
2128
2129 /* OP_RECURSE items contain an offset from the start of the regex to the group
2130 that is referenced. This means that groups can be replicated for fixed
2131 repetition simply by copying (because the recursion is allowed to refer to
2132 earlier groups that are outside the current group). However, when a group is
2133 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2134 inserted before it, after it has been compiled. This means that any OP_RECURSE
2135 items within it that refer to the group itself or any contained groups have to
2136 have their offsets adjusted. That one of the jobs of this function. Before it
2137 is called, the partially compiled regex must be temporarily terminated with
2138 OP_END.
2139
2140 This function has been extended with the possibility of forward references for
2141 recursions and subroutine calls. It must also check the list of such references
2142 for the group we are dealing with. If it finds that one of the recursions in
2143 the current group is on this list, it adjusts the offset in the list, not the
2144 value in the reference (which is a group number).
2145
2146 Arguments:
2147 group points to the start of the group
2148 adjust the amount by which the group is to be moved
2149 utf8 TRUE in UTF-8 mode
2150 cd contains pointers to tables etc.
2151 save_hwm the hwm forward reference pointer at the start of the group
2152
2153 Returns: nothing
2154 */
2155
2156 static void
2157 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2158 uschar *save_hwm)
2159 {
2160 uschar *ptr = group;
2161
2162 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2163 {
2164 int offset;
2165 uschar *hc;
2166
2167 /* See if this recursion is on the forward reference list. If so, adjust the
2168 reference. */
2169
2170 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2171 {
2172 offset = GET(hc, 0);
2173 if (cd->start_code + offset == ptr + 1)
2174 {
2175 PUT(hc, 0, offset + adjust);
2176 break;
2177 }
2178 }
2179
2180 /* Otherwise, adjust the recursion offset if it's after the start of this
2181 group. */
2182
2183 if (hc >= cd->hwm)
2184 {
2185 offset = GET(ptr, 1);
2186 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2187 }
2188
2189 ptr += 1 + LINK_SIZE;
2190 }
2191 }
2192
2193
2194
2195 /*************************************************
2196 * Insert an automatic callout point *
2197 *************************************************/
2198
2199 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2200 callout points before each pattern item.
2201
2202 Arguments:
2203 code current code pointer
2204 ptr current pattern pointer
2205 cd pointers to tables etc
2206
2207 Returns: new code pointer
2208 */
2209
2210 static uschar *
2211 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2212 {
2213 *code++ = OP_CALLOUT;
2214 *code++ = 255;
2215 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
2216 PUT(code, LINK_SIZE, 0); /* Default length */
2217 return code + 2*LINK_SIZE;
2218 }
2219
2220
2221
2222 /*************************************************
2223 * Complete a callout item *
2224 *************************************************/
2225
2226 /* A callout item contains the length of the next item in the pattern, which
2227 we can't fill in till after we have reached the relevant point. This is used
2228 for both automatic and manual callouts.
2229
2230 Arguments:
2231 previous_callout points to previous callout item
2232 ptr current pattern pointer
2233 cd pointers to tables etc
2234
2235 Returns: nothing
2236 */
2237
2238 static void
2239 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2240 {
2241 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2242 PUT(previous_callout, 2 + LINK_SIZE, length);
2243 }
2244
2245
2246
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It searches up the characters, looking for internal ranges of
characters in the "other" case. Each call returns the next one, updating the
start address. A returned range begins with the first character whose other
case differs from itself, and is extended for as long as the other cases of
the following characters are consecutive.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int ch, other, expect;

/* Find the first character that has a different other case */

ch = *cptr;
while (ch <= d)
  {
  other = UCD_OTHERCASE(ch);
  if (other != ch) break;
  ch++;
  }

if (ch > d) return FALSE;

*ocptr = other;
expect = other + 1;

/* Extend the range while the other-case values stay consecutive */

for (ch++; ch <= d && UCD_OTHERCASE(ch) == expect; ch++) expect++;

*odptr = expect - 1;
*cptr = ch;

return TRUE;
}
#endif  /* SUPPORT_UCP */
2292
2293
2294
2295 /*************************************************
2296 * Check if auto-possessifying is possible *
2297 *************************************************/
2298
2299 /* This function is called for unlimited repeats of certain items, to see
2300 whether the next thing could possibly match the repeated item. If not, it makes
2301 sense to automatically possessify the repeated item.
2302
2303 Arguments:
2304 op_code the repeated op code
2305 this data for this item, depends on the opcode
2306 utf8 TRUE in UTF-8 mode
2307 utf8_char used for utf8 character bytes, NULL if not relevant
2308 ptr next character in pattern
2309 options options bits
2310 cd contains pointers to tables etc.
2311
2312 Returns: TRUE if possessifying is wanted
2313 */
2314
2315 static BOOL
2316 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2317 const uschar *ptr, int options, compile_data *cd)
2318 {
2319 int next;
2320
2321 /* Skip whitespace and comments in extended mode */
2322
2323 if ((options & PCRE_EXTENDED) != 0)
2324 {
2325 for (;;)
2326 {
2327 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2328 if (*ptr == CHAR_NUMBER_SIGN)
2329 {
2330 while (*(++ptr) != 0)
2331 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2332 }
2333 else break;
2334 }
2335 }
2336
2337 /* If the next item is one that we can handle, get its value. A non-negative
2338 value is a character, a negative value is an escape value. */
2339
2340 if (*ptr == CHAR_BACKSLASH)
2341 {
2342 int temperrorcode = 0;
2343 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2344 if (temperrorcode != 0) return FALSE;
2345 ptr++; /* Point after the escape sequence */
2346 }
2347
2348 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2349 {
2350 #ifdef SUPPORT_UTF8
2351 if (utf8) { GETCHARINC(next, ptr); } else
2352 #endif
2353 next = *ptr++;
2354 }
2355
2356 else return FALSE;
2357
2358 /* Skip whitespace and comments in extended mode */
2359
2360 if ((options & PCRE_EXTENDED) != 0)
2361 {
2362 for (;;)
2363 {
2364 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2365 if (*ptr == CHAR_NUMBER_SIGN)
2366 {
2367 while (*(++ptr) != 0)
2368 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2369 }
2370 else break;
2371 }
2372 }
2373
2374 /* If the next thing is itself optional, we have to give up. */
2375
2376 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2377 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2378 return FALSE;
2379
2380 /* Now compare the next item with the previous opcode. If the previous is a
2381 positive single character match, "item" either contains the character or, if
2382 "item" is greater than 127 in utf8 mode, the character's bytes are in
2383 utf8_char. */
2384
2385
2386 /* Handle cases when the next item is a character. */
2387
2388 if (next >= 0) switch(op_code)
2389 {
2390 case OP_CHAR:
2391 #ifdef SUPPORT_UTF8
2392 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2393 #else
2394 (void)(utf8_char); /* Keep compiler happy by referencing function argument */
2395 #endif
2396 return item != next;
2397
2398 /* For CHARNC (caseless character) we must check the other case. If we have
2399 Unicode property support, we can use it to test the other case of
2400 high-valued characters. */
2401
2402 case OP_CHARNC:
2403 #ifdef SUPPORT_UTF8
2404 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2405 #endif
2406 if (item == next) return FALSE;
2407 #ifdef SUPPORT_UTF8
2408 if (utf8)
2409 {
2410 unsigned int othercase;
2411 if (next < 128) othercase = cd->fcc[next]; else
2412 #ifdef SUPPORT_UCP
2413 othercase = UCD_OTHERCASE((unsigned int)next);
2414 #else
2415 othercase = NOTACHAR;
2416 #endif
2417 return (unsigned int)item != othercase;
2418 }
2419 else
2420 #endif /* SUPPORT_UTF8 */
2421 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2422
2423 /* For OP_NOT, "item" must be a single-byte character. */
2424
2425 case OP_NOT:
2426 if (item == next) return TRUE;
2427 if ((options & PCRE_CASELESS) == 0) return FALSE;
2428 #ifdef SUPPORT_UTF8
2429 if (utf8)
2430 {
2431 unsigned int othercase;
2432 if (next < 128) othercase = cd->fcc[next]; else
2433 #ifdef SUPPORT_UCP
2434 othercase = UCD_OTHERCASE(next);
2435 #else
2436 othercase = NOTACHAR;
2437 #endif
2438 return (unsigned int)item == othercase;
2439 }
2440 else
2441 #endif /* SUPPORT_UTF8 */
2442 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2443
2444 case OP_DIGIT:
2445 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2446
2447 case OP_NOT_DIGIT:
2448 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2449
2450 case OP_WHITESPACE:
2451 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2452
2453 case OP_NOT_WHITESPACE:
2454 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2455
2456 case OP_WORDCHAR:
2457 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2458
2459 case OP_NOT_WORDCHAR:
2460 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2461
2462 case OP_HSPACE:
2463 case OP_NOT_HSPACE:
2464 switch(next)
2465 {
2466 case 0x09:
2467 case 0x20:
2468 case 0xa0:
2469 case 0x1680:
2470 case 0x180e:
2471 case 0x2000:
2472 case 0x2001:
2473 case 0x2002:
2474 case 0x2003:
2475 case 0x2004:
2476 case 0x2005:
2477 case 0x2006:
2478 case 0x2007:
2479 case 0x2008:
2480 case 0x2009:
2481 case 0x200A:
2482 case 0x202f:
2483 case 0x205f:
2484 case 0x3000:
2485 return op_code != OP_HSPACE;
2486 default:
2487 return op_code == OP_HSPACE;
2488 }
2489
2490 case OP_VSPACE:
2491 case OP_NOT_VSPACE:
2492 switch(next)
2493 {
2494 case 0x0a:
2495 case 0x0b:
2496 case 0x0c:
2497 case 0x0d:
2498 case 0x85:
2499 case 0x2028:
2500 case 0x2029:
2501 return op_code != OP_VSPACE;
2502 default:
2503 return op_code == OP_VSPACE;
2504 }
2505
2506 default:
2507 return FALSE;
2508 }
2509
2510
2511 /* Handle the case when the next item is \d, \s, etc. */
2512
2513 switch(op_code)
2514 {
2515 case OP_CHAR:
2516 case OP_CHARNC:
2517 #ifdef SUPPORT_UTF8
2518 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2519 #endif
2520 switch(-next)
2521 {
2522 case ESC_d:
2523 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2524
2525 case ESC_D:
2526 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2527
2528 case ESC_s:
2529 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2530
2531 case ESC_S:
2532 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2533
2534 case ESC_w:
2535 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2536
2537 case ESC_W:
2538 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2539
2540 case ESC_h:
2541 case ESC_H:
2542 switch(item)
2543 {
2544 case 0x09:
2545 case 0x20:
2546 case 0xa0:
2547 case 0x1680:
2548 case 0x180e:
2549 case 0x2000:
2550 case 0x2001:
2551 case 0x2002:
2552 case 0x2003:
2553 case 0x2004:
2554 case 0x2005:
2555 case 0x2006:
2556 case 0x2007:
2557 case 0x2008:
2558 case 0x2009:
2559 case 0x200A:
2560 case 0x202f:
2561 case 0x205f:
2562 case 0x3000:
2563 return -next != ESC_h;
2564 default:
2565 return -next == ESC_h;
2566 }
2567
2568 case ESC_v:
2569 case ESC_V:
2570 switch(item)
2571 {
2572 case 0x0a:
2573 case 0x0b:
2574 case 0x0c:
2575 case 0x0d:
2576 case 0x85:
2577 case 0x2028:
2578 case 0x2029:
2579 return -next != ESC_v;
2580 default:
2581 return -next == ESC_v;
2582 }
2583
2584 default:
2585 return FALSE;
2586 }
2587
2588 case OP_DIGIT:
2589 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2590 next == -ESC_h || next == -ESC_v;
2591
2592 case OP_NOT_DIGIT:
2593 return next == -ESC_d;
2594
2595 case OP_WHITESPACE:
2596 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2597
2598 case OP_NOT_WHITESPACE:
2599 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2600
2601 case OP_HSPACE:
2602 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2603
2604 case OP_NOT_HSPACE:
2605 return next == -ESC_h;
2606
2607 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2608 case OP_VSPACE:
2609 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2610
2611 case OP_NOT_VSPACE:
2612 return next == -ESC_v;
2613
2614 case OP_WORDCHAR:
2615 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2616
2617 case OP_NOT_WORDCHAR:
2618 return next == -ESC_w || next == -ESC_d;
2619
2620 default:
2621 return FALSE;
2622 }
2623
2624 /* Control does not reach here */
2625 }
2626
2627
2628
2629 /*************************************************
2630 * Compile one branch *
2631 *************************************************/
2632
2633 /* Scan the pattern, compiling it into a vector. If the options are
2634 changed during the branch, the pointer is used to change the external options
2635 bits. This function is used during the pre-compile phase when we are trying
2636 to find out the amount of memory needed, as well as during the real compile
2637 phase. The value of lengthptr distinguishes the two phases.
2638
2639 Arguments:
2640 optionsptr pointer to the option bits
2641 codeptr points to the pointer to the current code point
2642 ptrptr points to the current pattern pointer
2643 errorcodeptr points to error code variable
2644 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2645 reqbyteptr set to the last literal character required, else < 0
2646 bcptr points to current branch chain
2647 cd contains pointers to tables etc.
2648 lengthptr NULL during the real compile phase
2649 points to length accumulator during pre-compile phase
2650
2651 Returns: TRUE on success
2652 FALSE, with *errorcodeptr set non-zero on error
2653 */
2654
2655 static BOOL
2656 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2657 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2658 compile_data *cd, int *lengthptr)
2659 {
2660 int repeat_type, op_type;
2661 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2662 int bravalue = 0;
2663 int greedy_default, greedy_non_default;
2664 int firstbyte, reqbyte;
2665 int zeroreqbyte, zerofirstbyte;
2666 int req_caseopt, reqvary, tempreqvary;
2667 int options = *optionsptr;
2668 int after_manual_callout = 0;
2669 int length_prevgroup = 0;
2670 register int c;
2671 register uschar *code = *codeptr;
2672 uschar *last_code = code;
2673 uschar *orig_code = code;
2674 uschar *tempcode;
2675 BOOL inescq = FALSE;
2676 BOOL groupsetfirstbyte = FALSE;
2677 const uschar *ptr = *ptrptr;
2678 const uschar *tempptr;
2679 uschar *previous = NULL;
2680 uschar *previous_callout = NULL;
2681 uschar *save_hwm = NULL;
2682 uschar classbits[32];
2683
2684 #ifdef SUPPORT_UTF8
2685 BOOL class_utf8;
2686 BOOL utf8 = (options & PCRE_UTF8) != 0;
2687 uschar *class_utf8data;
2688 uschar *class_utf8data_base;
2689 uschar utf8_char[6];
2690 #else
2691 BOOL utf8 = FALSE;
2692 uschar *utf8_char = NULL;
2693 #endif
2694
2695 #ifdef PCRE_DEBUG
2696 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2697 #endif
2698
2699 /* Set up the default and non-default settings for greediness */
2700
2701 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2702 greedy_non_default = greedy_default ^ 1;
2703
2704 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2705 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2706 matches a non-fixed char first char; reqbyte just remains unset if we never
2707 find one.
2708
2709 When we hit a repeat whose minimum is zero, we may have to adjust these values
2710 to take the zero repeat into account. This is implemented by setting them to
2711 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2712 item types that can be repeated set these backoff variables appropriately. */
2713
2714 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2715
2716 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2717 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2718 value > 255. It is added into the firstbyte or reqbyte variables to record the
2719 case status of the value. This is used only for ASCII characters. */
2720
2721 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2722
2723 /* Switch on next character until the end of the branch */
2724
2725 for (;; ptr++)
2726 {
2727 BOOL negate_class;
2728 BOOL should_flip_negation;
2729 BOOL possessive_quantifier;
2730 BOOL is_quantifier;
2731 BOOL is_recurse;
2732 BOOL reset_bracount;
2733 int class_charcount;
2734 int class_lastchar;
2735 int newoptions;
2736 int recno;
2737 int refsign;
2738 int skipbytes;
2739 int subreqbyte;
2740 int subfirstbyte;
2741 int terminator;
2742 int mclength;
2743 uschar mcbuffer[8];
2744
2745 /* Get next byte in the pattern */
2746
2747 c = *ptr;
2748
2749 /* If we are in the pre-compile phase, accumulate the length used for the
2750 previous cycle of this loop. */
2751
2752 if (lengthptr != NULL)
2753 {
2754 #ifdef PCRE_DEBUG
2755 if (code > cd->hwm) cd->hwm = code; /* High water info */
2756 #endif
2757 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2758 {
2759 *errorcodeptr = ERR52;
2760 goto FAILED;
2761 }
2762
2763 /* There is at least one situation where code goes backwards: this is the
2764 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2765 the class is simply eliminated. However, it is created first, so we have to
2766 allow memory for it. Therefore, don't ever reduce the length at this point.
2767 */
2768
2769 if (code < last_code) code = last_code;
2770
2771 /* Paranoid check for integer overflow */
2772
2773 if (OFLOW_MAX - *lengthptr < code - last_code)
2774 {
2775 *errorcodeptr = ERR20;
2776 goto FAILED;
2777 }
2778
2779 *lengthptr += code - last_code;
2780 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2781
2782 /* If "previous" is set and it is not at the start of the work space, move
2783 it back to there, in order to avoid filling up the work space. Otherwise,
2784 if "previous" is NULL, reset the current code pointer to the start. */
2785
2786 if (previous != NULL)
2787 {
2788 if (previous > orig_code)
2789 {
2790 memmove(orig_code, previous, code - previous);
2791 code -= previous - orig_code;
2792 previous = orig_code;
2793 }
2794 }
2795 else code = orig_code;
2796
2797 /* Remember where this code item starts so we can pick up the length
2798 next time round. */
2799
2800 last_code = code;
2801 }
2802
2803 /* In the real compile phase, just check the workspace used by the forward
2804 reference list. */
2805
2806 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2807 {
2808 *errorcodeptr = ERR52;
2809 goto FAILED;
2810 }
2811
2812 /* If in \Q...\E, check for the end; if not, we have a literal */
2813
2814 if (inescq && c != 0)
2815 {
2816 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
2817 {
2818 inescq = FALSE;
2819 ptr++;
2820 continue;
2821 }
2822 else
2823 {
2824 if (previous_callout != NULL)
2825 {
2826 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2827 complete_callout(previous_callout, ptr, cd);
2828 previous_callout = NULL;
2829 }
2830 if ((options & PCRE_AUTO_CALLOUT) != 0)
2831 {
2832 previous_callout = code;
2833 code = auto_callout(code, ptr, cd);
2834 }
2835 goto NORMAL_CHAR;
2836 }
2837 }
2838
2839 /* Fill in length of a previous callout, except when the next thing is
2840 a quantifier. */
2841
2842 is_quantifier =
2843 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2844 (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2845
2846 if (!is_quantifier && previous_callout != NULL &&
2847 after_manual_callout-- <= 0)
2848 {
2849 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2850 complete_callout(previous_callout, ptr, cd);
2851 previous_callout = NULL;
2852 }
2853
2854 /* In extended mode, skip white space and comments */
2855
2856 if ((options & PCRE_EXTENDED) != 0)
2857 {
2858 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2859 if (c == CHAR_NUMBER_SIGN)
2860 {
2861 while (*(++ptr) != 0)
2862 {
2863 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2864 }
2865 if (*ptr != 0) continue;
2866
2867 /* Else fall through to handle end of string */
2868 c = 0;
2869 }
2870 }
2871
2872 /* No auto callout for quantifiers. */
2873
2874 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2875 {
2876 previous_callout = code;
2877 code = auto_callout(code, ptr, cd);
2878 }
2879
2880 switch(c)
2881 {
2882 /* ===================================================================*/
2883 case 0: /* The branch terminates at string end */
2884 case CHAR_VERTICAL_LINE: /* or | or ) */
2885 case CHAR_RIGHT_PARENTHESIS:
2886 *firstbyteptr = firstbyte;
2887 *reqbyteptr = reqbyte;
2888 *codeptr = code;
2889 *ptrptr = ptr;
2890 if (lengthptr != NULL)
2891 {
2892 if (OFLOW_MAX - *lengthptr < code - last_code)
2893 {
2894 *errorcodeptr = ERR20;
2895 goto FAILED;
2896 }
2897 *lengthptr += code - last_code; /* To include callout length */
2898 DPRINTF((">> end branch\n"));
2899 }
2900 return TRUE;
2901
2902
2903 /* ===================================================================*/
2904 /* Handle single-character metacharacters. In multiline mode, ^ disables
2905 the setting of any following char as a first character. */
2906
2907 case CHAR_CIRCUMFLEX_ACCENT:
2908 if ((options & PCRE_MULTILINE) != 0)
2909 {
2910 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2911 }
2912 previous = NULL;
2913 *code++ = OP_CIRC;
2914 break;
2915
2916 case CHAR_DOLLAR_SIGN:
2917 previous = NULL;
2918 *code++ = OP_DOLL;
2919 break;
2920
2921 /* There can never be a first char if '.' is first, whatever happens about
2922 repeats. The value of reqbyte doesn't change either. */
2923
2924 case CHAR_DOT:
2925 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2926 zerofirstbyte = firstbyte;
2927 zeroreqbyte = reqbyte;
2928 previous = code;
2929 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2930 break;
2931
2932
2933 /* ===================================================================*/
2934 /* Character classes. If the included characters are all < 256, we build a
2935 32-byte bitmap of the permitted characters, except in the special case
2936 where there is only one such character. For negated classes, we build the
2937 map as usual, then invert it at the end. However, we use a different opcode
2938 so that data characters > 255 can be handled correctly.
2939
2940 If the class contains characters outside the 0-255 range, a different
2941 opcode is compiled. It may optionally have a bit map for characters < 256,
2942 but those above are explicitly listed afterwards. A flag byte tells
2943 whether the bitmap is present, and whether this is a negated class or not.
2944
2945 In JavaScript compatibility mode, an isolated ']' causes an error. In
2946 default (Perl) mode, it is treated as a data character. */
2947
2948 case CHAR_RIGHT_SQUARE_BRACKET:
2949 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2950 {
2951 *errorcodeptr = ERR64;
2952 goto FAILED;
2953 }
2954 goto NORMAL_CHAR;
2955
2956 case CHAR_LEFT_SQUARE_BRACKET:
2957 previous = code;
2958
2959 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2960 they are encountered at the top level, so we'll do that too. */
2961
2962 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2963 ptr[1] == CHAR_EQUALS_SIGN) &&
2964 check_posix_syntax(ptr, &tempptr))
2965 {
2966 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
2967 goto FAILED;
2968 }
2969
2970 /* If the first character is '^', set the negation flag and skip it. Also,
2971 if the first few characters (either before or after ^) are \Q\E or \E we
2972 skip them too. This makes for compatibility with Perl. */
2973
2974 negate_class = FALSE;
2975 for (;;)
2976 {
2977 c = *(++ptr);
2978 if (c == CHAR_BACKSLASH)
2979 {
2980 if (ptr[1] == CHAR_E)
2981 ptr++;
2982 else if (strncmp((const char *)ptr+1,
2983 STR_Q STR_BACKSLASH STR_E, 3) == 0)
2984 ptr += 3;
2985 else
2986 break;
2987 }
2988 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
2989 negate_class = TRUE;
2990 else break;
2991 }
2992
2993 /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2994 an initial ']' is taken as a data character -- the code below handles
2995 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2996 [^] must match any character, so generate OP_ALLANY. */
2997
2998 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
2999 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3000 {
3001 *code++ = negate_class? OP_ALLANY : OP_FAIL;
3002 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3003 zerofirstbyte = firstbyte;
3004 break;
3005 }
3006
3007 /* If a class contains a negative special such as \S, we need to flip the
3008 negation flag at the end, so that support for characters > 255 works
3009 correctly (they are all included in the class). */
3010
3011 should_flip_negation = FALSE;
3012
3013 /* Keep a count of chars with values < 256 so that we can optimize the case
3014 of just a single character (as long as it's < 256). However, For higher
3015 valued UTF-8 characters, we don't yet do any optimization. */
3016
3017 class_charcount = 0;
3018 class_lastchar = -1;
3019
3020 /* Initialize the 32-char bit map to all zeros. We build the map in a
3021 temporary bit of memory, in case the class contains only 1 character (less
3022 than 256), because in that case the compiled code doesn't use the bit map.
3023 */
3024
3025 memset(classbits, 0, 32 * sizeof(uschar));
3026
3027 #ifdef SUPPORT_UTF8
3028 class_utf8 = FALSE; /* No chars >= 256 */
3029 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
3030 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
3031 #endif
3032
3033 /* Process characters until ] is reached. By writing this as a "do" it
3034 means that an initial ] is taken as a data character. At the start of the
3035 loop, c contains the first byte of the character. */
3036
3037 if (c != 0) do
3038 {
3039 const uschar *oldptr;
3040
3041 #ifdef SUPPORT_UTF8
3042 if (utf8 && c > 127)
3043 { /* Braces are required because the */
3044 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3045 }
3046
3047 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3048 data and reset the pointer. This is so that very large classes that
3049 contain a zillion UTF-8 characters no longer overwrite the work space
3050 (which is on the stack). */
3051
3052 if (lengthptr != NULL)
3053 {
3054 *lengthptr += class_utf8data - class_utf8data_base;
3055 class_utf8data = class_utf8data_base;
3056 }
3057
3058 #endif
3059
3060 /* Inside \Q...\E everything is literal except \E */
3061
3062 if (inescq)
3063 {
3064 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3065 {
3066 inescq = FALSE; /* Reset literal state */
3067 ptr++; /* Skip the 'E' */
3068 continue; /* Carry on with next */
3069 }
3070 goto CHECK_RANGE; /* Could be range if \E follows */
3071 }
3072
3073 /* Handle POSIX class names. Perl allows a negation extension of the
3074 form [:^name:]. A square bracket that doesn't match the syntax is
3075 treated as a literal. We also recognize the POSIX constructions
3076 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3077 5.6 and 5.8 do. */
3078
3079 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3080 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3081 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3082 {
3083 BOOL local_negate = FALSE;
3084 int posix_class, taboffset, tabopt;
3085 register const uschar *cbits = cd->cbits;
3086 uschar pbits[32];
3087
3088 if (ptr[1] != CHAR_COLON)
3089 {
3090 *errorcodeptr = ERR31;
3091 goto FAILED;
3092 }
3093
3094 ptr += 2;
3095 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3096 {
3097 local_negate = TRUE;
3098 should_flip_negation = TRUE; /* Note negative special */
3099 ptr++;
3100 }
3101
3102 posix_class = check_posix_name(ptr, tempptr - ptr);
3103 if (posix_class < 0)
3104 {
3105 *errorcodeptr = ERR30;
3106 goto FAILED;
3107 }
3108
3109 /* If matching is caseless, upper and lower are converted to
3110 alpha. This relies on the fact that the class table starts with
3111 alpha, lower, upper as the first 3 entries. */
3112
3113 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3114 posix_class = 0;
3115
3116 /* We build the bit map for the POSIX class in a chunk of local store
3117 because we may be adding and subtracting from it, and we don't want to
3118 subtract bits that may be in the main map already. At the end we or the
3119 result into the bit map that is being built. */
3120
3121 posix_class *= 3;
3122
3123 /* Copy in the first table (always present) */
3124
3125 memcpy(pbits, cbits + posix_class_maps[posix_class],
3126 32 * sizeof(uschar));
3127
3128 /* If there is a second table, add or remove it as required. */
3129
3130 taboffset = posix_class_maps[posix_class + 1];
3131 tabopt = posix_class_maps[posix_class + 2];
3132
3133 if (taboffset >= 0)
3134 {
3135 if (tabopt >= 0)
3136 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3137 else
3138 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3139 }
3140
3141 /* Now see if we need to remove any special characters. An option
3142 value of 1 removes vertical space and 2 removes underscore. */
3143
3144 if (tabopt < 0) tabopt = -tabopt;
3145 if (tabopt == 1) pbits[1] &= ~0x3c;
3146 else if (tabopt == 2) pbits[11] &= 0x7f;
3147
3148 /* Add the POSIX table or its complement into the main table that is
3149 being built and we are done. */
3150
3151 if (local_negate)
3152 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3153 else
3154 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3155
3156 ptr = tempptr + 1;
3157 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3158 continue; /* End of POSIX syntax handling */
3159 }
3160
3161 /* Backslash may introduce a single character, or it may introduce one
3162 of the specials, which just set a flag. The sequence \b is a special
3163 case. Inside a class (and only there) it is treated as backspace.
3164 Elsewhere it marks a word boundary. Other escapes have preset maps ready
3165 to 'or' into the one we are building. We assume they have more than one
3166 character in them, so set class_charcount bigger than one. */
3167
3168 if (c == CHAR_BACKSLASH)
3169 {
3170 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3171 if (*errorcodeptr != 0) goto FAILED;
3172
3173 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3174 else if (-c == ESC_X) c = CHAR_X; /* \X is literal X in a class */
3175 else if (-c == ESC_R) c = CHAR_R; /* \R is literal R in a class */
3176 else if (-c == ESC_Q) /* Handle start of quoted string */
3177 {
3178 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3179 {
3180 ptr += 2; /* avoid empty string */
3181 }
3182 else inescq = TRUE;
3183 continue;
3184 }
3185 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3186
3187 if (c < 0)
3188 {
3189 register const uschar *cbits = cd->cbits;
3190 class_charcount += 2; /* Greater than 1 is what matters */
3191
3192 /* Save time by not doing this in the pre-compile phase. */
3193
3194 if (lengthptr == NULL) switch (-c)
3195 {
3196 case ESC_d:
3197 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3198 continue;
3199
3200 case ESC_D:
3201 should_flip_negation = TRUE;
3202 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3203 continue;
3204
3205 case ESC_w:
3206 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3207 continue;
3208
3209 case ESC_W:
3210 should_flip_negation = TRUE;
3211 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3212 continue;
3213
3214 case ESC_s:
3215 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3216 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
3217 continue;
3218
3219 case ESC_S:
3220 should_flip_negation = TRUE;
3221 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3222 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3223 continue;
3224
3225 default: /* Not recognized; fall through */
3226 break; /* Need "default" setting to stop compiler warning. */
3227 }
3228
3229 /* In the pre-compile phase, just do the recognition. */
3230
3231 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
3232 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
3233
3234 /* We need to deal with \H, \h, \V, and \v in both phases because
3235 they use extra memory. */
3236
3237 if (-c == ESC_h)
3238 {
3239 SETBIT(classbits, 0x09); /* VT */
3240 SETBIT(classbits, 0x20); /* SPACE */
3241 SETBIT(classbits, 0xa0); /* NSBP */
3242 #ifdef SUPPORT_UTF8
3243 if (utf8)
3244 {
3245 class_utf8 = TRUE;
3246 *class_utf8data++ = XCL_SINGLE;
3247 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3248 *class_utf8data++ = XCL_SINGLE;
3249 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3250 *class_utf8data++ = XCL_RANGE;
3251 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3252 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3253 *class_utf8data++ = XCL_SINGLE;
3254 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3255 *class_utf8data++ = XCL_SINGLE;
3256 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3257 *class_utf8data++ = XCL_SINGLE;
3258 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3259 }
3260 #endif
3261 continue;
3262 }
3263
3264 if (-c == ESC_H)
3265 {
3266 for (c = 0; c < 32; c++)
3267 {
3268 int x = 0xff;
3269 switch (c)
3270 {
3271 case 0x09/8: x ^= 1 << (0x09%8); break;
3272 case 0x20/8: x ^= 1 << (0x20%8); break;
3273 case 0xa0/8: x ^= 1 << (0xa0%8); break;
3274 default: break;
3275 }
3276 classbits[c] |= x;
3277 }
3278
3279 #ifdef SUPPORT_UTF8
3280 if (utf8)
3281 {
3282 class_utf8 = TRUE;
3283 *class_utf8data++ = XCL_RANGE;
3284 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3285 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3286 *class_utf8data++ = XCL_RANGE;
3287 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3288 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3289 *class_utf8data++ = XCL_RANGE;
3290 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3291 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3292 *class_utf8data++ = XCL_RANGE;
3293 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3294 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3295 *class_utf8data++ = XCL_RANGE;
3296 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3297 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3298 *class_utf8data++ = XCL_RANGE;
3299 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3300 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3301 *class_utf8data++ = XCL_RANGE;
3302 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3303 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3304 }
3305 #endif
3306 continue;
3307 }
3308
3309 if (-c == ESC_v)
3310 {
3311 SETBIT(classbits, 0x0a); /* LF */
3312 SETBIT(classbits, 0x0b); /* VT */
3313 SETBIT(classbits, 0x0c); /* FF */
3314 SETBIT(classbits, 0x0d); /* CR */
3315 SETBIT(classbits, 0x85); /* NEL */
3316 #ifdef SUPPORT_UTF8
3317 if (utf8)
3318 {
3319 class_utf8 = TRUE;
3320 *class_utf8data++ = XCL_RANGE;
3321 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3322 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3323 }
3324 #endif
3325 continue;
3326 }
3327
3328 if (-c == ESC_V)
3329 {
3330 for (c = 0; c < 32; c++)
3331 {
3332 int x = 0xff;
3333 switch (c)
3334 {
3335 case 0x0a/8: x ^= 1 << (0x0a%8);
3336 x ^= 1 << (0x0b%8);
3337 x ^= 1 << (0x0c%8);
3338 x ^= 1 << (0x0d%8);
3339 break;
3340 case 0x85/8: x ^= 1 << (0x85%8); break;
3341 default: break;
3342 }
3343 classbits[c] |= x;
3344 }
3345
3346 #ifdef SUPPORT_UTF8
3347 if (utf8)
3348 {
3349 class_utf8 = TRUE;
3350 *class_utf8data++ = XCL_RANGE;
3351 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3352 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3353 *class_utf8data++ = XCL_RANGE;
3354 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3355 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3356 }
3357 #endif
3358 continue;
3359 }
3360
3361 /* We need to deal with \P and \p in both phases. */
3362
3363 #ifdef SUPPORT_UCP
3364 if (-c == ESC_p || -c == ESC_P)
3365 {
3366 BOOL negated;
3367 int pdata;
3368 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3369 if (ptype < 0) goto FAILED;
3370 class_utf8 = TRUE;
3371 *class_utf8data++ = ((-c == ESC_p) != negated)?
3372 XCL_PROP : XCL_NOTPROP;
3373 *class_utf8data++ = ptype;
3374 *class_utf8data++ = pdata;
3375 class_charcount -= 2; /* Not a < 256 character */
3376 continue;
3377 }
3378 #endif
3379 /* Unrecognized escapes are faulted if PCRE is running in its
3380 strict mode. By default, for compatibility with Perl, they are
3381 treated as literals. */
3382
3383 if ((options & PCRE_EXTRA) != 0)
3384 {
3385 *errorcodeptr = ERR7;
3386 goto FAILED;
3387 }
3388
3389 class_charcount -= 2; /* Undo the default count from above */
3390 c = *ptr; /* Get the final character and fall through */
3391 }
3392
3393 /* Fall through if we have a single character (c >= 0). This may be
3394 greater than 256 in UTF-8 mode. */
3395
3396 } /* End of backslash handling */
3397
3398 /* A single character may be followed by '-' to form a range. However,
3399 Perl does not permit ']' to be the end of the range. A '-' character
3400 at the end is treated as a literal. Perl ignores orphaned \E sequences
3401 entirely. The code for handling \Q and \E is messy. */
3402
3403 CHECK_RANGE:
3404 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3405 {
3406 inescq = FALSE;
3407 ptr += 2;
3408 }
3409
3410 oldptr = ptr;
3411
3412 /* Remember \r or \n */
3413
3414 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3415
3416 /* Check for range */
3417
3418 if (!inescq && ptr[1] == CHAR_MINUS)
3419 {
3420 int d;
3421 ptr += 2;
3422 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3423
3424 /* If we hit \Q (not followed by \E) at this point, go into escaped
3425 mode. */
3426
3427 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3428 {
3429 ptr += 2;
3430 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3431 { ptr += 2; continue; }
3432 inescq = TRUE;
3433 break;
3434 }
3435
3436 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3437 {
3438 ptr = oldptr;
3439 goto LONE_SINGLE_CHARACTER;
3440 }
3441
3442 #ifdef SUPPORT_UTF8
3443 if (utf8)
3444 { /* Braces are required because the */
3445 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3446 }
3447 else
3448 #endif
3449 d = *ptr; /* Not UTF-8 mode */
3450
3451 /* The second part of a range can be a single-character escape, but
3452 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3453 in such circumstances. */
3454
3455 if (!inescq && d == CHAR_BACKSLASH)
3456 {
3457 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3458 if (*errorcodeptr != 0) goto FAILED;
3459
3460 /* \b is backspace; \X is literal X; \R is literal R; any other
3461 special means the '-' was literal */
3462
3463 if (d < 0)
3464 {
3465 if (d == -ESC_b) d = CHAR_BS;
3466 else if (d == -ESC_X) d = CHAR_X;
3467 else if (d == -ESC_R) d = CHAR_R; else
3468 {
3469 ptr = oldptr;
3470 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3471 }
3472 }
3473 }
3474
3475 /* Check that the two values are in the correct order. Optimize
3476 one-character ranges */
3477
3478 if (d < c)
3479 {
3480 *errorcodeptr = ERR8;
3481 goto FAILED;
3482 }
3483
3484 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3485
3486 /* Remember \r or \n */
3487
3488 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3489
3490 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3491 matching, we have to use an XCLASS with extra data items. Caseless
3492 matching for characters > 127 is available only if UCP support is
3493 available. */
3494
3495 #ifdef SUPPORT_UTF8
3496 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3497 {
3498 class_utf8 = TRUE;
3499
3500 /* With UCP support, we can find the other case equivalents of
3501 the relevant characters. There may be several ranges. Optimize how
3502 they fit with the basic range. */
3503
3504 #ifdef SUPPORT_UCP
3505 if ((options & PCRE_CASELESS) != 0)
3506 {
3507 unsigned int occ, ocd;
3508 unsigned int cc = c;
3509 unsigned int origd = d;
3510 while (get_othercase_range(&cc, origd, &occ, &ocd))
3511 {
3512 if (occ >= (unsigned int)c &&
3513 ocd <= (unsigned int)d)
3514 continue; /* Skip embedded ranges */
3515
3516 if (occ < (unsigned int)c &&
3517 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3518 { /* if there is overlap, */
3519 c = occ; /* noting that if occ < c */
3520 continue; /* we can't have ocd > d */
3521 } /* because a subrange is */
3522 if (ocd > (unsigned int)d &&
3523 occ <= (unsigned int)d + 1) /* always shorter than */
3524 { /* the basic range. */
3525 d = ocd;
3526 continue;
3527 }
3528
3529 if (occ == ocd)
3530 {
3531 *class_utf8data++ = XCL_SINGLE;
3532 }
3533 else
3534 {
3535 *class_utf8data++ = XCL_RANGE;
3536 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3537 }
3538 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3539 }
3540 }
3541 #endif /* SUPPORT_UCP */
3542
3543 /* Now record the original range, possibly modified for UCP caseless
3544 overlapping ranges. */
3545
3546 *class_utf8data++ = XCL_RANGE;
3547 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3548 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3549
3550 /* With UCP support, we are done. Without UCP support, there is no
3551 caseless matching for UTF-8 characters > 127; we can use the bit map
3552 for the smaller ones. */
3553
3554 #ifdef SUPPORT_UCP
3555 continue; /* With next character in the class */
3556 #else
3557 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3558
3559 /* Adjust upper limit and fall through to set up the map */
3560
3561 d = 127;
3562
3563 #endif /* SUPPORT_UCP */
3564 }
3565 #endif /* SUPPORT_UTF8 */
3566
3567 /* We use the bit map for all cases when not in UTF-8 mode; else
3568 ranges that lie entirely within 0-127 when there is UCP support; else
3569 for partial ranges without UCP support. */
3570
3571 class_charcount += d - c + 1;
3572 class_lastchar = d;
3573
3574 /* We can save a bit of time by skipping this in the pre-compile. */
3575
3576 if (lengthptr == NULL) for (; c <= d; c++)
3577 {
3578 classbits[c/8] |= (1 << (c&7));
3579 if ((options & PCRE_CASELESS) != 0)
3580 {
3581 int uc = cd->fcc[c]; /* flip case */
3582 classbits[uc/8] |= (1 << (uc&7));
3583 }
3584 }
3585
3586 continue; /* Go get the next char in the class */
3587 }
3588
3589 /* Handle a lone single character - we can get here for a normal
3590 non-escape char, or after \ that introduces a single character or for an
3591 apparent range that isn't. */
3592
3593 LONE_SINGLE_CHARACTER:
3594
3595 /* Handle a character that cannot go in the bit map */
3596
3597 #ifdef SUPPORT_UTF8
3598 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3599 {
3600 class_utf8 = TRUE;
3601 *class_utf8data++ = XCL_SINGLE;
3602 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3603
3604 #ifdef SUPPORT_UCP
3605 if ((options & PCRE_CASELESS) != 0)
3606 {
3607 unsigned int othercase;
3608 if ((othercase = UCD_OTHERCASE(c)) != c)
3609 {
3610 *class_utf8data++ = XCL_SINGLE;
3611 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3612 }
3613 }
3614 #endif /* SUPPORT_UCP */
3615
3616 }
3617 else
3618 #endif /* SUPPORT_UTF8 */
3619
3620 /* Handle a single-byte character */
3621 {
3622 classbits[c/8] |= (1 << (c&7));
3623 if ((options & PCRE_CASELESS) != 0)
3624 {
3625 c = cd->fcc[c]; /* flip case */
3626 classbits[c/8] |= (1 << (c&7));
3627 }
3628 class_charcount++;
3629 class_lastchar = c;
3630 }
3631 }
3632
3633 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3634
3635 while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3636
3637 if (c == 0) /* Missing terminating ']' */
3638 {
3639 *errorcodeptr = ERR6;
3640 goto FAILED;
3641 }
3642
3643
3644 /* This code has been disabled because it would mean that \s counts as
3645 an explicit \r or \n reference, and that's not really what is wanted. Now
3646 we set the flag only if there is a literal "\r" or "\n" in the class. */
3647
3648 #if 0
3649 /* Remember whether \r or \n are in this class */
3650
3651 if (negate_class)
3652 {
3653 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3654 }
3655 else
3656 {
3657 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3658 }
3659 #endif
3660
3661
3662 /* If class_charcount is 1, we saw precisely one character whose value is
3663 less than 256. As long as there were no characters >= 128 and there was no
3664 use of \p or \P, in other words, no use of any XCLASS features, we can
3665 optimize.
3666
3667 In UTF-8 mode, we can optimize the negative case only if there were no
3668 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3669 operate on single-bytes only. This is an historical hangover. Maybe one day
3670 we can tidy these opcodes to handle multi-byte characters.
3671
3672 The optimization throws away the bit map. We turn the item into a
3673 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3674 that OP_NOT does not support multibyte characters. In the positive case, it
3675 can cause firstbyte to be set. Otherwise, there can be no first char if
3676 this item is first, whatever repeat count may follow. In the case of
3677 reqbyte, save the previous value for reinstating. */
3678
3679 #ifdef SUPPORT_UTF8
3680 if (class_charcount == 1 && !class_utf8 &&
3681 (!utf8 || !negate_class || class_lastchar < 128))
3682 #else
3683 if (class_charcount == 1)
3684 #endif
3685 {
3686 zeroreqbyte = reqbyte;
3687
3688 /* The OP_NOT opcode works on one-byte characters only. */
3689
3690 if (negate_class)
3691 {
3692 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3693 zerofirstbyte = firstbyte;
3694 *code++ = OP_NOT;
3695 *code++ = class_lastchar;
3696 break;
3697 }
3698
3699 /* For a single, positive character, get the value into mcbuffer, and
3700 then we can handle this with the normal one-character code. */
3701
3702 #ifdef SUPPORT_UTF8
3703 if (utf8 && class_lastchar > 127)
3704 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3705 else
3706 #endif
3707 {
3708 mcbuffer[0] = class_lastchar;
3709 mclength = 1;
3710 }
3711 goto ONE_CHAR;
3712 } /* End of 1-char optimization */
3713
3714 /* The general case - not the one-char optimization. If this is the first
3715 thing in the branch, there can be no first char setting, whatever the
3716 repeat count. Any reqbyte setting must remain unchanged after any kind of
3717 repeat. */
3718
3719 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3720 zerofirstbyte = firstbyte;
3721 zeroreqbyte = reqbyte;
3722
3723 /* If there are characters with values > 255, we have to compile an
3724 extended class, with its own opcode, unless there was a negated special
3725 such as \S in the class, because in that case all characters > 255 are in
3726 the class, so any that were explicitly given as well can be ignored. If
3727 (when there are explicit characters > 255 that must be listed) there are no
3728 characters < 256, we can omit the bitmap in the actual compiled code. */
3729
3730 #ifdef SUPPORT_UTF8
3731 if (class_utf8 && !should_flip_negation)
3732 {
3733 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3734 *code++ = OP_XCLASS;
3735 code += LINK_SIZE;
3736 *code = negate_class? XCL_NOT : 0;
3737
3738 /* If the map is required, move up the extra data to make room for it;
3739 otherwise just move the code pointer to the end of the extra data. */
3740
3741 if (class_charcount > 0)
3742 {
3743 *code++ |= XCL_MAP;
3744 memmove(code + 32, code, class_utf8data - code);
3745 memcpy(code, classbits, 32);
3746 code = class_utf8data + 32;
3747 }
3748 else code = class_utf8data;
3749
3750 /* Now fill in the complete length of the item */
3751
3752 PUT(previous, 1, code - previous);
3753 break; /* End of class handling */
3754 }
3755 #endif
3756
3757 /* If there are no characters > 255, set the opcode to OP_CLASS or
3758 OP_NCLASS, depending on whether the whole class was negated and whether
3759 there were negative specials such as \S in the class. Then copy the 32-byte
3760 map into the code vector, negating it if necessary. */
3761
3762 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3763 if (negate_class)
3764 {
3765 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3766 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3767 }
3768 else
3769 {
3770 memcpy(code, classbits, 32);
3771 }
3772 code += 32;
3773 break;
3774
3775
3776 /* ===================================================================*/
3777 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3778 has been tested above. */
3779
3780 case CHAR_LEFT_CURLY_BRACKET:
3781 if (!is_quantifier) goto NORMAL_CHAR;
3782 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3783 if (*errorcodeptr != 0) goto FAILED;
3784 goto REPEAT;
3785
3786 case CHAR_ASTERISK:
3787 repeat_min = 0;
3788 repeat_max = -1;
3789 goto REPEAT;
3790
3791 case CHAR_PLUS:
3792 repeat_min = 1;
3793 repeat_max = -1;
3794 goto REPEAT;
3795
3796 case CHAR_QUESTION_MARK:
3797 repeat_min = 0;
3798 repeat_max = 1;
3799
3800 REPEAT:
3801 if (previous == NULL)
3802 {
3803 *errorcodeptr = ERR9;
3804 goto FAILED;
3805 }
3806
3807 if (repeat_min == 0)
3808 {
3809 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3810 reqbyte = zeroreqbyte; /* Ditto */
3811 }
3812
3813 /* Remember whether this is a variable length repeat */
3814
3815 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3816
3817 op_type = 0; /* Default single-char op codes */
3818 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3819
3820 /* Save start of previous item, in case we have to move it up to make space
3821 for an inserted OP_ONCE for the additional '+' extension. */
3822
3823 tempcode = previous;
3824
3825 /* If the next character is '+', we have a possessive quantifier. This
3826 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3827 If the next character is '?' this is a minimizing repeat, by default,
3828 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3829 repeat type to the non-default. */
3830
3831 if (ptr[1] == CHAR_PLUS)
3832 {
3833 repeat_type = 0; /* Force greedy */
3834 possessive_quantifier = TRUE;
3835 ptr++;
3836 }
3837 else if (ptr[1] == CHAR_QUESTION_MARK)
3838 {
3839 repeat_type = greedy_non_default;
3840 ptr++;
3841 }
3842 else repeat_type = greedy_default;
3843
3844 /* If previous was a character match, abolish the item and generate a
3845 repeat item instead. If a char item has a minimum of more than one, ensure
3846 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3847 the first thing in a branch because the x will have gone into firstbyte
3848 instead. */
3849
3850 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3851 {
3852 /* Deal with UTF-8 characters that take up more than one byte. It's
3853 easier to write this out separately than try to macrify it. Use c to
3854 hold the length of the character in bytes, plus 0x80 to flag that it's a
3855 length rather than a small character. */
3856
3857 #ifdef SUPPORT_UTF8
3858 if (utf8 && (code[-1] & 0x80) != 0)
3859 {
3860 uschar *lastchar = code - 1;
3861 while((*lastchar & 0xc0) == 0x80) lastchar--;
3862 c = code - lastchar; /* Length of UTF-8 character */
3863 memcpy(utf8_char, lastchar, c); /* Save the char */
3864 c |= 0x80; /* Flag c as a length */
3865 }
3866 else
3867 #endif
3868
3869 /* Handle the case of a single byte - either with no UTF8 support, or
3870 with UTF-8 disabled, or for a UTF-8 character < 128. */
3871
3872 {
3873 c = code[-1];
3874 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3875 }
3876
3877 /* If the repetition is unlimited, it pays to see if the next thing on
3878 the line is something that cannot possibly match this character. If so,
3879 automatically possessifying this item gains some performance in the case
3880 where the match fails. */
3881
3882 if (!possessive_quantifier &&
3883 repeat_max < 0 &&
3884 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3885 options, cd))
3886 {
3887 repeat_type = 0; /* Force greedy */
3888 possessive_quantifier = TRUE;
3889 }
3890
3891 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3892 }
3893
3894 /* If previous was a single negated character ([^a] or similar), we use
3895 one of the special opcodes, replacing it. The code is shared with single-
3896 character repeats by setting op_type to add a suitable offset into
3897 repeat_type. We can also test for auto-possessification. OP_NOT is
3898 currently used only for single-byte chars. */
3899
3900 else if (*previous == OP_NOT)
3901 {
3902 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3903 c = previous[1];
3904 if (!possessive_quantifier &&
3905 repeat_max < 0 &&
3906 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3907 {
3908 repeat_type = 0; /* Force greedy */
3909 possessive_quantifier = TRUE;
3910 }
3911 goto OUTPUT_SINGLE_REPEAT;
3912 }
3913
3914 /* If previous was a character type match (\d or similar), abolish it and
3915 create a suitable repeat item. The code is shared with single-character
3916 repeats by setting op_type to add a suitable offset into repeat_type. Note
3917 that the Unicode property types will be present only when SUPPORT_UCP is
3918 defined, but we don't wrap the little bits of code here because it just
3919 makes it horribly messy. */
3920
3921 else if (*previous < OP_EODN)
3922 {
3923 uschar *oldcode;
3924 int prop_type, prop_value;
3925 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3926 c = *previous;
3927
3928 if (!possessive_quantifier &&
3929 repeat_max < 0 &&
3930 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3931 {
3932 repeat_type = 0; /* Force greedy */
3933 possessive_quantifier = TRUE;
3934 }
3935
3936 OUTPUT_SINGLE_REPEAT:
3937 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3938 {
3939 prop_type = previous[1];
3940 prop_value = previous[2];
3941 }
3942 else prop_type = prop_value = -1;
3943
3944 oldcode = code;
3945 code = previous; /* Usually overwrite previous item */
3946
3947 /* If the maximum is zero then the minimum must also be zero; Perl allows
3948 this case, so we do too - by simply omitting the item altogether. */
3949
3950 if (repeat_max == 0) goto END_REPEAT;
3951
3952 /*--------------------------------------------------------------------*/
3953 /* This code is obsolete from release 8.00; the restriction was finally
3954 removed: */
3955
3956 /* All real repeats make it impossible to handle partial matching (maybe
3957 one day we will be able to remove this restriction). */
3958
3959 /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
3960 /*--------------------------------------------------------------------*/
3961
3962 /* Combine the op_type with the repeat_type */
3963
3964 repeat_type += op_type;
3965
3966 /* A minimum of zero is handled either as the special case * or ?, or as
3967 an UPTO, with the maximum given. */
3968
3969 if (repeat_min == 0)
3970 {
3971 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3972 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3973 else
3974 {
3975 *code++ = OP_UPTO + repeat_type;
3976 PUT2INC(code, 0, repeat_max);
3977 }
3978 }
3979
3980 /* A repeat minimum of 1 is optimized into some special cases. If the
3981 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3982 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3983 one less than the maximum. */
3984
3985 else if (repeat_min == 1)
3986 {
3987 if (repeat_max == -1)
3988 *code++ = OP_PLUS + repeat_type;
3989 else
3990 {
3991 code = oldcode; /* leave previous item in place */
3992 if (repeat_max == 1) goto END_REPEAT;
3993 *code++ = OP_UPTO + repeat_type;
3994 PUT2INC(code, 0, repeat_max - 1);
3995 }
3996 }
3997
3998 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3999 handled as an EXACT followed by an UPTO. */
4000
4001 else
4002 {
4003 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
4004 PUT2INC(code, 0, repeat_min);
4005
4006 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
4007 we have to insert the character for the previous code. For a repeated
4008 Unicode property match, there are two extra bytes that define the
4009 required property. In UTF-8 mode, long characters have their length in
4010 c, with the 0x80 bit as a flag. */
4011
4012 if (repeat_max < 0)
4013 {
4014 #ifdef SUPPORT_UTF8
4015 if (utf8 && c >= 128)
4016 {
4017 memcpy(code, utf8_char, c & 7);
4018 code += c & 7;
4019 }
4020 else
4021 #endif
4022 {
4023 *code++ = c;
4024 if (prop_type >= 0)
4025 {
4026 *code++ = prop_type;
4027 *code++ = prop_value;
4028 }
4029 }
4030 *code++ = OP_STAR + repeat_type;
4031 }
4032
4033 /* Else insert an UPTO if the max is greater than the min, again
4034 preceded by the character, for the previously inserted code. If the
4035 UPTO is just for 1 instance, we can use QUERY instead. */
4036
4037 else if (repeat_max != repeat_min)
4038 {
4039 #ifdef SUPPORT_UTF8
4040 if (utf8 && c >= 128)
4041 {
4042 memcpy(code, utf8_char, c & 7);
4043 code += c & 7;
4044 }
4045 else
4046 #endif
4047 *code++ = c;
4048 if (prop_type >= 0)
4049 {
4050 *code++ = prop_type;
4051 *code++ = prop_value;
4052 }
4053 repeat_max -= repeat_min;
4054
4055 if (repeat_max == 1)
4056 {
4057 *code++ = OP_QUERY + repeat_type;
4058 }
4059 else
4060 {
4061 *code++ = OP_UPTO + repeat_type;
4062 PUT2INC(code, 0, repeat_max);
4063 }
4064 }
4065 }
4066
4067 /* The character or character type itself comes last in all cases. */
4068
4069 #ifdef SUPPORT_UTF8
4070 if (utf8 && c >= 128)
4071 {
4072 memcpy(code, utf8_char, c & 7);
4073 code += c & 7;
4074 }
4075 else
4076 #endif
4077 *code++ = c;
4078
4079 /* For a repeated Unicode property match, there are two extra bytes that
4080 define the required property. */
4081
4082 #ifdef SUPPORT_UCP
4083 if (prop_type >= 0)
4084 {
4085 *code++ = prop_type;
4086 *code++ = prop_value;
4087 }
4088 #endif
4089 }
4090
4091 /* If previous was a character class or a back reference, we put the repeat
4092 stuff after it, but just skip the item if the repeat was {0,0}. */
4093
4094 else if (*previous == OP_CLASS ||
4095 *previous == OP_NCLASS ||
4096 #ifdef SUPPORT_UTF8
4097 *previous == OP_XCLASS ||
4098 #endif
4099 *previous == OP_REF)
4100 {
4101 if (repeat_max == 0)
4102 {
4103 code = previous;
4104 goto END_REPEAT;
4105 }
4106
4107 /*--------------------------------------------------------------------*/
4108 /* This code is obsolete from release 8.00; the restriction was finally
4109 removed: */
4110
4111 /* All real repeats make it impossible to handle partial matching (maybe
4112 one day we will be able to remove this restriction). */
4113
4114 /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4115 /*--------------------------------------------------------------------*/
4116
4117 if (repeat_min == 0 && repeat_max == -1)
4118 *code++ = OP_CRSTAR + repeat_type;
4119 else if (repeat_min == 1 && repeat_max == -1)
4120 *code++ = OP_CRPLUS + repeat_type;
4121 else if (repeat_min == 0 && repeat_max == 1)
4122 *code++ = OP_CRQUERY + repeat_type;
4123 else
4124 {
4125 *code++ = OP_CRRANGE + repeat_type;
4126 PUT2INC(code, 0, repeat_min);
4127 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
4128 PUT2INC(code, 0, repeat_max);
4129 }
4130 }
4131
4132 /* If previous was a bracket group, we may have to replicate it in certain
4133 cases. */
4134
4135 else if (*previous == OP_BRA || *previous == OP_CBRA ||
4136 *previous == OP_ONCE || *previous == OP_COND)
4137 {
4138 register int i;
4139 int ketoffset = 0;
4140 int len = code - previous;
4141 uschar *bralink = NULL;
4142
4143 /* Repeating a DEFINE group is pointless */
4144
4145 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
4146 {
4147 *errorcodeptr = ERR55;
4148 goto FAILED;
4149 }
4150
4151 /* If the maximum repeat count is unlimited, find the end of the bracket
4152 by scanning through from the start, and compute the offset back to it
4153 from the current code pointer. There may be an OP_OPT setting following
4154 the final KET, so we can't find the end just by going back from the code
4155 pointer. */
4156
4157 if (repeat_max == -1)
4158 {
4159 register uschar *ket = previous;
4160 do ket += GET(ket, 1); while (*ket != OP_KET);
4161 ketoffset = code - ket;
4162 }
4163
4164 /* The case of a zero minimum is special because of the need to stick
4165 OP_BRAZERO in front of it, and because the group appears once in the
4166 data, whereas in other cases it appears the minimum number of times. For
4167 this reason, it is simplest to treat this case separately, as otherwise
4168 the code gets far too messy. There are several special subcases when the
4169 minimum is zero. */
4170
4171 if (repeat_min == 0)
4172 {
4173 /* If the maximum is also zero, we used to just omit the group from the
4174 output altogether, like this:
4175
4176 ** if (repeat_max == 0)
4177 ** {
4178 ** code = previous;
4179 ** goto END_REPEAT;
4180 ** }
4181
4182 However, that fails when a group is referenced as a subroutine from
4183 elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
4184 so that it is skipped on execution. As we don't have a list of which
4185 groups are referenced, we cannot do this selectively.
4186
4187 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
4188 and do no more at this point. However, we do need to adjust any
4189 OP_RECURSE calls inside the group that refer to the group itself or any
4190 internal or forward referenced group, because the offset is from the
4191 start of the whole regex. Temporarily terminate the pattern while doing
4192 this. */
4193
4194 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
4195 {
4196 *code = OP_END;
4197 adjust_recurse(previous, 1, utf8, cd, save_hwm);
4198 memmove(previous+1, previous, len);
4199 code++;
4200 if (repeat_max == 0)
4201 {
4202 *previous++ = OP_SKIPZERO;
4203 goto END_REPEAT;
4204 }
4205 *previous++ = OP_BRAZERO + repeat_type;
4206 }
4207
4208 /* If the maximum is greater than 1 and limited, we have to replicate
4209 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
4210 The first one has to be handled carefully because it's the original
4211 copy, which has to be moved up. The remainder can be handled by code
4212 that is common with the non-zero minimum case below. We have to
4213 adjust the value of repeat_max, since one less copy is required. Once
4214 again, we may have to adjust any OP_RECURSE calls inside the group. */
4215
4216 else
4217 {
4218 int offset;
4219 *code = OP_END;
4220 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
4221 memmove(previous + 2 + LINK_SIZE, previous, len);
4222 code += 2 + LINK_SIZE;
4223 *previous++ = OP_BRAZERO + repeat_type;
4224 *previous++ = OP_BRA;
4225
4226 /* We chain together the bracket offset fields that have to be
4227 filled in later when the ends of the brackets are reached. */
4228
4229 offset = (bralink == NULL)? 0 : previous - bralink;
4230 bralink = previous;
4231 PUTINC(previous, 0, offset);
4232 }
4233
4234 repeat_max--;
4235 }
4236
4237 /* If the minimum is greater than zero, replicate the group as many
4238 times as necessary, and adjust the maximum to the number of subsequent
4239 copies that we need. If we set a first char from the group, and didn't
4240 set a required char, copy the latter from the former. If there are any
4241 forward reference subroutine calls in the group, there will be entries on
4242 the workspace list; replicate these with an appropriate increment. */
4243
4244 else
4245 {
4246 if (repeat_min > 1)
4247 {
4248 /* In the pre-compile phase, we don't actually do the replication. We
4249 just adjust the length as if we had. Do some paranoid checks for
4250 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
4251 integer type when available, otherwise double. */
4252
4253 if (lengthptr != NULL)
4254 {
4255 int delta = (repeat_min - 1)*length_prevgroup;
4256 if ((INT64_OR_DOUBLE)(repeat_min - 1)*
4257 (INT64_OR_DOUBLE)length_prevgroup >
4258 (INT64_OR_DOUBLE)INT_MAX ||
4259 OFLOW_MAX - *lengthptr < delta)
4260 {
4261 *errorcodeptr = ERR20;
4262 goto FAILED;
4263 }
4264 *lengthptr += delta;
4265 }
4266
4267 /* This is compiling for real */
4268
4269 else
4270 {
4271 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
4272 for (i = 1; i < repeat_min; i++)
4273 {
4274 uschar *hc;
4275 uschar *this_hwm = cd->hwm;
4276 memcpy(code, previous, len);
4277 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4278 {
4279 PUT(cd->hwm, 0, GET(hc, 0) + len);
4280 cd->hwm += LINK_SIZE;
4281 }
4282 save_hwm = this_hwm;
4283 code += len;
4284 }
4285 }
4286 }
4287
4288 if (repeat_max > 0) repeat_max -= repeat_min;
4289 }
4290
4291 /* This code is common to both the zero and non-zero minimum cases. If
4292 the maximum is limited, it replicates the group in a nested fashion,
4293 remembering the bracket starts on a stack. In the case of a zero minimum,
4294 the first one was set up above. In all cases the repeat_max now specifies
4295 the number of additional copies needed. Again, we must remember to
4296 replicate entries on the forward reference list. */
4297
4298 if (repeat_max >= 0)
4299 {
4300 /* In the pre-compile phase, we don't actually do the replication. We
4301 just adjust the length as if we had. For each repetition we must add 1
4302 to the length for BRAZERO and for all but the last repetition we must
4303 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
4304 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
4305 a 64-bit integer type when available, otherwise double. */
4306
4307 if (lengthptr != NULL && repeat_max > 0)
4308 {
4309 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4310 2 - 2*LINK_SIZE; /* Last one doesn't nest */
4311 if ((INT64_OR_DOUBLE)repeat_max *
4312 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4313 > (INT64_OR_DOUBLE)INT_MAX ||
4314 OFLOW_MAX - *lengthptr < delta)
4315 {
4316 *errorcodeptr = ERR20;
4317 goto FAILED;
4318 }
4319 *lengthptr += delta;
4320 }
4321
4322 /* This is compiling for real */
4323
4324 else for (i = repeat_max - 1; i >= 0; i--)
4325 {
4326 uschar *hc;
4327 uschar *this_hwm = cd->hwm;
4328
4329 *code++ = OP_BRAZERO + repeat_type;
4330
4331 /* All but the final copy start a new nesting, maintaining the
4332 chain of brackets outstanding. */
4333
4334 if (i != 0)
4335 {
4336 int offset;
4337 *code++ = OP_BRA;
4338 offset = (bralink == NULL)? 0 : code - bralink;
4339 bralink = code;
4340 PUTINC(code, 0, offset);
4341 }
4342
4343 memcpy(code, previous, len);
4344 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4345 {
4346 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4347 cd->hwm += LINK_SIZE;
4348 }
4349 save_hwm = this_hwm;
4350 code += len;
4351 }
4352
4353 /* Now chain through the pending brackets, and fill in their length
4354 fields (which are holding the chain links pro tem). */
4355
4356 while (bralink != NULL)
4357 {
4358 int oldlinkoffset;
4359 int offset = code - bralink + 1;
4360 uschar *bra = code - offset;
4361 oldlinkoffset = GET(bra, 1);
4362 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4363 *code++ = OP_KET;
4364 PUTINC(code, 0, offset);
4365 PUT(bra, 1, offset);
4366 }
4367 }
4368
4369 /* If the maximum is unlimited, set a repeater in the final copy. We
4370 can't just offset backwards from the current code point, because we
4371 don't know if there's been an options resetting after the ket. The
4372 correct offset was computed above.
4373
4374 Then, when we are doing the actual compile phase, check to see whether
4375 this group is a non-atomic one that could match an empty string. If so,
4376 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4377 that runtime checking can be done. [This check is also applied to
4378 atomic groups at runtime, but in a different way.] */
4379
4380 else
4381 {
4382 uschar *ketcode = code - ketoffset;
4383 uschar *bracode = ketcode - GET(ketcode, 1);
4384 *ketcode = OP_KETRMAX + repeat_type;
4385 if (lengthptr == NULL && *bracode != OP_ONCE)
4386 {
4387 uschar *scode = bracode;
4388 do
4389 {
4390 if (could_be_empty_branch(scode, ketcode, utf8, cd))
4391 {
4392 *bracode += OP_SBRA - OP_BRA;
4393 break;
4394 }
4395 scode += GET(scode, 1);
4396 }
4397 while (*scode == OP_ALT);
4398 }
4399 }
4400 }
4401
4402 /* If previous is OP_FAIL, it was generated by an empty class [] in
4403 JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4404 by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4405 error above. We can just ignore the repeat in JS case. */
4406
4407 else if (*previous == OP_FAIL) goto END_REPEAT;
4408
4409 /* Else there's some kind of shambles */
4410
4411 else
4412 {
4413 *errorcodeptr = ERR11;
4414 goto FAILED;
4415 }
4416
4417 /* If the character following a repeat is '+', or if certain optimization
4418 tests above succeeded, possessive_quantifier is TRUE. For some of the
4419 simpler opcodes, there is a special alternative opcode for this. For
4420 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4421 The '+' notation is just syntactic sugar, taken from Sun's Java package,
4422 but the special opcodes can optimize it a bit. The repeated item starts at
4423 tempcode, not at previous, which might be the first part of a string whose
4424 (former) last char we repeated.
4425
4426 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4427 an 'upto' may follow. We skip over an 'exact' item, and then test the
4428 length of what remains before proceeding. */
4429
4430 if (possessive_quantifier)
4431 {
4432 int len;
4433
4434 if (*tempcode == OP_TYPEEXACT)
4435 tempcode += _pcre_OP_lengths[*tempcode] +
4436 ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);
4437
4438 else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
4439 {
4440 tempcode += _pcre_OP_lengths[*tempcode];
4441 #ifdef SUPPORT_UTF8
4442 if (utf8 && tempcode[-1] >= 0xc0)
4443 tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];
4444 #endif
4445 }
4446
4447 len = code - tempcode;
4448 if (len > 0) switch (*tempcode)
4449 {
4450 case OP_STAR: *tempcode = OP_POSSTAR; break;
4451 case OP_PLUS: *tempcode = OP_POSPLUS; break;
4452 case OP_QUERY: *tempcode = OP_POSQUERY; break;
4453 case OP_UPTO: *tempcode = OP_POSUPTO; break;
4454
4455 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4456 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4457 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4458 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4459
4460 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4461 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4462 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4463 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4464
4465 /* Because we are moving code along, we must ensure that any
4466 pending recursive references are updated. */
4467
4468 default:
4469 *code = OP_END;
4470 adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm);
4471 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4472 code += 1 + LINK_SIZE;
4473 len += 1 + LINK_SIZE;
4474 tempcode[0] = OP_ONCE;
4475 *code++ = OP_KET;
4476 PUTINC(code, 0, len);
4477 PUT(tempcode, 1, len);
4478 break;
4479 }
4480 }
4481
4482 /* In all cases we no longer have a previous item. We also set the
4483 "follows varying string" flag for subsequently encountered reqbytes if
4484 it isn't already set and we have just passed a varying length item. */
4485
4486 END_REPEAT:
4487 previous = NULL;
4488 cd->req_varyopt |= reqvary;
4489 break;
4490
4491
4492 /* ===================================================================*/
4493 /* Start of nested parenthesized sub-expression, or comment or lookahead or
4494 lookbehind or option setting or condition or all the other extended
4495 parenthesis forms. */
4496
4497 case CHAR_LEFT_PARENTHESIS:
4498 newoptions = options;
4499 skipbytes = 0;
4500 bravalue = OP_CBRA;
4501 save_hwm = cd->hwm;
4502 reset_bracount = FALSE;
4503
4504 /* First deal with various "verbs" that can be introduced by '*'. */
4505
4506 if (*(++ptr) == CHAR_ASTERISK && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4507 {
4508 int i, namelen;
4509 const char *vn = verbnames;
4510 const uschar *name = ++ptr;
4511 previous = NULL;
4512 while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
4513 if (*ptr == CHAR_COLON)
4514 {
4515 *errorcodeptr = ERR59; /* Not supported */
4516 goto FAILED;
4517 }
4518 if (*ptr != CHAR_RIGHT_PARENTHESIS)
4519 {
4520 *errorcodeptr = ERR60;
4521 goto FAILED;
4522 }
4523 namelen = ptr - name;
4524 for (i = 0; i < verbcount; i++)
4525 {
4526 if (namelen == verbs[i].len &&
4527 strncmp((char *)name, vn, namelen) == 0)
4528 {
4529 /* Check for open captures before ACCEPT */
4530
4531 if (verbs[i].op == OP_ACCEPT)
4532 {
4533 open_capitem *oc;
4534 cd->had_accept = TRUE;
4535 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
4536 {
4537 *code++ = OP_CLOSE;
4538 PUT2INC(code, 0, oc->number);
4539 }
4540 }
4541 *code++ = verbs[i].op;
4542 break;
4543 }
4544 vn += verbs[i].len + 1;
4545 }
4546 if (i < verbcount) continue;
4547 *errorcodeptr = ERR60;
4548 goto FAILED;
4549 }
4550
4551 /* Deal with the extended parentheses; all are introduced by '?', and the
4552 appearance of any of them means that this is not a capturing group. */
4553
4554 else if (*ptr == CHAR_QUESTION_MARK)
4555 {
4556 int i, set, unset, namelen;
4557 int *optset;
4558 const uschar *name;
4559 uschar *slot;
4560
4561 switch (*(++ptr))
4562 {
4563 case CHAR_NUMBER_SIGN: /* Comment; skip to ket */
4564 ptr++;
4565 while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4566 if (*ptr == 0)
4567 {
4568 *errorcodeptr = ERR18;
4569 goto FAILED;
4570 }
4571 continue;
4572
4573
4574 /* ------------------------------------------------------------ */
4575 case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
4576 reset_bracount = TRUE;
4577 /* Fall through */
4578
4579 /* ------------------------------------------------------------ */
4580 case CHAR_COLON: /* Non-capturing bracket */
4581 bravalue = OP_BRA;
4582 ptr++;
4583 break;
4584
4585
4586 /* ------------------------------------------------------------ */
4587 case CHAR_LEFT_PARENTHESIS:
4588 bravalue = OP_COND; /* Conditional group */
4589
4590 /* A condition can be an assertion, a number (referring to a numbered
4591 group), a name (referring to a named group), or 'R', referring to
4592 recursion. R<digits> and R&name are also permitted for recursion tests.
4593
4594 There are several syntaxes for testing a named group: (?(name)) is used
4595 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4596
4597 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4598 be the recursive thing or the name 'R' (and similarly for 'R' followed
4599 by digits), and (b) a number could be a name that consists of digits.
4600 In both cases, we look for a name first; if not found, we try the other
4601 cases. */
4602
4603 /* For conditions that are assertions, check the syntax, and then exit
4604 the switch. This will take control down to where bracketed groups,
4605 including assertions, are processed. */
4606
4607 if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
4608 ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
4609 break;
4610
4611 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4612 below), and all need to skip 3 bytes at the start of the group. */
4613
4614 code[1+LINK_SIZE] = OP_CREF;
4615 skipbytes = 3;
4616 refsign = -1;
4617
4618 /* Check for a test for recursion in a named group. */
4619
4620 if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
4621 {
4622 terminator = -1;
4623 ptr += 2;
4624 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4625 }
4626
4627 /* Check for a test for a named group's having been set, using the Perl
4628 syntax (?(<name>) or (?('name') */
4629
4630 else if (ptr[1] == CHAR_LESS_THAN_SIGN)
4631 {
4632 terminator = CHAR_GREATER_THAN_SIGN;
4633 ptr++;
4634 }
4635 else if (ptr[1] == CHAR_APOSTROPHE)
4636 {
4637 terminator = CHAR_APOSTROPHE;
4638 ptr++;
4639 }
4640 else
4641 {
4642 terminator = 0;
4643 if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
4644 }
4645
4646 /* We now expect to read a name; anything else is an error */
4647
4648 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4649 {
4650 ptr += 1; /* To get the right offset */
4651 *errorcodeptr = ERR28;
4652 goto FAILED;
4653 }
4654
4655 /* Read the name, but also get it as a number if it's all digits */
4656
4657 recno = 0;
4658 name = ++ptr;
4659 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4660 {
4661 if (recno >= 0)
4662 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4663 recno * 10 + *ptr - CHAR_0 : -1;
4664 ptr++;
4665 }
4666 namelen = ptr - name;
4667
4668 if ((terminator > 0 && *ptr++ != terminator) ||
4669 *ptr++ != CHAR_RIGHT_PARENTHESIS)
4670 {
4671 ptr--; /* Error offset */
4672 *errorcodeptr = ERR26;
4673 goto FAILED;
4674 }
4675
4676 /* Do no further checking in the pre-compile phase. */
4677
4678 if (lengthptr != NULL) break;
4679
4680 /* In the real compile we do the work of looking for the actual
4681 reference. If the string started with "+" or "-" we require the rest to
4682 be digits, in which case recno will be set. */
4683
4684 if (refsign > 0)
4685 {
4686 if (recno <= 0)
4687 {
4688 *errorcodeptr = ERR58;
4689 goto FAILED;
4690 }
4691 recno = (refsign == CHAR_MINUS)?
4692 cd->bracount - recno + 1 : recno +cd->bracount;
4693 if (recno <= 0 || recno > cd->final_bracount)
4694 {
4695 *errorcodeptr = ERR15;
4696 goto FAILED;
4697 }
4698 PUT2(code, 2+LINK_SIZE, recno);
4699 break;
4700 }
4701
4702 /* Otherwise (did not start with "+" or "-"), start by looking for the
4703 name. If we find a name, add one to the opcode to change OP_CREF or
4704 OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,
4705 except they record that the reference was originally to a name. The
4706 information is used to check duplicate names. */
4707
4708 slot = cd->name_table;
4709 for (i = 0; i < cd->names_found; i++)
4710 {
4711 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4712 slot += cd->name_entry_size;
4713 }
4714
4715 /* Found a previous named subpattern */
4716
4717 if (i < cd->names_found)
4718 {
4719 recno = GET2(slot, 0);
4720 PUT2(code, 2+LINK_SIZE, recno);
4721 code[1+LINK_SIZE]++;
4722 }
4723
4724 /* Search the pattern for a forward reference */
4725
4726 else if ((i = find_parens(cd, name, namelen,
4727 (options & PCRE_EXTENDED) != 0)) > 0)
4728 {
4729 PUT2(code, 2+LINK_SIZE, i);
4730 code[1+LINK_SIZE]++;
4731 }
4732
4733 /* If terminator == 0 it means that the name followed directly after
4734 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4735 some further alternatives to try. For the cases where terminator != 0
4736 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4737 now checked all the possibilities, so give an error. */
4738
4739 else if (terminator != 0)
4740 {
4741 *errorcodeptr = ERR15;
4742 goto FAILED;
4743 }
4744
4745 /* Check for (?(R) for recursion. Allow digits after R to specify a
4746 specific group number. */
4747
4748 else if (*name == CHAR_R)
4749 {
4750 recno = 0;
4751 for (i = 1; i < namelen; i++)
4752 {
4753 if ((digitab[name[i]] & ctype_digit) == 0)
4754 {
4755 *errorcodeptr = ERR15;
4756 goto FAILED;
4757 }
4758 recno = recno * 10 + name[i] - CHAR_0;
4759 }
4760 if (recno == 0) recno = RREF_ANY;
4761 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4762 PUT2(code, 2+LINK_SIZE, recno);
4763 }
4764
4765 /* Similarly, check for the (?(DEFINE) "condition", which is always
4766 false. */
4767
4768 else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)
4769 {
4770 code[1+LINK_SIZE] = OP_DEF;
4771 skipbytes = 1;
4772 }
4773
4774 /* Check for the "name" actually being a subpattern number. We are
4775 in the second pass here, so final_bracount is set. */
4776
4777 else if (recno > 0 && recno <= cd->final_bracount)
4778 {
4779 PUT2(code, 2+LINK_SIZE, recno);
4780 }
4781
4782 /* Either an unidentified subpattern, or a reference to (?(0) */
4783
4784 else
4785 {
4786 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4787 goto FAILED;
4788 }
4789 break;
4790
4791
4792 /* ------------------------------------------------------------ */
4793 case CHAR_EQUALS_SIGN: /* Positive lookahead */
4794 bravalue = OP_ASSERT;
4795 ptr++;
4796 break;
4797
4798
4799 /* ------------------------------------------------------------ */
4800 case CHAR_EXCLAMATION_MARK: /* Negative lookahead */
4801 ptr++;
4802 if (*ptr == CHAR_RIGHT_PARENTHESIS) /* Optimize (?!) */
4803 {
4804 *code++ = OP_FAIL;
4805 previous = NULL;
4806 continue;
4807 }
4808 bravalue = OP_ASSERT_NOT;
4809 break;
4810
4811
4812 /* ------------------------------------------------------------ */
4813 case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */
4814 switch (ptr[1])
4815 {
4816 case CHAR_EQUALS_SIGN: /* Positive lookbehind */
4817 bravalue = OP_ASSERTBACK;
4818 ptr += 2;
4819 break;
4820
4821 case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */
4822 bravalue = OP_ASSERTBACK_NOT;
4823 ptr += 2;
4824 break;
4825
4826 default: /* Could be name define, else bad */
4827 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4828 ptr++; /* Correct offset for error */
4829 *errorcodeptr = ERR24;
4830 goto FAILED;
4831 }
4832 break;
4833
4834
4835 /* ------------------------------------------------------------ */
4836 case CHAR_GREATER_THAN_SIGN: /* One-time brackets */
4837 bravalue = OP_ONCE;
4838 ptr++;
4839 break;
4840
4841
4842 /* ------------------------------------------------------------ */
4843 case CHAR_C: /* Callout - may be followed by digits; */
4844 previous_callout = code; /* Save for later completion */
4845 after_manual_callout = 1; /* Skip one item before completing */
4846 *code++ = OP_CALLOUT;
4847 {
4848 int n = 0;
4849 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4850 n = n * 10 + *ptr - CHAR_0;
4851 if (*ptr != CHAR_RIGHT_PARENTHESIS)
4852 {
4853 *errorcodeptr = ERR39;
4854 goto FAILED;
4855 }
4856 if (n > 255)
4857 {
4858 *errorcodeptr = ERR38;
4859 goto FAILED;
4860 }
4861 *code++ = n;
4862 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4863 PUT(code, LINK_SIZE, 0); /* Default length */
4864 code += 2 * LINK_SIZE;
4865 }
4866 previous = NULL;
4867 continue;
4868
4869
4870 /* ------------------------------------------------------------ */
4871 case CHAR_P: /* Python-style named subpattern handling */
4872 if (*(++ptr) == CHAR_EQUALS_SIGN ||
4873 *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */
4874 {
4875 is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
4876 terminator = CHAR_RIGHT_PARENTHESIS;
4877 goto NAMED_REF_OR_RECURSE;
4878 }
4879 else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */
4880 {
4881 *errorcodeptr = ERR41;
4882 goto FAILED;
4883 }
4884 /* Fall through to handle (?P< as (?< is handled */
4885
4886
4887 /* ------------------------------------------------------------ */
4888 DEFINE_NAME: /* Come here from (?< handling */
4889 case CHAR_APOSTROPHE:
4890 {
4891 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
4892 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
4893 name = ++ptr;
4894
4895 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4896 namelen = ptr - name;
4897
4898 /* In the pre-compile phase, just do a syntax check. */
4899
4900 if (lengthptr != NULL)
4901 {
4902 if (*ptr != terminator)
4903 {
4904 *errorcodeptr = ERR42;
4905 goto FAILED;
4906 }
4907 if (cd->names_found >= MAX_NAME_COUNT)
4908 {
4909 *errorcodeptr = ERR49;
4910 goto FAILED;
4911 }
4912 if (namelen + 3 > cd->name_entry_size)
4913 {
4914 cd->name_entry_size = namelen + 3;
4915 if (namelen > MAX_NAME_SIZE)
4916 {
4917 *errorcodeptr = ERR48;
4918 goto FAILED;
4919 }
4920 }
4921 }
4922
4923 /* In the real compile, create the entry in the table, maintaining
4924 alphabetical order. Duplicate names for different numbers are
4925 permitted only if PCRE_DUPNAMES is set. Duplicate names for the same
4926 number are always OK. (An existing number can be re-used if (?|
4927 appears in the pattern.) In either event, a duplicate name results in
4928 a duplicate entry in the table, even if the number is the same. This
4929 is because the number of names, and hence the table size, is computed
4930 in the pre-compile, and it affects various numbers and pointers which
4931 would all have to be modified, and the compiled code moved down, if
4932 duplicates with the same number were omitted from the table. This
4933 doesn't seem worth the hassle. However, *different* names for the
4934 same number are not permitted. */
4935
4936 else
4937 {
4938 BOOL dupname = FALSE;
4939 slot = cd->name_table;
4940
4941 for (i = 0; i < cd->names_found; i++)
4942 {
4943 int crc = memcmp(name, slot+2, namelen);
4944 if (crc == 0)
4945 {
4946 if (slot[2+namelen] == 0)
4947 {
4948 if (GET2(slot, 0) != cd->bracount + 1 &&
4949 (options & PCRE_DUPNAMES) == 0)
4950 {
4951 *errorcodeptr = ERR43;
4952 goto FAILED;
4953 }
4954 else dupname = TRUE;
4955 }
4956 else crc = -1; /* Current name is a substring */
4957 }
4958
4959 /* Make space in the table and break the loop for an earlier
4960 name. For a duplicate or later name, carry on. We do this for
4961 duplicates so that in the simple case (when (?| is not used) they
4962 are in order of their numbers. */
4963
4964 if (crc < 0)
4965 {
4966 memmove(slot + cd->name_entry_size, slot,
4967 (cd->names_found - i) * cd->name_entry_size);
4968 break;
4969 }
4970
4971 /* Continue the loop for a later or duplicate name */
4972
4973 slot += cd->name_entry_size;
4974 }
4975
4976 /* For non-duplicate names, check for a duplicate number before
4977 adding the new name. */
4978
4979 if (!dupname)
4980 {
4981 uschar *cslot = cd->name_table;
4982 for (i = 0; i < cd->names_found; i++)
4983 {
4984 if (cslot != slot)
4985 {
4986 if (GET2(cslot, 0) == cd->bracount + 1)
4987 {
4988 *errorcodeptr = ERR65;
4989 goto FAILED;
4990 }
4991 }
4992 else i--;
4993 cslot += cd->name_entry_size;
4994 }
4995 }
4996
4997 PUT2(slot, 0, cd->bracount + 1);
4998 memcpy(slot + 2, name, namelen);
4999 slot[2+namelen] = 0;
5000 }
5001 }
5002
5003 /* In both pre-compile and compile, count the number of names we've
5004 encountered. */
5005
5006 cd->names_found++;
5007 ptr++; /* Move past > or ' */
5008 goto NUMBERED_GROUP;
5009
5010
5011 /* ------------------------------------------------------------ */
5012 case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */
5013 terminator = CHAR_RIGHT_PARENTHESIS;
5014 is_recurse = TRUE;
5015 /* Fall through */
5016
5017 /* We come here from the Python syntax above that handles both
5018 references (?P=name) and recursion (?P>name), as well as falling
5019 through from the Perl recursion syntax (?&name). We also come here from
5020 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
5021 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
5022
5023 NAMED_REF_OR_RECURSE:
5024 name = ++ptr;
5025 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
5026 namelen = ptr - name;
5027
5028 /* In the pre-compile phase, do a syntax check and set a dummy
5029 reference number. */
5030
5031 if (lengthptr != NULL)
5032 {
5033 if (namelen == 0)
5034 {
5035 *errorcodeptr = ERR62;
5036 goto FAILED;
5037 }
5038 if (*ptr != terminator)
5039 {
5040 *errorcodeptr = ERR42;
5041 goto FAILED;
5042 }
5043 if (namelen > MAX_NAME_SIZE)
5044 {
5045 *errorcodeptr = ERR48;
5046 goto FAILED;
5047 }
5048 recno = 0;
5049 }
5050
5051 /* In the real compile, seek the name in the table. We check the name
5052 first, and then check that we have reached the end of the name in the
5053 table. That way, if the name is longer than any in the table,
5054 the comparison will fail without reading beyond the table entry. */
5055
5056 else
5057 {
5058 slot = cd->name_table;
5059 for (i = 0; i < cd->names_found; i++)
5060 {
5061 if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
5062 slot[2+namelen] == 0)
5063 break;
5064 slot += cd->name_entry_size;
5065 }
5066
5067 if (i < cd->names_found) /* Back reference */
5068 {
5069 recno = GET2(slot, 0);
5070 }
5071 else if ((recno = /* Forward back reference */
5072 find_parens(cd, name, namelen,
5073 (options & PCRE_EXTENDED) != 0)) <= 0)
5074 {
5075 *errorcodeptr = ERR15;
5076 goto FAILED;
5077 }
5078 }
5079
5080 /* In both phases, we can now go to the code that handles numerical
5081 recursion or backreferences. */
5082
5083 if (is_recurse) goto HANDLE_RECURSION;
5084 else goto HANDLE_REFERENCE;
5085
5086
5087 /* ------------------------------------------------------------ */
5088 case CHAR_R: /* Recursion */
5089 ptr++; /* Same as (?0) */
5090 /* Fall through */
5091
5092
5093 /* ------------------------------------------------------------ */
5094 case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */
5095 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
5096 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
5097 {
5098 const uschar *called;
5099 terminator = CHAR_RIGHT_PARENTHESIS;
5100
5101 /* Come here from the \g<...> and \g'...' code (Oniguruma
5102 compatibility). However, the syntax has been checked to ensure that
5103 the ... are a (signed) number, so that neither ERR63 nor ERR29 will
5104 be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
5105 ever be taken. */
5106
5107 HANDLE_NUMERICAL_RECURSION:
5108
5109 if ((refsign = *ptr) == CHAR_PLUS)
5110 {
5111 ptr++;
5112 if ((digitab[*ptr] & ctype_digit) == 0)
5113 {
5114 *errorcodeptr = ERR63;
5115 goto FAILED;
5116 }
5117 }
5118 else if (refsign == CHAR_MINUS)
5119 {
5120 if ((digitab[ptr[1]] & ctype_digit) == 0)
5121 goto OTHER_CHAR_AFTER_QUERY;
5122 ptr++;
5123 }
5124
5125 recno = 0;
5126 while((digitab[*ptr] & ctype_digit) != 0)
5127 recno = recno * 10 + *ptr++ - CHAR_0;
5128
5129 if (*ptr != terminator)
5130 {
5131 *errorcodeptr = ERR29;
5132 goto FAILED;
5133 }
5134
5135 if (refsign == CHAR_MINUS)
5136 {
5137 if (recno == 0)
5138 {
5139 *errorcodeptr = ERR58;
5140 goto FAILED;
5141 }
5142 recno = cd->bracount - recno + 1;
5143 if (recno <= 0)
5144 {
5145 *errorcodeptr = ERR15;
5146 goto FAILED;
5147 }
5148 }
5149 else if (refsign == CHAR_PLUS)
5150 {
5151 if (recno == 0)
5152 {
5153 *errorcodeptr = ERR58;
5154 goto FAILED;
5155 }
5156 recno += cd->bracount;
5157 }
5158
5159 /* Come here from code above that handles a named recursion */
5160
5161 HANDLE_RECURSION:
5162
5163 previous = code;
5164 called = cd->start_code;
5165
5166 /* When we are actually compiling, find the bracket that is being
5167 referenced. Temporarily end the regex in case it doesn't exist before
5168 this point. If we end up with a forward reference, first check that
5169 the bracket does occur later so we can give the error (and position)
5170 now. Then remember this forward reference in the workspace so it can
5171 be filled in at the end. */
5172
5173 if (lengthptr == NULL)
5174 {
5175 *code = OP_END;
5176 if (recno != 0)
5177 called = _pcre_find_bracket(cd->start_code, utf8, recno);
5178
5179 /* Forward reference */
5180
5181 if (called == NULL)
5182 {
5183 if (find_parens(cd, NULL, recno,
5184 (options & PCRE_EXTENDED) != 0) < 0)
5185 {
5186 *errorcodeptr = ERR15;
5187 goto FAILED;
5188 }
5189
5190 /* Fudge the value of "called" so that when it is inserted as an
5191 offset below, what is actually inserted is the reference number
5192 of the group. */
5193
5194 called = cd->start_code + recno;
5195 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
5196 }
5197
5198 /* If not a forward reference, and the subpattern is still open,
5199 this is a recursive call. We check to see if this is a left
5200 recursion that could loop for ever, and diagnose that case. */
5201
5202 else if (GET(called, 1) == 0 &&
5203 could_be_empty(called, code, bcptr, utf8, cd))
5204 {
5205 *errorcodeptr = ERR40;
5206 goto FAILED;
5207 }
5208 }
5209
5210 /* Insert the recursion/subroutine item, automatically wrapped inside
5211 "once" brackets. Set up a "previous group" length so that a
5212 subsequent quantifier will work. */
5213
5214 *code = OP_ONCE;
5215 PUT(code, 1, 2 + 2*LINK_SIZE);
5216 code += 1 + LINK_SIZE;
5217
5218 *code = OP_RECURSE;
5219 PUT(code, 1, called - cd->start_code);
5220 code += 1 + LINK_SIZE;
5221
5222 *code = OP_KET;
5223 PUT(code, 1, 2 + 2*LINK_SIZE);
5224 code += 1 + LINK_SIZE;
5225
5226 length_prevgroup = 3 + 3*LINK_SIZE;
5227 }
5228
5229 /* Can't determine a first byte now */
5230
5231 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5232 continue;
5233
5234
5235 /* ------------------------------------------------------------ */
5236 default: /* Other characters: check option setting */
5237 OTHER_CHAR_AFTER_QUERY:
5238 set = unset = 0;
5239 optset = &set;
5240
5241 while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
5242 {
5243 switch (*ptr++)
5244 {
5245 case CHAR_MINUS: optset = &unset; break;
5246
5247 case CHAR_J: /* Record that it changed in the external options */
5248 *optset |= PCRE_DUPNAMES;
5249 cd->external_flags |= PCRE_JCHANGED;
5250 break;
5251
5252 case CHAR_i: *optset |= PCRE_CASELESS; break;
5253 case CHAR_m: *optset |= PCRE_MULTILINE; break;
5254 case CHAR_s: *optset |= PCRE_DOTALL; break;
5255 case CHAR_x: *optset |= PCRE_EXTENDED; break;
5256 case CHAR_U: *optset |= PCRE_UNGREEDY; break;
5257 case CHAR_X: *optset |= PCRE_EXTRA; break;
5258
5259 default: *errorcodeptr = ERR12;
5260 ptr--; /* Correct the offset */
5261 goto FAILED;
5262 }
5263 }
5264
5265 /* Set up the changed option bits, but don't change anything yet. */
5266
5267 newoptions = (options | set) & (~unset);
5268
5269 /* If the options ended with ')' this is not the start of a nested
5270 group with option changes, so the options change at this level. If this
5271 item is right at the start of the pattern, the options can be
5272 abstracted and made external in the pre-compile phase, and ignored in
5273 the compile phase. This can be helpful when matching -- for instance in
5274 caseless checking of required bytes.
5275
5276 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
5277 definitely *not* at the start of the pattern because something has been
5278 compiled. In the pre-compile phase, however, the code pointer can have
5279 that value after the start, because it gets reset as code is discarded
5280 during the pre-compile. However, this can happen only at top level - if
5281 we are within parentheses, the starting BRA will still be present. At
5282 any parenthesis level, the length value can be used to test if anything
5283 has been compiled at that level. Thus, a test for both these conditions
5284 is necessary to ensure we correctly detect the start of the pattern in
5285 both phases.
5286
5287 If we are not at the pattern start, compile code to change the ims
5288 options if this setting actually changes any of them, and reset the
5289 greedy defaults and the case value for firstbyte and reqbyte. */
5290
5291 if (*ptr == CHAR_RIGHT_PARENTHESIS)
5292 {
5293 if (code == cd->start_code + 1 + LINK_SIZE &&
5294 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
5295 {
5296 cd->external_options = newoptions;
5297 }
5298 else
5299 {
5300 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
5301 {
5302 *code++ = OP_OPT;
5303 *code++ = newoptions & PCRE_IMS;
5304 }
5305 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
5306 greedy_non_default = greedy_default ^ 1;
5307 req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
5308 }
5309
5310 /* Change options at this level, and pass them back for use
5311 in subsequent branches. When not at the start of the pattern, this
5312 information is also necessary so that a resetting item can be
5313 compiled at the end of a group (if we are in a group). */
5314
5315 *optionsptr = options = newoptions;
5316 previous = NULL; /* This item can't be repeated */
5317 continue; /* It is complete */
5318 }
5319
5320 /* If the options ended with ':' we are heading into a nested group
5321 with possible change of options. Such groups are non-capturing and are
5322 not assertions of any kind. All we need to do is skip over the ':';
5323 the newoptions value is handled below. */
5324
5325 bravalue = OP_BRA;
5326 ptr++;
5327 } /* End of switch for character following (? */
5328 } /* End of (? handling */
5329
5330 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
5331 all unadorned brackets become non-capturing and behave like (?:...)
5332 brackets. */
5333
5334 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
5335 {
5336 bravalue = OP_BRA;
5337 }
5338
5339 /* Else we have a capturing group. */
5340
5341 else
5342 {
5343 NUMBERED_GROUP:
5344 cd->bracount += 1;
5345 PUT2(code, 1+LINK_SIZE, cd->bracount);
5346 skipbytes = 2;
5347 }
5348
5349 /* Process nested bracketed regex. Assertions may not be repeated, but
5350 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
5351 non-register variable in order to be able to pass its address because some
5352 compilers complain otherwise. Pass in a new setting for the ims options if
5353 they have changed. */
5354
5355 previous = (bravalue >= OP_ONCE)? code : NULL;
5356 *code = bravalue;
5357 tempcode = code;
5358 tempreqvary = cd->req_varyopt; /* Save value before bracket */
5359 length_prevgroup = 0; /* Initialize for pre-compile phase */
5360
5361 if (!compile_regex(
5362 newoptions, /* The complete new option state */
5363 options & PCRE_IMS, /* The previous ims option state */
5364 &tempcode, /* Where to put code (updated) */
5365 &ptr, /* Input pointer (updated) */
5366 errorcodeptr, /* Where to put an error message */
5367 (bravalue == OP_ASSERTBACK ||
5368 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
5369 reset_bracount, /* True if (?| group */
5370 skipbytes, /* Skip over bracket number */
5371 &subfirstbyte, /* For possible first char */
5372 &subreqbyte, /* For possible last char */
5373 bcptr, /* Current branch chain */
5374 cd, /* Tables block */
5375 (lengthptr == NULL)? NULL : /* Actual compile phase */
5376 &length_prevgroup /* Pre-compile phase */
5377 ))
5378 goto FAILED;
5379
5380 /* At the end of compiling, code is still pointing to the start of the
5381 group, while tempcode has been updated to point past the end of the group
5382 and any option resetting that may follow it. The pattern pointer (ptr)
5383 is on the bracket. */
5384
5385 /* If this is a conditional bracket, check that there are no more than
5386 two branches in the group, or just one if it's a DEFINE group. We do this
5387 in the real compile phase, not in the pre-pass, where the whole group may
5388 not be available. */
5389
5390 if (bravalue == OP_COND && lengthptr == NULL)
5391 {
5392 uschar *tc = code;
5393 int condcount = 0;
5394
5395 do {
5396 condcount++;
5397 tc += GET(tc,1);
5398 }
5399 while (*tc != OP_KET);
5400
5401 /* A DEFINE group is never obeyed inline (the "condition" is always
5402 false). It must have only one branch. */
5403
5404 if (code[LINK_SIZE+1] == OP_DEF)
5405 {
5406 if (condcount > 1)
5407 {
5408 *errorcodeptr = ERR54;
5409 goto FAILED;
5410 }
5411 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
5412 }
5413
5414 /* A "normal" conditional group. If there is just one branch, we must not
5415 make use of its firstbyte or reqbyte, because this is equivalent to an
5416 empty second branch. */
5417
5418 else
5419 {
5420 if (condcount > 2)
5421 {
5422 *errorcodeptr = ERR27;
5423 goto FAILED;
5424 }
5425 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
5426 }
5427 }
5428
5429 /* Error if hit end of pattern */
5430
5431 if (*ptr != CHAR_RIGHT_PARENTHESIS)
5432 {
5433 *errorcodeptr = ERR14;
5434 goto FAILED;
5435 }
5436
5437 /* In the pre-compile phase, update the length by the length of the group,
5438 less the brackets at either end. Then reduce the compiled code to just a
5439 set of non-capturing brackets so that it doesn't use much memory if it is
5440 duplicated by a quantifier.*/
5441
5442 if (lengthptr != NULL)
5443 {
5444 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
5445 {
5446 *errorcodeptr = ERR20;
5447 goto FAILED;
5448 }
5449 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5450 *code++ = OP_BRA;
5451 PUTINC(code, 0, 1 + LINK_SIZE);
5452 *code++ = OP_KET;
5453 PUTINC(code, 0, 1 + LINK_SIZE);
5454 break; /* No need to waste time with special character handling */
5455 }
5456
5457 /* Otherwise update the main code pointer to the end of the group. */
5458
5459 code = tempcode;
5460
5461 /* For a DEFINE group, required and first character settings are not
5462 relevant. */
5463
5464 if (bravalue == OP_DEF) break;
5465
5466 /* Handle updating of the required and first characters for other types of
5467 group. Update for normal brackets of all kinds, and conditions with two
5468 branches (see code above). If the bracket is followed by a quantifier with
5469 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5470 zerofirstbyte outside the main loop so that they can be accessed for the
5471 back off. */
5472
5473 zeroreqbyte = reqbyte;
5474 zerofirstbyte = firstbyte;
5475 groupsetfirstbyte = FALSE;
5476
5477 if (bravalue >= OP_ONCE)
5478 {
5479 /* If we have not yet set a firstbyte in this branch, take it from the
5480 subpattern, remembering that it was set here so that a repeat of more
5481 than one can replicate it as reqbyte if necessary. If the subpattern has
5482 no firstbyte, set "none" for the whole branch. In both cases, a zero
5483 repeat forces firstbyte to "none". */
5484
5485 if (firstbyte == REQ_UNSET)
5486 {
5487 if (subfirstbyte >= 0)
5488 {
5489 firstbyte = subfirstbyte;
5490 groupsetfirstbyte = TRUE;
5491 }
5492 else firstbyte = REQ_NONE;
5493 zerofirstbyte = REQ_NONE;
5494 }
5495
5496 /* If firstbyte was previously set, convert the subpattern's firstbyte
5497 into reqbyte if there wasn't one, using the vary flag that was in
5498 existence beforehand. */
5499
5500 else if (subfirstbyte >= 0 && subreqbyte < 0)
5501 subreqbyte = subfirstbyte | tempreqvary;
5502
5503 /* If the subpattern set a required byte (or set a first byte that isn't
5504 really the first byte - see above), set it. */
5505
5506 if (subreqbyte >= 0) reqbyte = subreqbyte;
5507 }
5508
5509 /* For a forward assertion, we take the reqbyte, if set. This can be
5510 helpful if the pattern that follows the assertion doesn't set a different
5511 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5512 for an assertion, however because it leads to incorrect effect for patterns
5513 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5514 of a firstbyte. This is overcome by a scan at the end if there's no
5515 firstbyte, looking for an asserted first char. */
5516
5517 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5518 break; /* End of processing '(' */
5519
5520
5521 /* ===================================================================*/
5522 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5523 are arranged to be the negation of the corresponding OP_values. For the
5524 back references, the values are ESC_REF plus the reference number. Only
5525 back references and those types that consume a character may be repeated.
5526 We can test for values between ESC_b and ESC_Z for the latter; this may
5527 have to change if any new ones are ever created. */
5528
5529 case CHAR_BACKSLASH:
5530 tempptr = ptr;
5531 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5532 if (*errorcodeptr != 0) goto FAILED;
5533
5534 if (c < 0)
5535 {
5536 if (-c == ESC_Q) /* Handle start of quoted string */
5537 {
5538 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5539 ptr += 2; /* avoid empty string */
5540 else inescq = TRUE;
5541 continue;
5542 }
5543
5544 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
5545
5546 /* For metasequences that actually match a character, we disable the
5547 setting of a first character if it hasn't already been set. */
5548
5549 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5550 firstbyte = REQ_NONE;
5551
5552 /* Set values to reset to if this is followed by a zero repeat. */
5553
5554 zerofirstbyte = firstbyte;
5555 zeroreqbyte = reqbyte;
5556
5557 /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5558 is a subroutine call by number (Oniguruma syntax). In fact, the value
5559 -ESC_g is returned only for these cases. So we don't need to check for <
5560 or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5561 -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5562 that is a synonym for a named back reference). */
5563
5564 if (-c == ESC_g)
5565 {
5566 const uschar *p;
5567 save_hwm = cd->hwm; /* Normally this is set when '(' is read */
5568 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5569 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5570
5571 /* These two statements stop the compiler from warning about possibly
5572 unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5573 fact, because we actually check for a number below, the paths that
5574 would actually be in error are never taken. */
5575
5576 skipbytes = 0;
5577 reset_bracount = FALSE;
5578
5579 /* Test for a name */
5580
5581 if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
5582 {
5583 BOOL isnumber = TRUE;
5584 for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5585 {
5586 if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5587 if ((cd->ctypes[*p] & ctype_word) == 0) break;
5588 }
5589 if (*p != terminator)
5590 {
5591 *errorcodeptr = ERR57;
5592 break;
5593 }
5594 if (isnumber)
5595 {
5596 ptr++;
5597 goto HANDLE_NUMERICAL_RECURSION;
5598 }
5599 is_recurse = TRUE;
5600 goto NAMED_REF_OR_RECURSE;
5601 }
5602
5603 /* Test a signed number in angle brackets or quotes. */
5604
5605 p = ptr + 2;
5606 while ((digitab[*p] & ctype_digit) != 0) p++;
5607 if (*p != terminator)
5608 {
5609 *errorcodeptr = ERR57;
5610 break;
5611 }
5612 ptr++;
5613 goto HANDLE_NUMERICAL_RECURSION;
5614 }
5615
5616 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5617 We also support \k{name} (.NET syntax) */
5618
5619 if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||
5620 ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))
5621 {
5622 is_recurse = FALSE;
5623 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5624 CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
5625 CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
5626 goto NAMED_REF_OR_RECURSE;
5627 }
5628
5629 /* Back references are handled specially; must disable firstbyte if
5630 not set to cope with cases like (?=(\w+))\1: which would otherwise set
5631 ':' later. */
5632
5633 if (-c >= ESC_REF)
5634 {
5635 open_capitem *oc;
5636 recno = -c - ESC_REF;
5637
5638 HANDLE_REFERENCE: /* Come here from named backref handling */
5639 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5640 previous = code;
5641 *code++ = OP_REF;
5642 PUT2INC(code, 0, recno);
5643 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5644 if (recno > cd->top_backref) cd->top_backref = recno;
5645
5646 /* Check to see if this back reference is recursive, that is, it
5647 is inside the group that it references. A flag is set so that the
5648 group can be made atomic. */
5649
5650 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
5651 {
5652 if (oc->number == recno)
5653 {
5654 oc->flag = TRUE;
5655 break;
5656 }
5657 }
5658 }
5659
5660 /* So are Unicode property matches, if supported. */
5661
5662 #ifdef SUPPORT_UCP
5663 else if (-c == ESC_P || -c == ESC_p)
5664 {
5665 BOOL negated;
5666 int pdata;
5667 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5668 if (ptype < 0) goto FAILED;
5669 previous = code;
5670 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5671 *code++ = ptype;
5672 *code++ = pdata;
5673 }
5674 #else
5675
5676 /* If Unicode properties are not supported, \X, \P, and \p are not
5677 allowed. */
5678
5679 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5680 {
5681 *errorcodeptr = ERR45;
5682 goto FAILED;
5683 }
5684 #endif
5685
5686 /* For the rest (including \X when Unicode properties are supported), we
5687 can obtain the OP value by negating the escape value. */
5688
5689 else
5690 {
5691 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5692 *code++ = -c;
5693 }
5694 continue;
5695 }
5696
5697 /* We have a data character whose value is in c. In UTF-8 mode it may have
5698 a value > 127. We set its representation in the length/buffer, and then
5699 handle it as a data character. */
5700
5701 #ifdef SUPPORT_UTF8
5702 if (utf8 && c > 127)
5703 mclength = _pcre_ord2utf8(c, mcbuffer);
5704 else
5705 #endif
5706
5707 {
5708 mcbuffer[0] = c;
5709 mclength = 1;
5710 }
5711 goto ONE_CHAR;
5712
5713
5714 /* ===================================================================*/
5715 /* Handle a literal character. It is guaranteed not to be whitespace or #
5716 when the extended flag is set. If we are in UTF-8 mode, it may be a
5717 multi-byte literal character. */
5718
5719 default:
5720 NORMAL_CHAR:
5721 mclength = 1;
5722 mcbuffer[0] = c;
5723
5724 #ifdef SUPPORT_UTF8
5725 if (utf8 && c >= 0xc0)
5726 {
5727 while ((ptr[1] & 0xc0) == 0x80)
5728 mcbuffer[mclength++] = *(++ptr);
5729 }
5730 #endif
5731
5732 /* At this point we have the character's bytes in mcbuffer, and the length
5733 in mclength. When not in UTF-8 mode, the length is always 1. */
5734
5735 ONE_CHAR:
5736 previous = code;
5737 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5738 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5739
5740 /* Remember if \r or \n were seen */
5741
5742 if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
5743 cd->external_flags |= PCRE_HASCRORLF;
5744
5745 /* Set the first and required bytes appropriately. If no previous first
5746 byte, set it from this character, but revert to none on a zero repeat.
5747 Otherwise, leave the firstbyte value alone, and don't change it on a zero
5748 repeat. */
5749
5750 if (firstbyte == REQ_UNSET)
5751 {
5752 zerofirstbyte = REQ_NONE;
5753 zeroreqbyte = reqbyte;
5754
5755 /* If the character is more than one byte long, we can set firstbyte
5756 only if it is not to be matched caselessly. */
5757
5758 if (mclength == 1 || req_caseopt == 0)
5759 {
5760 firstbyte = mcbuffer[0] | req_caseopt;
5761 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5762 }
5763 else firstbyte = reqbyte = REQ_NONE;
5764 }
5765
5766 /* firstbyte was previously set; we can set reqbyte only if the length is
5767 1 or the matching is caseful. */
5768
5769 else
5770 {
5771 zerofirstbyte = firstbyte;
5772 zeroreqbyte = reqbyte;
5773 if (mclength == 1 || req_caseopt == 0)
5774 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5775 }
5776
5777 break; /* End of literal character handling */
5778 }
5779 } /* end of big loop */
5780
5781
5782 /* Control never reaches here by falling through, only by a goto for all the
5783 error states. Pass back the position in the pattern so that it can be displayed
5784 to the user for diagnosing the error. */
5785
5786 FAILED:
5787 *ptrptr = ptr;
5788 return FALSE;
5789 }
5790
5791
5792
5793
5794 /*************************************************
5795 * Compile sequence of alternatives *
5796 *************************************************/
5797
5798 /* On entry, ptr is pointing past the bracket character, but on return it
5799 points to the closing bracket, or vertical bar, or end of string. The code
5800 variable is pointing at the byte into which the BRA operator has been stored.
5801 If the ims options are changed at the start (for a (?ims: group) or during any
5802 branch, we need to insert an OP_OPT item at the start of every following branch
5803 to ensure they get set correctly at run time, and also pass the new options
5804 into every subsequent branch compile.
5805
5806 This function is used during the pre-compile phase when we are trying to find
5807 out the amount of memory needed, as well as during the real compile phase. The
5808 value of lengthptr distinguishes the two phases.
5809
5810 Arguments:
5811 options option bits, including any changes for this subpattern
5812 oldims previous settings of ims option bits
5813 codeptr -> the address of the current code pointer
5814 ptrptr -> the address of the current pattern pointer
5815 errorcodeptr -> pointer to error code variable
5816 lookbehind TRUE if this is a lookbehind assertion
5817 reset_bracount TRUE to reset the count for each branch
5818 skipbytes skip this many bytes at start (for brackets and OP_COND)
5819 firstbyteptr place to put the first required character, or a negative number
5820 reqbyteptr place to put the last required character, or a negative number
5821 bcptr pointer to the chain of currently open branches
5822 cd points to the data block with tables pointers etc.
5823 lengthptr NULL during the real compile phase
5824 points to length accumulator during pre-compile phase
5825
5826 Returns: TRUE on success
5827 */
5828
5829 static BOOL
5830 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5831 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5832 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5833 int *lengthptr)
5834 {
5835 const uschar *ptr = *ptrptr;
5836 uschar *code = *codeptr;
5837 uschar *last_branch = code;
5838 uschar *start_bracket = code;
5839 uschar *reverse_count = NULL;
5840 open_capitem capitem;
5841 int capnumber = 0;
5842 int firstbyte, reqbyte;
5843 int branchfirstbyte, branchreqbyte;
5844 int length;
5845 int orig_bracount;
5846 int max_bracount;
5847 int old_external_options = cd->external_options;
5848 branch_chain bc;
5849
5850 bc.outer = bcptr;
5851 bc.current_branch = code;
5852
5853 firstbyte = reqbyte = REQ_UNSET;
5854
5855 /* Accumulate the length for use in the pre-compile phase. Start with the
5856 length of the BRA and KET and any extra bytes that are required at the
5857 beginning. We accumulate in a local variable to save frequent testing of
5858 lengthptr for NULL. We cannot do this by looking at the value of code at the
5859 start and end of each alternative, because compiled items are discarded during
5860 the pre-compile phase so that the work space is not exceeded. */
5861
5862 length = 2 + 2*LINK_SIZE + skipbytes;
5863
5864 /* WARNING: If the above line is changed for any reason, you must also change
5865 the code that abstracts option settings at the start of the pattern and makes
5866 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5867 pre-compile phase to find out whether anything has yet been compiled or not. */
5868
5869 /* If this is a capturing subpattern, add to the chain of open capturing items
5870 so that we can detect them if (*ACCEPT) is encountered. This is also used to
5871 detect groups that contain recursive back references to themselves. */
5872
5873 if (*code == OP_CBRA)
5874 {
5875 capnumber = GET2(code, 1 + LINK_SIZE);
5876 capitem.number = capnumber;
5877 capitem.next = cd->open_caps;
5878 capitem.flag = FALSE;
5879 cd->open_caps = &capitem;
5880 }
5881
5882 /* Offset is set zero to mark that this bracket is still open */
5883
5884 PUT(code, 1, 0);
5885 code += 1 + LINK_SIZE + skipbytes;
5886
5887 /* Loop for each alternative branch */
5888
5889 orig_bracount = max_bracount = cd->bracount;
5890 for (;;)
5891 {
5892 /* For a (?| group, reset the capturing bracket count so that each branch
5893 uses the same numbers. */
5894
5895 if (reset_bracount) cd->bracount = orig_bracount;
5896
5897 /* Handle a change of ims options at the start of the branch */
5898
5899 if ((options & PCRE_IMS) != oldims)
5900 {
5901 *code++ = OP_OPT;
5902 *code++ = options & PCRE_IMS;
5903 length += 2;
5904 }
5905
5906 /* Set up dummy OP_REVERSE if lookbehind assertion */
5907
5908 if (lookbehind)
5909 {
5910 *code++ = OP_REVERSE;
5911 reverse_count = code;
5912 PUTINC(code, 0, 0);
5913 length += 1 + LINK_SIZE;
5914 }
5915
5916 /* Now compile the branch; in the pre-compile phase its length gets added
5917 into the length. */
5918
5919 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5920 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5921 {
5922 *ptrptr = ptr;
5923 return FALSE;
5924 }
5925
5926 /* If the external options have changed during this branch, it means that we
5927 are at the top level, and a leading option setting has been encountered. We
5928 need to re-set the original option values to take account of this so that,
5929 during the pre-compile phase, we know to allow for a re-set at the start of
5930 subsequent branches. */
5931
5932 if (old_external_options != cd->external_options)
5933 oldims = cd->external_options & PCRE_IMS;
5934
5935 /* Keep the highest bracket count in case (?| was used and some branch
5936 has fewer than the rest. */
5937
5938 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5939
5940 /* In the real compile phase, there is some post-processing to be done. */
5941
5942 if (lengthptr == NULL)
5943 {
5944 /* If this is the first branch, the firstbyte and reqbyte values for the
5945 branch become the values for the regex. */
5946
5947 if (*last_branch != OP_ALT)
5948 {
5949 firstbyte = branchfirstbyte;
5950 reqbyte = branchreqbyte;
5951 }
5952
5953 /* If this is not the first branch, the first char and reqbyte have to
5954 match the values from all the previous branches, except that if the
5955 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5956 and we set REQ_VARY for the regex. */
5957
5958 else
5959 {
5960 /* If we previously had a firstbyte, but it doesn't match the new branch,
5961 we have to abandon the firstbyte for the regex, but if there was
5962 previously no reqbyte, it takes on the value of the old firstbyte. */
5963
5964 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5965 {
5966 if (reqbyte < 0) reqbyte = firstbyte;
5967 firstbyte = REQ_NONE;
5968 }
5969
5970 /* If we (now or from before) have no firstbyte, a firstbyte from the
5971 branch becomes a reqbyte if there isn't a branch reqbyte. */
5972
5973 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5974 branchreqbyte = branchfirstbyte;
5975
5976 /* Now ensure that the reqbytes match */
5977
5978 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5979 reqbyte = REQ_NONE;
5980 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5981 }
5982
5983 /* If lookbehind, check that this branch matches a fixed-length string, and
5984 put the length into the OP_REVERSE item. Temporarily mark the end of the
5985 branch with OP_END. If the branch contains OP_RECURSE, the result is -3
5986 because there may be forward references that we can't check here. Set a
5987 flag to cause another lookbehind check at the end. Why not do it all at the
5988 end? Because common, erroneous checks are picked up here and the offset of
5989 the problem can be shown. */
5990
5991 if (lookbehind)
5992 {
5993 int fixed_length;
5994 *code = OP_END;
5995 fixed_length = find_fixedlength(last_branch, options, FALSE, cd);
5996 DPRINTF(("fixed length = %d\n", fixed_length));
5997 if (fixed_length == -3)
5998 {
5999 cd->check_lookbehind = TRUE;
6000 }
6001 else if (fixed_length < 0)
6002 {
6003 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
6004 *ptrptr = ptr;
6005 return FALSE;
6006 }
6007 else { PUT(reverse_count, 0, fixed_length); }
6008 }
6009 }
6010
6011 /* Reached end of expression, either ')' or end of pattern. In the real
6012 compile phase, go back through the alternative branches and reverse the chain
6013 of offsets, with the field in the BRA item now becoming an offset to the
6014 first alternative. If there are no alternatives, it points to the end of the
6015 group. The length in the terminating ket is always the length of the whole
6016 bracketed item. If any of the ims options were changed inside the group,
6017 compile a resetting op-code following, except at the very end of the pattern.
6018 Return leaving the pointer at the terminating char. */
6019
6020 if (*ptr != CHAR_VERTICAL_LINE)
6021 {
6022 if (lengthptr == NULL)
6023 {
6024 int branch_length = code - last_branch;
6025 do
6026 {
6027 int prev_length = GET(last_branch, 1);
6028 PUT(last_branch, 1, branch_length);
6029 branch_length = prev_length;
6030 last_branch -= branch_length;
6031 }
6032 while (branch_length > 0);
6033 }
6034
6035 /* Fill in the ket */
6036
6037 *code = OP_KET;
6038 PUT(code, 1, code - start_bracket);
6039 code += 1 + LINK_SIZE;
6040
6041 /* If it was a capturing subpattern, check to see if it contained any
6042 recursive back references. If so, we must wrap it in atomic brackets.
6043 In any event, remove the block from the chain. */
6044
6045 if (capnumber > 0)
6046 {
6047 if (cd->open_caps->flag)
6048 {
6049 memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
6050 code - start_bracket);
6051 *start_bracket = OP_ONCE;
6052 code += 1 + LINK_SIZE;
6053 PUT(start_bracket, 1, code - start_bracket);
6054 *code = OP_KET;
6055 PUT(code, 1, code - start_bracket);
6056 code += 1 + LINK_SIZE;
6057 length += 2 + 2*LINK_SIZE;
6058 }
6059 cd->open_caps = cd->open_caps->next;
6060 }
6061
6062 /* Reset options if needed. */
6063
6064 if ((options & PCRE_IMS) != oldims && *ptr == CHAR_RIGHT_PARENTHESIS)
6065 {
6066 *code++ = OP_OPT;
6067 *code++ = oldims;
6068 length += 2;
6069 }
6070
6071 /* Retain the highest bracket number, in case resetting was used. */
6072
6073 cd->bracount = max_bracount;
6074
6075 /* Set values to pass back */
6076
6077 *codeptr = code;
6078 *ptrptr = ptr;
6079 *firstbyteptr = firstbyte;
6080 *reqbyteptr = reqbyte;
6081 if (lengthptr != NULL)
6082 {
6083 if (OFLOW_MAX - *lengthptr < length)
6084 {
6085 *errorcodeptr = ERR20;
6086 return FALSE;
6087 }
6088 *lengthptr += length;
6089 }
6090 return TRUE;
6091 }
6092
6093 /* Another branch follows. In the pre-compile phase, we can move the code
6094 pointer back to where it was for the start of the first branch. (That is,
6095 pretend that each branch is the only one.)
6096
6097 In the real compile phase, insert an ALT node. Its length field points back
6098 to the previous branch while the bracket remains open. At the end the chain
6099 is reversed. It's done like this so that the start of the bracket has a
6100 zero offset until it is closed, making it possible to detect recursion. */
6101
6102 if (lengthptr != NULL)
6103 {
6104 code = *codeptr + 1 + LINK_SIZE + skipbytes;
6105 length += 1 + LINK_SIZE;
6106 }
6107 else
6108 {
6109 *code = OP_ALT;
6110 PUT(code, 1, code - last_branch);
6111 bc.current_branch = last_branch = code;
6112 code += 1 + LINK_SIZE;
6113 }
6114
6115 ptr++;
6116 }
6117 /* Control never reaches here */
6118 }
6119
6120
6121
6122
6123 /*************************************************
6124 * Check for anchored expression *
6125 *************************************************/
6126
6127 /* Try to find out if this is an anchored regular expression. Consider each
6128 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
6129 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
6130 it's anchored. However, if this is a multiline pattern, then only OP_SOD
6131 counts, since OP_CIRC can match in the middle.
6132
6133 We can also consider a regex to be anchored if OP_SOM starts all its branches.
6134 This is the code for \G, which means "match at start of match position, taking
6135 into account the match offset".
6136
6137 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
6138 because that will try the rest of the pattern at all possible matching points,
6139 so there is no point trying again.... er ....
6140
6141 .... except when the .* appears inside capturing parentheses, and there is a
6142 subsequent back reference to those parentheses. We haven't enough information
6143 to catch that case precisely.
6144
6145 At first, the best we could do was to detect when .* was in capturing brackets
6146 and the highest back reference was greater than or equal to that level.
6147 However, by keeping a bitmap of the first 31 back references, we can catch some
6148 of the more common cases more precisely.
6149
6150 Arguments:
6151 code points to start of expression (the bracket)
6152 options points to the options setting
6153 bracket_map a bitmap of which brackets we are inside while testing; this
6154 handles up to substring 31; after that we just have to take
6155 the less precise approach
6156 backref_map the back reference bitmap
6157
6158 Returns: TRUE or FALSE
6159 */
6160
6161 static BOOL
6162 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
6163 unsigned int backref_map)
6164 {
6165 do {
6166 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6167 options, PCRE_MULTILINE, FALSE);
6168 register int op = *scode;
6169
6170 /* Non-capturing brackets */
6171
6172 if (op == OP_BRA)
6173 {
6174 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
6175 }
6176
6177 /* Capturing brackets */
6178
6179 else if (op == OP_CBRA)
6180 {
6181 int n = GET2(scode, 1+LINK_SIZE);
6182 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
6183 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
6184 }
6185
6186 /* Other brackets */
6187
6188 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
6189 {
6190 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
6191 }
6192
6193 /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
6194 it isn't in brackets that are or may be referenced. */
6195
6196 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
6197 op == OP_TYPEPOSSTAR))
6198 {
6199 if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
6200 return FALSE;
6201 }
6202
6203 /* Check for explicit anchoring */
6204
6205 else if (op != OP_SOD && op != OP_SOM &&
6206 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
6207 return FALSE;
6208 code += GET(code, 1);
6209 }
6210 while (*code == OP_ALT); /* Loop for each alternative */
6211 return TRUE;
6212 }
6213
6214
6215
6216 /*************************************************
6217 * Check for starting with ^ or .* *
6218 *************************************************/
6219
6220 /* This is called to find out if every branch starts with ^ or .* so that
6221 "first char" processing can be done to speed things up in multiline
6222 matching and for non-DOTALL patterns that start with .* (which must start at
6223 the beginning or after \n). As in the case of is_anchored() (see above), we
6224 have to take account of back references to capturing brackets that contain .*
6225 because in that case we can't make the assumption.
6226
6227 Arguments:
6228 code points to start of expression (the bracket)
6229 bracket_map a bitmap of which brackets we are inside while testing; this
6230 handles up to substring 31; after that we just have to take
6231 the less precise approach
6232 backref_map the back reference bitmap
6233
6234 Returns: TRUE or FALSE
6235 */
6236
6237 static BOOL
6238 is_startline(const uschar *code, unsigned int bracket_map,
6239 unsigned int backref_map)
6240 {
6241 do {
6242 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6243 NULL, 0, FALSE);
6244 register int op = *scode;
6245
6246 /* If we are at the start of a conditional assertion group, *both* the
6247 conditional assertion *and* what follows the condition must satisfy the test
6248 for start of line. Other kinds of condition fail. Note that there may be an
6249 auto-callout at the start of a condition. */
6250
6251 if (op == OP_COND)
6252 {
6253 scode += 1 + LINK_SIZE;
6254 if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT];
6255 switch (*scode)
6256 {
6257 case OP_CREF:
6258 case OP_NCREF:
6259 case OP_RREF:
6260 case OP_NRREF:
6261 case OP_DEF:
6262 return FALSE;
6263
6264 default: /* Assertion */
6265 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6266 do scode += GET(scode, 1); while (*scode == OP_ALT);
6267 scode += 1 + LINK_SIZE;
6268 break;
6269 }
6270 scode = first_significant_code(scode, NULL, 0, FALSE);
6271 op = *scode;
6272 }
6273
6274 /* Non-capturing brackets */
6275
6276 if (op == OP_BRA)
6277 {
6278 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6279 }
6280
6281 /* Capturing brackets */
6282
6283 else if (op == OP_CBRA)
6284 {
6285 int n = GET2(scode, 1+LINK_SIZE);
6286 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
6287 if (!is_startline(scode, new_map, backref_map)) return FALSE;
6288 }
6289
6290 /* Other brackets */
6291
6292 else if (op == OP_ASSERT || op == OP_ONCE)
6293 {
6294 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6295 }
6296
6297 /* .* means "start at start or after \n" if it isn't in brackets that
6298 may be referenced. */
6299
6300 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
6301 {
6302 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
6303 }
6304
6305 /* Check for explicit circumflex */
6306
6307 else if (op != OP_CIRC) return FALSE;
6308
6309 /* Move on to the next alternative */
6310
6311 code += GET(code, 1);
6312 }
6313 while (*code == OP_ALT); /* Loop for each alternative */
6314 return TRUE;
6315 }
6316
6317
6318
6319 /*************************************************
6320 * Check for asserted fixed first char *
6321 *************************************************/
6322
6323 /* During compilation, the "first char" settings from forward assertions are
6324 discarded, because they can cause conflicts with actual literals that follow.
6325 However, if we end up without a first char setting for an unanchored pattern,
6326 it is worth scanning the regex to see if there is an initial asserted first
6327 char. If all branches start with the same asserted char, or with a bracket all
6328 of whose alternatives start with the same asserted char (recurse ad lib), then
6329 we return that char, otherwise -1.
6330
6331 Arguments:
6332 code points to start of expression (the bracket)
6333 options pointer to the options (used to check casing changes)
6334 inassert TRUE if in an assertion
6335
6336 Returns: -1 or the fixed first char
6337 */
6338
6339 static int
6340 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
6341 {
6342 register int c = -1;
6343 do {
6344 int d;
6345 const uschar *scode =
6346 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
6347 register int op = *scode;
6348
6349 switch(op)
6350 {
6351 default:
6352 return -1;
6353
6354 case OP_BRA:
6355 case OP_CBRA:
6356 case OP_ASSERT:
6357 case OP_ONCE:
6358 case OP_COND:
6359 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
6360 return -1;
6361 if (c < 0) c = d; else if (c != d) return -1;
6362 break;
6363
6364 case OP_EXACT: /* Fall through */
6365 scode += 2;
6366
6367 case OP_CHAR:
6368 case OP_CHARNC:
6369 case OP_PLUS:
6370 case OP_MINPLUS:
6371 case OP_POSPLUS:
6372 if (!inassert) return -1;
6373 if (c < 0)
6374 {
6375 c = scode[1];
6376 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
6377 }
6378 else if (c != scode[1]) return -1;
6379 break;
6380 }
6381
6382 code += GET(code, 1);
6383 }
6384 while (*code == OP_ALT);
6385 return c;
6386 }
6387
6388
6389
6390 /*************************************************
6391 * Compile a Regular Expression *
6392 *************************************************/
6393
6394 /* This function takes a string and returns a pointer to a block of store
6395 holding a compiled version of the expression. The original API for this
6396 function had no error code return variable; it is retained for backwards
6397 compatibility. The new function is given a new name.
6398
6399 Arguments:
6400 pattern the regular expression
6401 options various option bits
6402 errorcodeptr pointer to error code variable (pcre_compile2() only)
6403 can be NULL if you don't want a code value
6404 errorptr pointer to pointer to error text
6405 erroroffset ptr offset in pattern where error was detected
6406 tables pointer to character tables or NULL
6407
6408 Returns: pointer to compiled data block, or NULL on error,
6409 with errorptr and erroroffset set
6410 */
6411
6412 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6413 pcre_compile(const char *pattern, int options, const char **errorptr,
6414 int *erroroffset, const unsigned char *tables)
6415 {
6416 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
6417 }
6418
6419
6420 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6421 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
6422 const char **errorptr, int *erroroffset, const unsigned char *tables)
6423 {
6424 real_pcre *re;
6425 int length = 1; /* For final END opcode */
6426 int firstbyte, reqbyte, newline;
6427 int errorcode = 0;
6428 int skipatstart = 0;
6429 BOOL utf8 = (options & PCRE_UTF8) != 0;
6430 size_t size;
6431 uschar *code;
6432 const uschar *codestart;
6433 const uschar *ptr;
6434 compile_data compile_block;
6435 compile_data *cd = &compile_block;
6436
6437 /* This space is used for "compiling" into during the first phase, when we are
6438 computing the amount of memory that is needed. Compiled items are thrown away
6439 as soon as possible, so that a fairly large buffer should be sufficient for
6440 this purpose. The same space is used in the second phase for remembering where
6441 to fill in forward references to subpatterns. */
6442
6443 uschar cworkspace[COMPILE_WORK_SIZE];
6444
6445 /* Set this early so that early errors get offset 0. */
6446
6447 ptr = (const uschar *)pattern;
6448
6449 /* We can't pass back an error message if errorptr is NULL; I guess the best we
6450 can do is just return NULL, but we can set a code value if there is a code
6451 pointer. */
6452
6453 if (errorptr == NULL)
6454 {
6455 if (errorcodeptr != NULL) *errorcodeptr = 99;
6456 return NULL;
6457 }
6458
6459 *errorptr = NULL;
6460 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
6461
6462 /* However, we can give a message for this error */
6463
6464 if (erroroffset == NULL)
6465 {
6466 errorcode = ERR16;
6467 goto PCRE_EARLY_ERROR_RETURN2;
6468 }
6469
6470 *erroroffset = 0;
6471
6472 /* Set up pointers to the individual character tables */
6473
6474 if (tables == NULL) tables = _pcre_default_tables;
6475 cd->lcc = tables + lcc_offset;
6476 cd->fcc = tables + fcc_offset;
6477 cd->cbits = tables + cbits_offset;
6478 cd->ctypes = tables + ctypes_offset;
6479
6480 /* Check that all undefined public option bits are zero */
6481
6482 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
6483 {
6484 errorcode = ERR17;
6485 goto PCRE_EARLY_ERROR_RETURN;
6486 }
6487
6488 /* Check for global one-time settings at the start of the pattern, and remember
6489 the offset for later. */
6490
6491 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
6492 ptr[skipatstart+1] == CHAR_ASTERISK)
6493 {
6494 int newnl = 0;
6495 int newbsr = 0;
6496
6497 if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)
6498 { skipatstart += 7; options |= PCRE_UTF8; continue; }
6499
6500 if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
6501 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
6502 else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3) == 0)
6503 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
6504 else if (strncmp((char *)(ptr+skipatstart+2), STRING_CRLF_RIGHTPAR, 5) == 0)
6505 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
6506 else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANY_RIGHTPAR, 4) == 0)
6507 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
6508 else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANYCRLF_RIGHTPAR, 8) == 0)
6509 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
6510
6511 else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
6512 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
6513 else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
6514 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
6515
6516 if (newnl != 0)
6517 options = (options & ~PCRE_NEWLINE_BITS) | newnl;
6518 else if (newbsr != 0)
6519 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
6520 else break;
6521 }
6522
6523 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
6524
6525 #ifdef SUPPORT_UTF8
6526 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
6527 (*erroroffset = _pcre_valid_utf8((USPTR)pattern, -1)) >= 0)
6528 {
6529 errorcode = ERR44;
6530 goto PCRE_EARLY_ERROR_RETURN2;
6531 }
6532 #else
6533 if (utf8)
6534 {
6535 errorcode = ERR32;
6536 goto PCRE_EARLY_ERROR_RETURN;
6537 }
6538 #endif
6539
6540 /* Check validity of \R options. */
6541
6542 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6543 {
6544 case 0:
6545 case PCRE_BSR_ANYCRLF:
6546 case PCRE_BSR_UNICODE:
6547 break;
6548 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6549 }
6550
6551 /* Handle different types of newline. The three bits give seven cases. The
6552 current code allows for fixed one- or two-byte sequences, plus "any" and
6553 "anycrlf". */
6554
6555 switch (options & PCRE_NEWLINE_BITS)
6556 {
6557 case 0: newline = NEWLINE; break; /* Build-time default */
6558 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6559 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6560 case PCRE_NEWLINE_CR+
6561 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6562 case PCRE_NEWLINE_ANY: newline = -1; break;
6563 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6564 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6565 }
6566
6567 if (newline == -2)
6568 {
6569 cd->nltype = NLTYPE_ANYCRLF;
6570 }
6571 else if (newline < 0)
6572 {
6573 cd->nltype = NLTYPE_ANY;
6574 }
6575 else
6576 {
6577 cd->nltype = NLTYPE_FIXED;
6578 if (newline > 255)
6579 {
6580 cd->nllen = 2;
6581 cd->nl[0] = (newline >> 8) & 255;
6582 cd->nl[1] = newline & 255;
6583 }
6584 else
6585 {
6586 cd->nllen = 1;
6587 cd->nl[0] = newline;
6588 }
6589 }
6590
6591 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
6592 references to help in deciding whether (.*) can be treated as anchored or not.
6593 */
6594
6595 cd->top_backref = 0;
6596 cd->backref_map = 0;
6597
6598 /* Reflect pattern for debugging output */
6599
6600 DPRINTF(("------------------------------------------------------------------\n"));
6601 DPRINTF(("%s\n", pattern));
6602
6603 /* Pretend to compile the pattern while actually just accumulating the length
6604 of memory required. This behaviour is triggered by passing a non-NULL final
6605 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
6606 to compile parts of the pattern into; the compiled code is discarded when it is
6607 no longer needed, so hopefully this workspace will never overflow, though there
6608 is a test for its doing so. */
6609
6610 cd->bracount = cd->final_bracount = 0;
6611 cd->names_found = 0;
6612 cd->name_entry_size = 0;
6613 cd->name_table = NULL;
6614 cd->start_workspace = cworkspace;
6615 cd->start_code = cworkspace;
6616 cd->hwm = cworkspace;
6617 cd->start_pattern = (const uschar *)pattern;
6618 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
6619 cd->req_varyopt = 0;
6620 cd->external_options = options;
6621 cd->external_flags = 0;
6622 cd->open_caps = NULL;
6623
6624 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
6625 don't need to look at the result of the function here. The initial options have
6626 been put into the cd block so that they can be changed if an option setting is
6627 found within the regex right at the beginning. Bringing initial option settings
6628 outside can help speed up starting point checks. */
6629
6630 ptr += skipatstart;
6631 code = cworkspace;
6632 *code = OP_BRA;
6633 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
6634 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
6635 &length);
6636 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
6637
6638 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
6639 cd->hwm - cworkspace));
6640
6641 if (length > MAX_PATTERN_SIZE)
6642 {
6643 errorcode = ERR20;
6644 goto PCRE_EARLY_ERROR_RETURN;
6645 }
6646
6647 /* Compute the size of data block needed and get it, either from malloc or
6648 externally provided function. Integer overflow should no longer be possible
6649 because nowadays we limit the maximum value of cd->names_found and
6650 cd->name_entry_size. */
6651
6652 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
6653 re = (real_pcre *)(pcre_malloc)(size);
6654
6655 if (re == NULL)
6656 {
6657 errorcode = ERR21;
6658 goto PCRE_EARLY_ERROR_RETURN;
6659 }
6660
6661 /* Put in the magic number, and save the sizes, initial options, internal
6662 flags, and character table pointer. NULL is used for the default character
6663 tables. The nullpad field is at the end; it's there to help in the case when a
6664 regex compiled on a system with 4-byte pointers is run on another with 8-byte
6665 pointers. */
6666
6667 re->magic_number = MAGIC_NUMBER;
6668 re->size = size;
6669