/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 457 - (show annotations)
Sat Oct 3 16:24:08 2009 UTC (5 years, 8 months ago) by ph10
File MIME type: text/plain
File size: 219485 byte(s)
Error occurred while calculating annotation data.
Allow duplicate names for same-numbered groups; forbid different names.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2009 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57 used by pcretest. DEBUG is not defined when building a production library. */
58
59 #ifdef DEBUG
60 #include "pcre_printint.src"
61 #endif
62
63
/* Macro for setting individual bits in class bitmaps. "a" is the bitmap (an
array of bytes) and "b" is the bit number. Both arguments and the whole
expansion are parenthesized so that expression arguments such as SETBIT(map,
base + n) index and shift by the complete value of "b". Note that "b" is
evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67
68 /* Maximum length value to check against when making sure that the integer that
69 holds the compiled pattern length does not overflow. We make it a bit less than
70 INT_MAX to allow for adding in group terminating bytes, so that we don't have
71 to check them every time. */
72
73 #define OFLOW_MAX (INT_MAX - 20)
74
75
76 /*************************************************
77 * Code parameters and static tables *
78 *************************************************/
79
80 /* This value specifies the size of stack workspace that is used during the
81 first pre-compile phase that determines how much memory is required. The regex
82 is partly compiled into this space, but the compiled parts are discarded as
83 soon as they can be, so that hopefully there will never be an overrun. The code
84 does, however, check for an overrun. The largest amount I've seen used is 218,
85 so this number is very generous.
86
87 The same workspace is used during the second, actual compile phase for
88 remembering forward references to groups so that they can be filled in at the
89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90 is 4 there is plenty of room. */
91
92 #define COMPILE_WORK_SIZE (4096)
93
94
95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96 are simple data values; negative values are for special things like \d and so
97 on. Zero means further processing is needed (for things like \x), or the escape
98 is invalid. */
99
100 #ifndef EBCDIC
101
102 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
103 in UTF-8 mode. */
104
105 static const short int escapes[] = {
106 0, 0,
107 0, 0,
108 0, 0,
109 0, 0,
110 0, 0,
111 CHAR_COLON, CHAR_SEMICOLON,
112 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
113 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
114 CHAR_COMMERCIAL_AT, -ESC_A,
115 -ESC_B, -ESC_C,
116 -ESC_D, -ESC_E,
117 0, -ESC_G,
118 -ESC_H, 0,
119 0, -ESC_K,
120 0, 0,
121 0, 0,
122 -ESC_P, -ESC_Q,
123 -ESC_R, -ESC_S,
124 0, 0,
125 -ESC_V, -ESC_W,
126 -ESC_X, 0,
127 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
128 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
129 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
130 CHAR_GRAVE_ACCENT, 7,
131 -ESC_b, 0,
132 -ESC_d, ESC_e,
133 ESC_f, 0,
134 -ESC_h, 0,
135 0, -ESC_k,
136 0, 0,
137 ESC_n, 0,
138 -ESC_p, 0,
139 ESC_r, -ESC_s,
140 ESC_tee, 0,
141 -ESC_v, -ESC_w,
142 0, 0,
143 -ESC_z
144 };
145
146 #else
147
148 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
149
150 static const short int escapes[] = {
151 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
152 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
153 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
154 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
155 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
156 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
157 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
158 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
159 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
160 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
161 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
162 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
163 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
164 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
165 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
166 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
167 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
168 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
169 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
170 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
171 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
172 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
173 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
174 };
175 #endif
176
177
178 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
179 searched linearly. Put all the names into a single string, in order to reduce
180 the number of relocations when a shared library is dynamically linked. The
181 string is built from string macros so that it works in UTF-8 mode on EBCDIC
182 platforms. */
183
184 typedef struct verbitem {
185 int len;
186 int op;
187 } verbitem;
188
189 static const char verbnames[] =
190 STRING_ACCEPT0
191 STRING_COMMIT0
192 STRING_F0
193 STRING_FAIL0
194 STRING_PRUNE0
195 STRING_SKIP0
196 STRING_THEN;
197
198 static const verbitem verbs[] = {
199 { 6, OP_ACCEPT },
200 { 6, OP_COMMIT },
201 { 1, OP_FAIL },
202 { 4, OP_FAIL },
203 { 5, OP_PRUNE },
204 { 4, OP_SKIP },
205 { 4, OP_THEN }
206 };
207
208 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
209
210
211 /* Tables of names of POSIX character classes and their lengths. The names are
212 now all in a single string, to reduce the number of relocations when a shared
213 library is dynamically loaded. The list of lengths is terminated by a zero
214 length entry. The first three must be alpha, lower, upper, as this is assumed
215 for handling case independence. */
216
217 static const char posix_names[] =
218 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
219 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
220 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
221 STRING_word0 STRING_xdigit;
222
223 static const uschar posix_name_lengths[] = {
224 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
225
226 /* Table of class bit maps for each POSIX class. Each class is formed from a
227 base map, with an optional addition or removal of another map. Then, for some
228 classes, there is some additional tweaking: for [:blank:] the vertical space
229 characters are removed, and for [:alpha:] and [:alnum:] the underscore
230 character is removed. The triples in the table consist of the base map offset,
231 second map offset or -1 if no second map, and a non-negative value for map
232 addition or a negative value for map subtraction (if there are two maps). The
233 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
234 remove vertical space characters, 2 => remove underscore. */
235
236 static const int posix_class_maps[] = {
237 cbit_word, cbit_digit, -2, /* alpha */
238 cbit_lower, -1, 0, /* lower */
239 cbit_upper, -1, 0, /* upper */
240 cbit_word, -1, 2, /* alnum - word without underscore */
241 cbit_print, cbit_cntrl, 0, /* ascii */
242 cbit_space, -1, 1, /* blank - a GNU extension */
243 cbit_cntrl, -1, 0, /* cntrl */
244 cbit_digit, -1, 0, /* digit */
245 cbit_graph, -1, 0, /* graph */
246 cbit_print, -1, 0, /* print */
247 cbit_punct, -1, 0, /* punct */
248 cbit_space, -1, 0, /* space */
249 cbit_word, -1, 0, /* word - a Perl extension */
250 cbit_xdigit,-1, 0 /* xdigit */
251 };
252
253
254 #define STRING(a) # a
255 #define XSTRING(s) STRING(s)
256
257 /* The texts of compile-time error messages. These are "char *" because they
258 are passed to the outside world. Do not ever re-use any error number, because
259 they are documented. Always add a new error instead. Messages marked DEAD below
260 are no longer used. This used to be a table of strings, but in order to reduce
261 the number of relocations needed when a shared library is loaded dynamically,
262 it is now one long string. We cannot use a table of offsets, because the
263 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
264 simply count through to the one we want - this isn't a performance issue
265 because these strings are used only when there is a compilation error. */
266
267 static const char error_texts[] =
268 "no error\0"
269 "\\ at end of pattern\0"
270 "\\c at end of pattern\0"
271 "unrecognized character follows \\\0"
272 "numbers out of order in {} quantifier\0"
273 /* 5 */
274 "number too big in {} quantifier\0"
275 "missing terminating ] for character class\0"
276 "invalid escape sequence in character class\0"
277 "range out of order in character class\0"
278 "nothing to repeat\0"
279 /* 10 */
280 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
281 "internal error: unexpected repeat\0"
282 "unrecognized character after (? or (?-\0"
283 "POSIX named classes are supported only within a class\0"
284 "missing )\0"
285 /* 15 */
286 "reference to non-existent subpattern\0"
287 "erroffset passed as NULL\0"
288 "unknown option bit(s) set\0"
289 "missing ) after comment\0"
290 "parentheses nested too deeply\0" /** DEAD **/
291 /* 20 */
292 "regular expression is too large\0"
293 "failed to get memory\0"
294 "unmatched parentheses\0"
295 "internal error: code overflow\0"
296 "unrecognized character after (?<\0"
297 /* 25 */
298 "lookbehind assertion is not fixed length\0"
299 "malformed number or name after (?(\0"
300 "conditional group contains more than two branches\0"
301 "assertion expected after (?(\0"
302 "(?R or (?[+-]digits must be followed by )\0"
303 /* 30 */
304 "unknown POSIX class name\0"
305 "POSIX collating elements are not supported\0"
306 "this version of PCRE is not compiled with PCRE_UTF8 support\0"
307 "spare error\0" /** DEAD **/
308 "character value in \\x{...} sequence is too large\0"
309 /* 35 */
310 "invalid condition (?(0)\0"
311 "\\C not allowed in lookbehind assertion\0"
312 "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
313 "number after (?C is > 255\0"
314 "closing ) for (?C expected\0"
315 /* 40 */
316 "recursive call could loop indefinitely\0"
317 "unrecognized character after (?P\0"
318 "syntax error in subpattern name (missing terminator)\0"
319 "two named subpatterns have the same name\0"
320 "invalid UTF-8 string\0"
321 /* 45 */
322 "support for \\P, \\p, and \\X has not been compiled\0"
323 "malformed \\P or \\p sequence\0"
324 "unknown property name after \\P or \\p\0"
325 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
326 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
327 /* 50 */
328 "repeated subpattern is too long\0" /** DEAD **/
329 "octal value is greater than \\377 (not in UTF-8 mode)\0"
330 "internal error: overran compiling workspace\0"
331 "internal error: previously-checked referenced subpattern not found\0"
332 "DEFINE group contains more than one branch\0"
333 /* 55 */
334 "repeating a DEFINE group is not allowed\0"
335 "inconsistent NEWLINE options\0"
336 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
337 "a numbered reference must not be zero\0"
338 "(*VERB) with an argument is not supported\0"
339 /* 60 */
340 "(*VERB) not recognized\0"
341 "number is too big\0"
342 "subpattern name expected\0"
343 "digit expected after (?+\0"
344 "] is an invalid data character in JavaScript compatibility mode\0"
345 /* 65 */
346 "different names for subpatterns of the same number are not allowed";
347
348
349 /* Table to identify digits and hex digits. This is used when compiling
350 patterns. Note that the tables in chartables are dependent on the locale, and
351 may mark arbitrary characters as digits - but the PCRE compiling code expects
352 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
353 a private table here. It costs 256 bytes, but it is a lot faster than doing
354 character value tests (at least in some simple cases I timed), and in some
355 applications one wants PCRE to compile efficiently as well as match
356 efficiently.
357
358 For convenience, we use the same bit definitions as in chartables:
359
360 0x04 decimal digit
361 0x08 hexadecimal digit
362
363 Then we can use ctype_digit and ctype_xdigit in the code. */
364
365 #ifndef EBCDIC
366
367 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
368 UTF-8 mode. */
369
370 static const unsigned char digitab[] =
371 {
372 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
373 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
374 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
375 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
376 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
377 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
378 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
379 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
380 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
381 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
382 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
383 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
384 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
385 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
386 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
387 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
388 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
389 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
390 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
391 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
392 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
393 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
394 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
395 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
396 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
397 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
398 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
399 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
400 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
401 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
402 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
403 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
404
405 #else
406
407 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
408
409 static const unsigned char digitab[] =
410 {
411 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
412 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
413 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
414 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
415 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
416 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
417 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
418 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
419 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
420 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
421 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
422 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
423 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
424 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
425 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
426 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
427 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
428 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
429 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
430 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
431 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
432 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
433 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
434 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
435 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
436 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
437 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
438 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
439 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
440 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
441 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
442 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
443
444 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
445 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
446 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
447 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
448 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
449 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
450 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
451 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
452 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
453 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
454 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
455 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
456 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
457 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
458 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
459 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
460 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
461 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
462 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
463 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
464 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
465 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
466 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
467 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
468 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
469 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
470 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
471 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
472 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
473 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
474 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
475 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
476 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
477 #endif
478
479
480 /* Definition to allow mutual recursion */
481
482 static BOOL
483 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
484 int *, int *, branch_chain *, compile_data *, int *);
485
486
487
488 /*************************************************
489 * Find an error text *
490 *************************************************/
491
492 /* The error texts are now all in one long string, to save on relocations. As
493 some of the text is of unknown length, we can't use a table of offsets.
494 Instead, just count through the strings. This is not a performance issue
495 because it happens only when there has been a compilation error.
496
497 Argument: the error number
498 Returns: pointer to the error string
499 */
500
501 static const char *
502 find_error_text(int n)
503 {
504 const char *s = error_texts;
505 for (; n > 0; n--) while (*s++ != 0) {};
506 return s;
507 }
508
509
510 /*************************************************
511 * Handle escapes *
512 *************************************************/
513
514 /* This function is called when a \ has been encountered. It either returns a
515 positive value for a simple escape such as \n, or a negative value which
516 encodes one of the more complicated things such as \d. A backreference to group
517 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
518 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
519 ptr is pointing at the \. On exit, it is on the final character of the escape
520 sequence.
521
522 Arguments:
523 ptrptr points to the pattern position pointer
524 errorcodeptr points to the errorcode variable
525 bracount number of previous extracting brackets
526 options the options bits
527 isclass TRUE if inside a character class
528
529 Returns: zero or positive => a data character
530 negative => a special escape sequence
531 on error, errorcodeptr is set
532 */
533
534 static int
535 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
536 int options, BOOL isclass)
537 {
538 BOOL utf8 = (options & PCRE_UTF8) != 0;
539 const uschar *ptr = *ptrptr + 1;
540 int c, i;
541
542 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
543 ptr--; /* Set pointer back to the last byte */
544
545 /* If backslash is at the end of the pattern, it's an error. */
546
547 if (c == 0) *errorcodeptr = ERR1;
548
549 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
550 in a table. A non-zero result is something that can be returned immediately.
551 Otherwise further processing may be required. */
552
553 #ifndef EBCDIC /* ASCII/UTF-8 coding */
554 else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
555 else if ((i = escapes[c - CHAR_0]) != 0) c = i;
556
557 #else /* EBCDIC coding */
558 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
559 else if ((i = escapes[c - 0x48]) != 0) c = i;
560 #endif
561
562 /* Escapes that need further processing, or are illegal. */
563
564 else
565 {
566 const uschar *oldptr;
567 BOOL braced, negated;
568
569 switch (c)
570 {
571 /* A number of Perl escapes are not handled by PCRE. We give an explicit
572 error. */
573
574 case CHAR_l:
575 case CHAR_L:
576 case CHAR_N:
577 case CHAR_u:
578 case CHAR_U:
579 *errorcodeptr = ERR37;
580 break;
581
582 /* \g must be followed by one of a number of specific things:
583
584 (1) A number, either plain or braced. If positive, it is an absolute
585 backreference. If negative, it is a relative backreference. This is a Perl
586 5.10 feature.
587
588 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
589 is part of Perl's movement towards a unified syntax for back references. As
590 this is synonymous with \k{name}, we fudge it up by pretending it really
591 was \k.
592
593 (3) For Oniguruma compatibility we also support \g followed by a name or a
594 number either in angle brackets or in single quotes. However, these are
595 (possibly recursive) subroutine calls, _not_ backreferences. Just return
596 the -ESC_g code (cf \k). */
597
598 case CHAR_g:
599 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
600 {
601 c = -ESC_g;
602 break;
603 }
604
605 /* Handle the Perl-compatible cases */
606
607 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
608 {
609 const uschar *p;
610 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
611 if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
612 if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
613 {
614 c = -ESC_k;
615 break;
616 }
617 braced = TRUE;
618 ptr++;
619 }
620 else braced = FALSE;
621
622 if (ptr[1] == CHAR_MINUS)
623 {
624 negated = TRUE;
625 ptr++;
626 }
627 else negated = FALSE;
628
629 c = 0;
630 while ((digitab[ptr[1]] & ctype_digit) != 0)
631 c = c * 10 + *(++ptr) - CHAR_0;
632
633 if (c < 0) /* Integer overflow */
634 {
635 *errorcodeptr = ERR61;
636 break;
637 }
638
639 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
640 {
641 *errorcodeptr = ERR57;
642 break;
643 }
644
645 if (c == 0)
646 {
647 *errorcodeptr = ERR58;
648 break;
649 }
650
651 if (negated)
652 {
653 if (c > bracount)
654 {
655 *errorcodeptr = ERR15;
656 break;
657 }
658 c = bracount - (c - 1);
659 }
660
661 c = -(ESC_REF + c);
662 break;
663
664 /* The handling of escape sequences consisting of a string of digits
665 starting with one that is not zero is not straightforward. By experiment,
666 the way Perl works seems to be as follows:
667
668 Outside a character class, the digits are read as a decimal number. If the
669 number is less than 10, or if there are that many previous extracting
670 left brackets, then it is a back reference. Otherwise, up to three octal
671 digits are read to form an escaped byte. Thus \123 is likely to be octal
672 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
673 value is greater than 377, the least significant 8 bits are taken. Inside a
674 character class, \ followed by a digit is always an octal number. */
675
676 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
677 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
678
679 if (!isclass)
680 {
681 oldptr = ptr;
682 c -= CHAR_0;
683 while ((digitab[ptr[1]] & ctype_digit) != 0)
684 c = c * 10 + *(++ptr) - CHAR_0;
685 if (c < 0) /* Integer overflow */
686 {
687 *errorcodeptr = ERR61;
688 break;
689 }
690 if (c < 10 || c <= bracount)
691 {
692 c = -(ESC_REF + c);
693 break;
694 }
695 ptr = oldptr; /* Put the pointer back and fall through */
696 }
697
698 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
699 generates a binary zero byte and treats the digit as a following literal.
700 Thus we have to pull back the pointer by one. */
701
702 if ((c = *ptr) >= CHAR_8)
703 {
704 ptr--;
705 c = 0;
706 break;
707 }
708
709 /* \0 always starts an octal number, but we may drop through to here with a
710 larger first octal digit. The original code used just to take the least
711 significant 8 bits of octal numbers (I think this is what early Perls used
712 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
713 than 3 octal digits. */
714
715 case CHAR_0:
716 c -= CHAR_0;
717 while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
718 c = c * 8 + *(++ptr) - CHAR_0;
719 if (!utf8 && c > 255) *errorcodeptr = ERR51;
720 break;
721
722 /* \x is complicated. \x{ddd} is a character number which can be greater
723 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
724 treated as a data character. */
725
726 case CHAR_x:
727 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
728 {
729 const uschar *pt = ptr + 2;
730 int count = 0;
731
732 c = 0;
733 while ((digitab[*pt] & ctype_xdigit) != 0)
734 {
735 register int cc = *pt++;
736 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
737 count++;
738
739 #ifndef EBCDIC /* ASCII/UTF-8 coding */
740 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
741 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
742 #else /* EBCDIC coding */
743 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
744 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
745 #endif
746 }
747
748 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
749 {
750 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
751 ptr = pt;
752 break;
753 }
754
755 /* If the sequence of hex digits does not end with '}', then we don't
756 recognize this construct; fall through to the normal \x handling. */
757 }
758
759 /* Read just a single-byte hex-defined char */
760
761 c = 0;
762 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
763 {
764 int cc; /* Some compilers don't like */
765 cc = *(++ptr); /* ++ in initializers */
766 #ifndef EBCDIC /* ASCII/UTF-8 coding */
767 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
768 c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
769 #else /* EBCDIC coding */
770 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
771 c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
772 #endif
773 }
774 break;
775
776 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
777 This coding is ASCII-specific, but then the whole concept of \cx is
778 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
779
780 case CHAR_c:
781 c = *(++ptr);
782 if (c == 0)
783 {
784 *errorcodeptr = ERR2;
785 break;
786 }
787
788 #ifndef EBCDIC /* ASCII/UTF-8 coding */
789 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
790 c ^= 0x40;
791 #else /* EBCDIC coding */
792 if (c >= CHAR_a && c <= CHAR_z) c += 64;
793 c ^= 0xC0;
794 #endif
795 break;
796
797 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
798 other alphanumeric following \ is an error if PCRE_EXTRA was set;
799 otherwise, for Perl compatibility, it is a literal. This code looks a bit
800 odd, but there used to be some cases other than the default, and there may
801 be again in future, so I haven't "optimized" it. */
802
803 default:
804 if ((options & PCRE_EXTRA) != 0) switch(c)
805 {
806 default:
807 *errorcodeptr = ERR3;
808 break;
809 }
810 break;
811 }
812 }
813
814 *ptrptr = ptr;
815 return c;
816 }
817
818
819
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Called once \P or \p has been seen, when PCRE has been built with Unicode
property support. On entry the pattern pointer addresses the P or p; on exit
it addresses the last character of the escape sequence. The property name is
either the single character that follows, or a name enclosed in {}, which may
start with ^ to request negation. The name is looked up in the _pcre_utt table
by binary chop.

Argument:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  dptr           points to an int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         the type field of the matching _pcre_utt entry, or -1 with
                 the error code set (ERR46 for a malformed sequence, ERR47
                 for an unknown name)
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *ptr = *ptrptr;
char name[32];
int ch, len, lo, hi;

ch = *(++ptr);
if (ch == 0) goto ERROR_RETURN;

*negptr = FALSE;

/* The simple form: exactly one character names the property. */

if (ch != CHAR_LEFT_CURLY_BRACKET)
  {
  name[0] = ch;
  name[1] = 0;
  }

/* The braced form: an optional ^ followed by a name that must be terminated
by } before the name buffer is full. */

else
  {
  if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
    {
    *negptr = TRUE;
    ptr++;
    }

  for (len = 0; ; len++)
    {
    if (len == (int)sizeof(name) - 1) goto ERROR_RETURN;   /* Name too long */
    ch = *(++ptr);
    if (ch == 0) goto ERROR_RETURN;                        /* Pattern ended */
    if (ch == CHAR_RIGHT_CURLY_BRACKET) break;
    name[len] = ch;
    }

  name[len] = 0;
  }

/* The sequence is syntactically complete, so the caller's pointer can be
updated now; it is not moved again if the name turns out to be unknown. */

*ptrptr = ptr;

/* Binary chop over the table of property names */

lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(name, _pcre_utt_names + _pcre_utt[mid].name_offset);

  if (cmp > 0) lo = mid + 1;
  else if (cmp < 0) hi = mid;
  else
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }
  }

/* No such property name */

*errorcodeptr = ERR47;
return -1;

/* Malformed \P or \p sequence */

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
909
910
911
912
913 /*************************************************
914 * Check for counted repeat *
915 *************************************************/
916
917 /* This function is called when a '{' is encountered in a place where it might
918 start a quantifier. It looks ahead to see if it really is a quantifier or not.
919 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
920 where the ddds are digits.
921
922 Arguments:
923 p pointer to the first char after '{'
924
925 Returns: TRUE or FALSE
926 */
927
928 static BOOL
929 is_counted_repeat(const uschar *p)
930 {
931 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
932 while ((digitab[*p] & ctype_digit) != 0) p++;
933 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
934
935 if (*p++ != CHAR_COMMA) return FALSE;
936 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
937
938 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
939 while ((digitab[*p] & ctype_digit) != 0) p++;
940
941 return (*p == CHAR_RIGHT_CURLY_BRACKET);
942 }
943
944
945
946 /*************************************************
947 * Read repeat counts *
948 *************************************************/
949
950 /* Read an item of the form {n,m} and return the values. This is called only
951 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
952 so the syntax is guaranteed to be correct, but we need to check the values.
953
954 Arguments:
955 p pointer to first char after '{'
956 minp pointer to int for min
957 maxp pointer to int for max
958 returned as -1 if no max
959 errorcodeptr points to error code variable
960
961 Returns: pointer to '}' on success;
962 current ptr on error, with errorcodeptr set non-zero
963 */
964
965 static const uschar *
966 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
967 {
968 int min = 0;
969 int max = -1;
970
971 /* Read the minimum value and do a paranoid check: a negative value indicates
972 an integer overflow. */
973
974 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
975 if (min < 0 || min > 65535)
976 {
977 *errorcodeptr = ERR5;
978 return p;
979 }
980
981 /* Read the maximum value if there is one, and again do a paranoid on its size.
982 Also, max must not be less than min. */
983
984 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
985 {
986 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
987 {
988 max = 0;
989 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
990 if (max < 0 || max > 65535)
991 {
992 *errorcodeptr = ERR5;
993 return p;
994 }
995 if (max < min)
996 {
997 *errorcodeptr = ERR4;
998 return p;
999 }
1000 }
1001 }
1002
1003 /* Fill in the required variables, and pass back the pointer to the terminating
1004 '}'. */
1005
1006 *minp = min;
1007 *maxp = max;
1008 return p;
1009 }
1010
1011
1012
1013 /*************************************************
1014 * Subroutine for finding forward reference *
1015 *************************************************/
1016
1017 /* This recursive function is called only from find_parens() below. The
1018 top-level call starts at the beginning of the pattern. All other calls must
1019 start at a parenthesis. It scans along a pattern's text looking for capturing
1020 subpatterns, and counting them. If it finds a named pattern that matches the
1021 name it is given, it returns its number. Alternatively, if the name is NULL, it
1022 returns when it reaches a given numbered subpattern. We know that if (?P< is
1023 encountered, the name will be terminated by '>' because that is checked in the
1024 first pass. Recursion is used to keep track of subpatterns that reset the
1025 capturing group numbers - the (?| feature.
1026
1027 Arguments:
1028 ptrptr address of the current character pointer (updated)
1029 cd compile background data
1030 name name to seek, or NULL if seeking a numbered subpattern
1031 lorn name length, or subpattern number if name is NULL
1032 xmode TRUE if we are in /x mode
1033 count pointer to the current capturing subpattern number (updated)
1034
1035 Returns: the number of the named subpattern, or -1 if not found
1036 */
1037
1038 static int
1039 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1040 BOOL xmode, int *count)
1041 {
1042 uschar *ptr = *ptrptr;
1043 int start_count = *count;
1044 int hwm_count = start_count;
1045 BOOL dup_parens = FALSE;
1046
1047 /* If the first character is a parenthesis, check on the type of group we are
1048 dealing with. The very first call may not start with a parenthesis. */
1049
1050 if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1051 {
1052 if (ptr[1] == CHAR_QUESTION_MARK &&
1053 ptr[2] == CHAR_VERTICAL_LINE)
1054 {
1055 ptr += 3;
1056 dup_parens = TRUE;
1057 }
1058
1059 /* Handle a normal, unnamed capturing parenthesis */
1060
1061 else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1062 {
1063 *count += 1;
1064 if (name == NULL && *count == lorn) return *count;
1065 ptr++;
1066 }
1067
1068 /* Handle a condition. If it is an assertion, just carry on so that it
1069 is processed as normal. If not, skip to the closing parenthesis of the
1070 condition (there can't be any nested parens. */
1071
1072 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1073 {
1074 ptr += 2;
1075 if (ptr[1] != CHAR_QUESTION_MARK)
1076 {
1077 while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1078 if (*ptr != 0) ptr++;
1079 }
1080 }
1081
1082 /* We have either (? or (* and not a condition */
1083
1084 else
1085 {
1086 ptr += 2;
1087 if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1088
1089 /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1090
1091 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1092 ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1093 {
1094 int term;
1095 const uschar *thisname;
1096 *count += 1;
1097 if (name == NULL && *count == lorn) return *count;
1098 term = *ptr++;
1099 if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1100 thisname = ptr;
1101 while (*ptr != term) ptr++;
1102 if (name != NULL && lorn == ptr - thisname &&
1103 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1104 return *count;
1105 term++;
1106 }
1107 }
1108 }
1109
1110 /* Past any initial parenthesis handling, scan for parentheses or vertical
1111 bars. */
1112
1113 for (; *ptr != 0; ptr++)
1114 {
1115 /* Skip over backslashed characters and also entire \Q...\E */
1116
1117 if (*ptr == CHAR_BACKSLASH)
1118 {
1119 if (*(++ptr) == 0) goto FAIL_EXIT;
1120 if (*ptr == CHAR_Q) for (;;)
1121 {
1122 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1123 if (*ptr == 0) goto FAIL_EXIT;
1124 if (*(++ptr) == CHAR_E) break;
1125 }
1126 continue;
1127 }
1128
1129 /* Skip over character classes; this logic must be similar to the way they
1130 are handled for real. If the first character is '^', skip it. Also, if the
1131 first few characters (either before or after ^) are \Q\E or \E we skip them
1132 too. This makes for compatibility with Perl. Note the use of STR macros to
1133 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1134
1135 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1136 {
1137 BOOL negate_class = FALSE;
1138 for (;;)
1139 {
1140 if (ptr[1] == CHAR_BACKSLASH)
1141 {
1142 if (ptr[2] == CHAR_E)
1143 ptr+= 2;
1144 else if (strncmp((const char *)ptr+2,
1145 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1146 ptr += 4;
1147 else
1148 break;
1149 }
1150 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1151 {
1152 negate_class = TRUE;
1153 ptr++;
1154 }
1155 else break;
1156 }
1157
1158 /* If the next character is ']', it is a data character that must be
1159 skipped, except in JavaScript compatibility mode. */
1160
1161 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1162 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1163 ptr++;
1164
1165 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1166 {
1167 if (*ptr == 0) return -1;
1168 if (*ptr == CHAR_BACKSLASH)
1169 {
1170 if (*(++ptr) == 0) goto FAIL_EXIT;
1171 if (*ptr == CHAR_Q) for (;;)
1172 {
1173 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1174 if (*ptr == 0) goto FAIL_EXIT;
1175 if (*(++ptr) == CHAR_E) break;
1176 }
1177 continue;
1178 }
1179 }
1180 continue;
1181 }
1182
1183 /* Skip comments in /x mode */
1184
1185 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1186 {
1187 while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1188 if (*ptr == 0) goto FAIL_EXIT;
1189 continue;
1190 }
1191
1192 /* Check for the special metacharacters */
1193
1194 if (*ptr == CHAR_LEFT_PARENTHESIS)
1195 {
1196 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1197 if (rc > 0) return rc;
1198 if (*ptr == 0) goto FAIL_EXIT;
1199 }
1200
1201 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1202 {
1203 if (dup_parens && *count < hwm_count) *count = hwm_count;
1204 *ptrptr = ptr;
1205 return -1;
1206 }
1207
1208 else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1209 {
1210 if (*count > hwm_count) hwm_count = *count;
1211 *count = start_count;
1212 }
1213 }
1214
1215 FAIL_EXIT:
1216 *ptrptr = ptr;
1217 return -1;
1218 }
1219
1220
1221
1222
1223 /*************************************************
1224 * Find forward referenced subpattern *
1225 *************************************************/
1226
1227 /* This function scans along a pattern's text looking for capturing
1228 subpatterns, and counting them. If it finds a named pattern that matches the
1229 name it is given, it returns its number. Alternatively, if the name is NULL, it
1230 returns when it reaches a given numbered subpattern. This is used for forward
1231 references to subpatterns. We used to be able to start this scan from the
1232 current compiling point, using the current count value from cd->bracount, and
1233 do it all in a single loop, but the addition of the possibility of duplicate
1234 subpattern numbers means that we have to scan from the very start, in order to
1235 take account of such duplicates, and to use a recursive function to keep track
1236 of the different types of group.
1237
1238 Arguments:
1239 cd compile background data
1240 name name to seek, or NULL if seeking a numbered subpattern
1241 lorn name length, or subpattern number if name is NULL
1242 xmode TRUE if we are in /x mode
1243
1244 Returns: the number of the found subpattern, or -1 if not found
1245 */
1246
1247 static int
1248 find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1249 {
1250 uschar *ptr = (uschar *)cd->start_pattern;
1251 int count = 0;
1252 int rc;
1253
1254 /* If the pattern does not start with an opening parenthesis, the first call
1255 to find_parens_sub() will scan right to the end (if necessary). However, if it
1256 does start with a parenthesis, find_parens_sub() will return when it hits the
1257 matching closing parens. That is why we have to have a loop. */
1258
1259 for (;;)
1260 {
1261 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1262 if (rc > 0 || *ptr++ == 0) break;
1263 }
1264
1265 return rc;
1266 }
1267
1268
1269
1270
1271 /*************************************************
1272 * Find first significant op code *
1273 *************************************************/
1274
1275 /* This is called by several functions that scan a compiled expression looking
1276 for a fixed first character, or an anchoring op code etc. It skips over things
1277 that do not influence this. For some calls, a change of option is important.
1278 For some calls, it makes sense to skip negative forward and all backward
1279 assertions, and also the \b assertion; for others it does not.
1280
1281 Arguments:
1282 code pointer to the start of the group
1283 options pointer to external options
1284 optbit the option bit whose changing is significant, or
1285 zero if none are
1286 skipassert TRUE if certain assertions are to be skipped
1287
1288 Returns: pointer to the first significant opcode
1289 */
1290
1291 static const uschar*
1292 first_significant_code(const uschar *code, int *options, int optbit,
1293 BOOL skipassert)
1294 {
1295 for (;;)
1296 {
1297 switch ((int)*code)
1298 {
1299 case OP_OPT:
1300 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1301 *options = (int)code[1];
1302 code += 2;
1303 break;
1304
1305 case OP_ASSERT_NOT:
1306 case OP_ASSERTBACK:
1307 case OP_ASSERTBACK_NOT:
1308 if (!skipassert) return code;
1309 do code += GET(code, 1); while (*code == OP_ALT);
1310 code += _pcre_OP_lengths[*code];
1311 break;
1312
1313 case OP_WORD_BOUNDARY:
1314 case OP_NOT_WORD_BOUNDARY:
1315 if (!skipassert) return code;
1316 /* Fall through */
1317
1318 case OP_CALLOUT:
1319 case OP_CREF:
1320 case OP_RREF:
1321 case OP_DEF:
1322 code += _pcre_OP_lengths[*code];
1323 break;
1324
1325 default:
1326 return code;
1327 }
1328 }
1329 /* Control never reaches here */
1330 }
1331
1332
1333
1334
1335 /*************************************************
1336 * Find the fixed length of a branch *
1337 *************************************************/
1338
1339 /* Scan a branch and compute the fixed length of subject that will match it,
1340 if the length is fixed. This is needed for dealing with backward assertions.
1341 In UTF8 mode, the result is in characters rather than bytes. The branch is
1342 temporarily terminated with OP_END when this function is called.
1343
1344 This function is called when a backward assertion is encountered, so that if it
1345 fails, the error message can point to the correct place in the pattern.
1346 However, we cannot do this when the assertion contains subroutine calls,
1347 because they can be forward references. We solve this by remembering this case
1348 and doing the check at the end; a flag specifies which mode we are running in.
1349
1350 Arguments:
1351 code points to the start of the pattern (the bracket)
1352 options the compiling options
1353 atend TRUE if called when the pattern is complete
1354 cd the "compile data" structure
1355
1356 Returns: the fixed length,
1357 or -1 if there is no fixed length,
1358 or -2 if \C was encountered
1359 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1360 */
1361
1362 static int
1363 find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1364 {
1365 int length = -1;
1366
1367 register int branchlength = 0;
1368 register uschar *cc = code + 1 + LINK_SIZE;
1369
1370 /* Scan along the opcodes for this branch. If we get to the end of the
1371 branch, check the length against that of the other branches. */
1372
1373 for (;;)
1374 {
1375 int d;
1376 uschar *ce, *cs;
1377 register int op = *cc;
1378 switch (op)
1379 {
1380 case OP_CBRA:
1381 case OP_BRA:
1382 case OP_ONCE:
1383 case OP_COND:
1384 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1385 if (d < 0) return d;
1386 branchlength += d;
1387 do cc += GET(cc, 1); while (*cc == OP_ALT);
1388 cc += 1 + LINK_SIZE;
1389 break;
1390
1391 /* Reached end of a branch; if it's a ket it is the end of a nested
1392 call. If it's ALT it is an alternation in a nested call. If it is
1393 END it's the end of the outer call. All can be handled by the same code. */
1394
1395 case OP_ALT:
1396 case OP_KET:
1397 case OP_KETRMAX:
1398 case OP_KETRMIN:
1399 case OP_END:
1400 if (length < 0) length = branchlength;
1401 else if (length != branchlength) return -1;
1402 if (*cc != OP_ALT) return length;
1403 cc += 1 + LINK_SIZE;
1404 branchlength = 0;
1405 break;
1406
1407 /* A true recursion implies not fixed length, but a subroutine call may
1408 be OK. If the subroutine is a forward reference, we can't deal with
1409 it until the end of the pattern, so return -3. */
1410
1411 case OP_RECURSE:
1412 if (!atend) return -3;
1413 cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1414 do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1415 if (cc > cs && cc < ce) return -1; /* Recursion */
1416 d = find_fixedlength(cs + 2, options, atend, cd);
1417 if (d < 0) return d;
1418 branchlength += d;
1419 cc += 1 + LINK_SIZE;
1420 break;
1421
1422 /* Skip over assertive subpatterns */
1423
1424 case OP_ASSERT:
1425 case OP_ASSERT_NOT:
1426 case OP_ASSERTBACK:
1427 case OP_ASSERTBACK_NOT:
1428 do cc += GET(cc, 1); while (*cc == OP_ALT);
1429 /* Fall through */
1430
1431 /* Skip over things that don't match chars */
1432
1433 case OP_REVERSE:
1434 case OP_CREF:
1435 case OP_RREF:
1436 case OP_DEF:
1437 case OP_OPT:
1438 case OP_CALLOUT:
1439 case OP_SOD:
1440 case OP_SOM:
1441 case OP_EOD:
1442 case OP_EODN:
1443 case OP_CIRC:
1444 case OP_DOLL:
1445 case OP_NOT_WORD_BOUNDARY:
1446 case OP_WORD_BOUNDARY:
1447 cc += _pcre_OP_lengths[*cc];
1448 break;
1449
1450 /* Handle literal characters */
1451
1452 case OP_CHAR:
1453 case OP_CHARNC:
1454 case OP_NOT:
1455 branchlength++;
1456 cc += 2;
1457 #ifdef SUPPORT_UTF8
1458 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1459 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1460 #endif
1461 break;
1462
1463 /* Handle exact repetitions. The count is already in characters, but we
1464 need to skip over a multibyte character in UTF8 mode. */
1465
1466 case OP_EXACT:
1467 branchlength += GET2(cc,1);
1468 cc += 4;
1469 #ifdef SUPPORT_UTF8
1470 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1471 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1472 #endif
1473 break;
1474
1475 case OP_TYPEEXACT:
1476 branchlength += GET2(cc,1);
1477 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1478 cc += 4;
1479 break;
1480
1481 /* Handle single-char matchers */
1482
1483 case OP_PROP:
1484 case OP_NOTPROP:
1485 cc += 2;
1486 /* Fall through */
1487
1488 case OP_NOT_DIGIT:
1489 case OP_DIGIT:
1490 case OP_NOT_WHITESPACE:
1491 case OP_WHITESPACE:
1492 case OP_NOT_WORDCHAR:
1493 case OP_WORDCHAR:
1494 case OP_ANY:
1495 case OP_ALLANY:
1496 branchlength++;
1497 cc++;
1498 break;
1499
1500 /* The single-byte matcher isn't allowed */
1501
1502 case OP_ANYBYTE:
1503 return -2;
1504
1505 /* Check a class for variable quantification */
1506
1507 #ifdef SUPPORT_UTF8
1508 case OP_XCLASS:
1509 cc += GET(cc, 1) - 33;
1510 /* Fall through */
1511 #endif
1512
1513 case OP_CLASS:
1514 case OP_NCLASS:
1515 cc += 33;
1516
1517 switch (*cc)
1518 {
1519 case OP_CRSTAR:
1520 case OP_CRMINSTAR:
1521 case OP_CRQUERY:
1522 case OP_CRMINQUERY:
1523 return -1;
1524
1525 case OP_CRRANGE:
1526 case OP_CRMINRANGE:
1527 if (GET2(cc,1) != GET2(cc,3)) return -1;
1528 branchlength += GET2(cc,1);
1529 cc += 5;
1530 break;
1531
1532 default:
1533 branchlength++;
1534 }
1535 break;
1536
1537 /* Anything else is variable length */
1538
1539 default:
1540 return -1;
1541 }
1542 }
1543 /* Control never gets here */
1544 }
1545
1546
1547
1548
1549 /*************************************************
1550 * Scan compiled regex for specific bracket *
1551 *************************************************/
1552
1553 /* This little function scans through a compiled pattern until it finds a
1554 capturing bracket with the given number, or, if the number is negative, an
1555 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1556 so that it can be called from pcre_study() when finding the minimum matching
1557 length.
1558
1559 Arguments:
1560 code points to start of expression
1561 utf8 TRUE in UTF-8 mode
1562 number the required bracket number or negative to find a lookbehind
1563
1564 Returns: pointer to the opcode for the bracket, or NULL if not found
1565 */
1566
1567 const uschar *
1568 _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1569 {
1570 for (;;)
1571 {
1572 register int c = *code;
1573 if (c == OP_END) return NULL;
1574
1575 /* XCLASS is used for classes that cannot be represented just by a bit
1576 map. This includes negated single high-valued characters. The length in
1577 the table is zero; the actual length is stored in the compiled code. */
1578
1579 if (c == OP_XCLASS) code += GET(code, 1);
1580
1581 /* Handle recursion */
1582
1583 else if (c == OP_REVERSE)
1584 {
1585 if (number < 0) return (uschar *)code;
1586 code += _pcre_OP_lengths[c];
1587 }
1588
1589 /* Handle capturing bracket */
1590
1591 else if (c == OP_CBRA)
1592 {
1593 int n = GET2(code, 1+LINK_SIZE);
1594 if (n == number) return (uschar *)code;
1595 code += _pcre_OP_lengths[c];
1596 }
1597
1598 /* Otherwise, we can get the item's length from the table, except that for
1599 repeated character types, we have to test for \p and \P, which have an extra
1600 two bytes of parameters. */
1601
1602 else
1603 {
1604 switch(c)
1605 {
1606 case OP_TYPESTAR:
1607 case OP_TYPEMINSTAR:
1608 case OP_TYPEPLUS:
1609 case OP_TYPEMINPLUS:
1610 case OP_TYPEQUERY:
1611 case OP_TYPEMINQUERY:
1612 case OP_TYPEPOSSTAR:
1613 case OP_TYPEPOSPLUS:
1614 case OP_TYPEPOSQUERY:
1615 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1616 break;
1617
1618 case OP_TYPEUPTO:
1619 case OP_TYPEMINUPTO:
1620 case OP_TYPEEXACT:
1621 case OP_TYPEPOSUPTO:
1622 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1623 break;
1624 }
1625
1626 /* Add in the fixed length from the table */
1627
1628 code += _pcre_OP_lengths[c];
1629
1630 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1631 a multi-byte character. The length in the table is a minimum, so we have to
1632 arrange to skip the extra bytes. */
1633
1634 #ifdef SUPPORT_UTF8
1635 if (utf8) switch(c)
1636 {
1637 case OP_CHAR:
1638 case OP_CHARNC:
1639 case OP_EXACT:
1640 case OP_UPTO:
1641 case OP_MINUPTO:
1642 case OP_POSUPTO:
1643 case OP_STAR:
1644 case OP_MINSTAR:
1645 case OP_POSSTAR:
1646 case OP_PLUS:
1647 case OP_MINPLUS:
1648 case OP_POSPLUS:
1649 case OP_QUERY:
1650 case OP_MINQUERY:
1651 case OP_POSQUERY:
1652 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1653 break;
1654 }
1655 #else
1656 (void)(utf8); /* Keep compiler happy by referencing function argument */
1657 #endif
1658 }
1659 }
1660 }
1661
1662
1663
1664 /*************************************************
1665 * Scan compiled regex for recursion reference *
1666 *************************************************/
1667
1668 /* This little function scans through a compiled pattern until it finds an
1669 instance of OP_RECURSE.
1670
1671 Arguments:
1672 code points to start of expression
1673 utf8 TRUE in UTF-8 mode
1674
1675 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1676 */
1677
1678 static const uschar *
1679 find_recurse(const uschar *code, BOOL utf8)
1680 {
1681 for (;;)
1682 {
1683 register int c = *code;
1684 if (c == OP_END) return NULL;
1685 if (c == OP_RECURSE) return code;
1686
1687 /* XCLASS is used for classes that cannot be represented just by a bit
1688 map. This includes negated single high-valued characters. The length in
1689 the table is zero; the actual length is stored in the compiled code. */
1690
1691 if (c == OP_XCLASS) code += GET(code, 1);
1692
1693 /* Otherwise, we can get the item's length from the table, except that for
1694 repeated character types, we have to test for \p and \P, which have an extra
1695 two bytes of parameters. */
1696
1697 else
1698 {
1699 switch(c)
1700 {
1701 case OP_TYPESTAR:
1702 case OP_TYPEMINSTAR:
1703 case OP_TYPEPLUS:
1704 case OP_TYPEMINPLUS:
1705 case OP_TYPEQUERY:
1706 case OP_TYPEMINQUERY:
1707 case OP_TYPEPOSSTAR:
1708 case OP_TYPEPOSPLUS:
1709 case OP_TYPEPOSQUERY:
1710 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1711 break;
1712
1713 case OP_TYPEPOSUPTO:
1714 case OP_TYPEUPTO:
1715 case OP_TYPEMINUPTO:
1716 case OP_TYPEEXACT:
1717 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1718 break;
1719 }
1720
1721 /* Add in the fixed length from the table */
1722
1723 code += _pcre_OP_lengths[c];
1724
1725 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1726 by a multi-byte character. The length in the table is a minimum, so we have
1727 to arrange to skip the extra bytes. */
1728
1729 #ifdef SUPPORT_UTF8
1730 if (utf8) switch(c)
1731 {
1732 case OP_CHAR:
1733 case OP_CHARNC:
1734 case OP_EXACT:
1735 case OP_UPTO:
1736 case OP_MINUPTO:
1737 case OP_POSUPTO:
1738 case OP_STAR:
1739 case OP_MINSTAR:
1740 case OP_POSSTAR:
1741 case OP_PLUS:
1742 case OP_MINPLUS:
1743 case OP_POSPLUS:
1744 case OP_QUERY:
1745 case OP_MINQUERY:
1746 case OP_POSQUERY:
1747 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1748 break;
1749 }
1750 #else
1751 (void)(utf8); /* Keep compiler happy by referencing function argument */
1752 #endif
1753 }
1754 }
1755 }
1756
1757
1758
1759 /*************************************************
1760 * Scan compiled branch for non-emptiness *
1761 *************************************************/
1762
1763 /* This function scans through a branch of a compiled pattern to see whether it
1764 can match the empty string or not. It is called from could_be_empty()
1765 below and from compile_branch() when checking for an unlimited repeat of a
1766 group that can match nothing. Note that first_significant_code() skips over
1767 backward and negative forward assertions when its final argument is TRUE. If we
1768 hit an unclosed bracket, we return "empty" - this means we've struck an inner
1769 bracket whose current branch will already have been scanned.
1770
1771 Arguments:
1772 code points to start of search
1773 endcode points to where to stop
1774 utf8 TRUE if in UTF8 mode
1775
1776 Returns: TRUE if what is matched could be empty
1777 */
1778
1779 static BOOL
1780 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1781 {
1782 register int c;
1783 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1784 code < endcode;
1785 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1786 {
1787 const uschar *ccode;
1788
1789 c = *code;
1790
1791 /* Skip over forward assertions; the other assertions are skipped by
1792 first_significant_code() with a TRUE final argument. */
1793
1794 if (c == OP_ASSERT)
1795 {
1796 do code += GET(code, 1); while (*code == OP_ALT);
1797 c = *code;
1798 continue;
1799 }
1800
1801 /* Groups with zero repeats can of course be empty; skip them. */
1802
1803 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1804 {
1805 code += _pcre_OP_lengths[c];
1806 do code += GET(code, 1); while (*code == OP_ALT);
1807 c = *code;
1808 continue;
1809 }
1810
1811 /* For other groups, scan the branches. */
1812
1813 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1814 {
1815 BOOL empty_branch;
1816 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1817
1818 /* If a conditional group has only one branch, there is a second, implied,
1819 empty branch, so just skip over the conditional, because it could be empty.
1820 Otherwise, scan the individual branches of the group. */
1821
1822 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1823 code += GET(code, 1);
1824 else
1825 {
1826 empty_branch = FALSE;
1827 do
1828 {
1829 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1830 empty_branch = TRUE;
1831 code += GET(code, 1);
1832 }
1833 while (*code == OP_ALT);
1834 if (!empty_branch) return FALSE; /* All branches are non-empty */
1835 }
1836
1837 c = *code;
1838 continue;
1839 }
1840
1841 /* Handle the other opcodes */
1842
1843 switch (c)
1844 {
1845 /* Check for quantifiers after a class. XCLASS is used for classes that
1846 cannot be represented just by a bit map. This includes negated single
1847 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1848 actual length is stored in the compiled code, so we must update "code"
1849 here. */
1850
1851 #ifdef SUPPORT_UTF8
1852 case OP_XCLASS:
1853 ccode = code += GET(code, 1);
1854 goto CHECK_CLASS_REPEAT;
1855 #endif
1856
1857 case OP_CLASS:
1858 case OP_NCLASS:
1859 ccode = code + 33;
1860
1861 #ifdef SUPPORT_UTF8
1862 CHECK_CLASS_REPEAT:
1863 #endif
1864
1865 switch (*ccode)
1866 {
1867 case OP_CRSTAR: /* These could be empty; continue */
1868 case OP_CRMINSTAR:
1869 case OP_CRQUERY:
1870 case OP_CRMINQUERY:
1871 break;
1872
1873 default: /* Non-repeat => class must match */
1874 case OP_CRPLUS: /* These repeats aren't empty */
1875 case OP_CRMINPLUS:
1876 return FALSE;
1877
1878 case OP_CRRANGE:
1879 case OP_CRMINRANGE:
1880 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1881 break;
1882 }
1883 break;
1884
1885 /* Opcodes that must match a character */
1886
1887 case OP_PROP:
1888 case OP_NOTPROP:
1889 case OP_EXTUNI:
1890 case OP_NOT_DIGIT:
1891 case OP_DIGIT:
1892 case OP_NOT_WHITESPACE:
1893 case OP_WHITESPACE:
1894 case OP_NOT_WORDCHAR:
1895 case OP_WORDCHAR:
1896 case OP_ANY:
1897 case OP_ALLANY:
1898 case OP_ANYBYTE:
1899 case OP_CHAR:
1900 case OP_CHARNC:
1901 case OP_NOT:
1902 case OP_PLUS:
1903 case OP_MINPLUS:
1904 case OP_POSPLUS:
1905 case OP_EXACT:
1906 case OP_NOTPLUS:
1907 case OP_NOTMINPLUS:
1908 case OP_NOTPOSPLUS:
1909 case OP_NOTEXACT:
1910 case OP_TYPEPLUS:
1911 case OP_TYPEMINPLUS:
1912 case OP_TYPEPOSPLUS:
1913 case OP_TYPEEXACT:
1914 return FALSE;
1915
1916 /* These are going to continue, as they may be empty, but we have to
1917 fudge the length for the \p and \P cases. */
1918
1919 case OP_TYPESTAR:
1920 case OP_TYPEMINSTAR:
1921 case OP_TYPEPOSSTAR:
1922 case OP_TYPEQUERY:
1923 case OP_TYPEMINQUERY:
1924 case OP_TYPEPOSQUERY:
1925 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1926 break;
1927
1928 /* Same for these */
1929
1930 case OP_TYPEUPTO:
1931 case OP_TYPEMINUPTO:
1932 case OP_TYPEPOSUPTO:
1933 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1934 break;
1935
1936 /* End of branch */
1937
1938 case OP_KET:
1939 case OP_KETRMAX:
1940 case OP_KETRMIN:
1941 case OP_ALT:
1942 return TRUE;
1943
1944 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1945 MINUPTO, and POSUPTO may be followed by a multibyte character */
1946
1947 #ifdef SUPPORT_UTF8
1948 case OP_STAR:
1949 case OP_MINSTAR:
1950 case OP_POSSTAR:
1951 case OP_QUERY:
1952 case OP_MINQUERY:
1953 case OP_POSQUERY:
1954 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
1955 break;
1956
1957 case OP_UPTO:
1958 case OP_MINUPTO:
1959 case OP_POSUPTO:
1960 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
1961 break;
1962 #endif
1963 }
1964 }
1965
1966 return TRUE;
1967 }
1968
1969
1970
1971 /*************************************************
1972 * Scan compiled regex for non-emptiness *
1973 *************************************************/
1974
1975 /* This function is called to check for left recursive calls. We want to check
1976 the current branch of the current pattern to see if it could match the empty
1977 string. If it could, we must look outwards for branches at other levels,
1978 stopping when we pass beyond the bracket which is the subject of the recursion.
1979
1980 Arguments:
1981 code points to start of the recursion
1982 endcode points to where to stop (current RECURSE item)
1983 bcptr points to the chain of current (unclosed) branch starts
1984 utf8 TRUE if in UTF-8 mode
1985
1986 Returns: TRUE if what is matched could be empty
1987 */
1988
1989 static BOOL
1990 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1991 BOOL utf8)
1992 {
1993 while (bcptr != NULL && bcptr->current >= code)
1994 {
1995 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1996 bcptr = bcptr->outer;
1997 }
1998 return TRUE;
1999 }
2000
2001
2002
2003 /*************************************************
2004 * Check for POSIX class syntax *
2005 *************************************************/
2006
2007 /* This function is called when the sequence "[:" or "[." or "[=" is
2008 encountered in a character class. It checks whether this is followed by a
2009 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2010 reach an unescaped ']' without the special preceding character, return FALSE.
2011
2012 Originally, this function only recognized a sequence of letters between the
2013 terminators, but it seems that Perl recognizes any sequence of characters,
2014 though of course unknown POSIX names are subsequently rejected. Perl gives an
2015 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2016 didn't consider this to be a POSIX class. Likewise for [:1234:].
2017
2018 The problem in trying to be exactly like Perl is in the handling of escapes. We
2019 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2020 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2021 below handles the special case of \], but does not try to do any other escape
2022 processing. This makes it different from Perl for cases such as [:l\ower:]
2023 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2024 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2025 I think.
2026
2027 Arguments:
2028 ptr pointer to the initial [
2029 endptr where to return the end pointer
2030
2031 Returns: TRUE or FALSE
2032 */
2033
2034 static BOOL
2035 check_posix_syntax(const uschar *ptr, const uschar **endptr)
2036 {
2037 int terminator; /* Don't combine these lines; the Solaris cc */
2038 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2039 for (++ptr; *ptr != 0; ptr++)
2040 {
2041 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2042 {
2043 if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2044 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2045 {
2046 *endptr = ptr;
2047 return TRUE;
2048 }
2049 }
2050 }
2051 return FALSE;
2052 }
2053
2054
2055
2056
2057 /*************************************************
2058 * Check POSIX class name *
2059 *************************************************/
2060
2061 /* This function is called to check the name given in a POSIX-style class entry
2062 such as [:alnum:].
2063
2064 Arguments:
2065 ptr points to the first letter
2066 len the length of the name
2067
2068 Returns: a value representing the name, or -1 if unknown
2069 */
2070
2071 static int
2072 check_posix_name(const uschar *ptr, int len)
2073 {
2074 const char *pn = posix_names;
2075 register int yield = 0;
2076 while (posix_name_lengths[yield] != 0)
2077 {
2078 if (len == posix_name_lengths[yield] &&
2079 strncmp((const char *)ptr, pn, len) == 0) return yield;
2080 pn += posix_name_lengths[yield] + 1;
2081 yield++;
2082 }
2083 return -1;
2084 }
2085
2086
2087 /*************************************************
2088 * Adjust OP_RECURSE items in repeated group *
2089 *************************************************/
2090
2091 /* OP_RECURSE items contain an offset from the start of the regex to the group
2092 that is referenced. This means that groups can be replicated for fixed
2093 repetition simply by copying (because the recursion is allowed to refer to
2094 earlier groups that are outside the current group). However, when a group is
2095 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2096 inserted before it, after it has been compiled. This means that any OP_RECURSE
2097 items within it that refer to the group itself or any contained groups have to
2098 have their offsets adjusted. That is one of the jobs of this function. Before it
2099 is called, the partially compiled regex must be temporarily terminated with
2100 OP_END.
2101
2102 This function has been extended with the possibility of forward references for
2103 recursions and subroutine calls. It must also check the list of such references
2104 for the group we are dealing with. If it finds that one of the recursions in
2105 the current group is on this list, it adjusts the offset in the list, not the
2106 value in the reference (which is a group number).
2107
2108 Arguments:
2109 group points to the start of the group
2110 adjust the amount by which the group is to be moved
2111 utf8 TRUE in UTF-8 mode
2112 cd contains pointers to tables etc.
2113 save_hwm the hwm forward reference pointer at the start of the group
2114
2115 Returns: nothing
2116 */
2117
2118 static void
2119 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2120 uschar *save_hwm)
2121 {
2122 uschar *ptr = group;
2123
2124 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2125 {
2126 int offset;
2127 uschar *hc;
2128
2129 /* See if this recursion is on the forward reference list. If so, adjust the
2130 reference. */
2131
2132 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2133 {
2134 offset = GET(hc, 0);
2135 if (cd->start_code + offset == ptr + 1)
2136 {
2137 PUT(hc, 0, offset + adjust);
2138 break;
2139 }
2140 }
2141
2142 /* Otherwise, adjust the recursion offset if it's after the start of this
2143 group. */
2144
2145 if (hc >= cd->hwm)
2146 {
2147 offset = GET(ptr, 1);
2148 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2149 }
2150
2151 ptr += 1 + LINK_SIZE;
2152 }
2153 }
2154
2155
2156
2157 /*************************************************
2158 * Insert an automatic callout point *
2159 *************************************************/
2160
2161 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2162 callout points before each pattern item.
2163
2164 Arguments:
2165 code current code pointer
2166 ptr current pattern pointer
2167 cd pointers to tables etc
2168
2169 Returns: new code pointer
2170 */
2171
2172 static uschar *
2173 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2174 {
2175 *code++ = OP_CALLOUT;
2176 *code++ = 255;
2177 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
2178 PUT(code, LINK_SIZE, 0); /* Default length */
2179 return code + 2*LINK_SIZE;
2180 }
2181
2182
2183
2184 /*************************************************
2185 * Complete a callout item *
2186 *************************************************/
2187
2188 /* A callout item contains the length of the next item in the pattern, which
2189 we can't fill in till after we have reached the relevant point. This is used
2190 for both automatic and manual callouts.
2191
2192 Arguments:
2193 previous_callout points to previous callout item
2194 ptr current pattern pointer
2195 cd pointers to tables etc
2196
2197 Returns: nothing
2198 */
2199
2200 static void
2201 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2202 {
2203 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2204 PUT(previous_callout, 2 + LINK_SIZE, length);
2205 }
2206
2207
2208
2209 #ifdef SUPPORT_UCP
2210 /*************************************************
2211 * Get othercase range *
2212 *************************************************/
2213
2214 /* This function is passed the start and end of a class range, in UTF-8 mode
2215 with UCP support. It searches up the characters, looking for internal ranges of
2216 characters in the "other" case. Each call returns the next one, updating the
2217 start address.
2218
2219 Arguments:
2220 cptr points to starting character value; updated
2221 d end value
2222 ocptr where to put start of othercase range
2223 odptr where to put end of othercase range
2224
2225 Yield: TRUE when range returned; FALSE when no more
2226 */
2227
2228 static BOOL
2229 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2230 unsigned int *odptr)
2231 {
2232 unsigned int c, othercase, next;
2233
2234 for (c = *cptr; c <= d; c++)
2235 { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2236
2237 if (c > d) return FALSE;
2238
2239 *ocptr = othercase;
2240 next = othercase + 1;
2241
2242 for (++c; c <= d; c++)
2243 {
2244 if (UCD_OTHERCASE(c) != next) break;
2245 next++;
2246 }
2247
2248 *odptr = next - 1;
2249 *cptr = c;
2250
2251 return TRUE;
2252 }
2253 #endif /* SUPPORT_UCP */
2254
2255
2256
2257 /*************************************************
2258 * Check if auto-possessifying is possible *
2259 *************************************************/
2260
2261 /* This function is called for unlimited repeats of certain items, to see
2262 whether the next thing could possibly match the repeated item. If not, it makes
2263 sense to automatically possessify the repeated item.
2264
2265 Arguments:
2266 op_code the repeated op code
2267 this data for this item, depends on the opcode
2268 utf8 TRUE in UTF-8 mode
2269 utf8_char used for utf8 character bytes, NULL if not relevant
2270 ptr next character in pattern
2271 options options bits
2272 cd contains pointers to tables etc.
2273
2274 Returns: TRUE if possessifying is wanted
2275 */
2276
2277 static BOOL
2278 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2279 const uschar *ptr, int options, compile_data *cd)
2280 {
2281 int next;
2282
2283 /* Skip whitespace and comments in extended mode */
2284
2285 if ((options & PCRE_EXTENDED) != 0)
2286 {
2287 for (;;)
2288 {
2289 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2290 if (*ptr == CHAR_NUMBER_SIGN)
2291 {
2292 while (*(++ptr) != 0)
2293 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2294 }
2295 else break;
2296 }
2297 }
2298
2299 /* If the next item is one that we can handle, get its value. A non-negative
2300 value is a character, a negative value is an escape value. */
2301
2302 if (*ptr == CHAR_BACKSLASH)
2303 {
2304 int temperrorcode = 0;
2305 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2306 if (temperrorcode != 0) return FALSE;
2307 ptr++; /* Point after the escape sequence */
2308 }
2309
2310 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2311 {
2312 #ifdef SUPPORT_UTF8
2313 if (utf8) { GETCHARINC(next, ptr); } else
2314 #endif
2315 next = *ptr++;
2316 }
2317
2318 else return FALSE;
2319
2320 /* Skip whitespace and comments in extended mode */
2321
2322 if ((options & PCRE_EXTENDED) != 0)
2323 {
2324 for (;;)
2325 {
2326 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2327 if (*ptr == CHAR_NUMBER_SIGN)
2328 {
2329 while (*(++ptr) != 0)
2330 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2331 }
2332 else break;
2333 }
2334 }
2335
2336 /* If the next thing is itself optional, we have to give up. */
2337
2338 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2339 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2340 return FALSE;
2341
2342 /* Now compare the next item with the previous opcode. If the previous is a
2343 positive single character match, "item" either contains the character or, if
2344 "item" is greater than 127 in utf8 mode, the character's bytes are in
2345 utf8_char. */
2346
2347
2348 /* Handle cases when the next item is a character. */
2349
2350 if (next >= 0) switch(op_code)
2351 {
2352 case OP_CHAR:
2353 #ifdef SUPPORT_UTF8
2354 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2355 #else
2356 (void)(utf8_char); /* Keep compiler happy by referencing function argument */
2357 #endif
2358 return item != next;
2359
2360 /* For CHARNC (caseless character) we must check the other case. If we have
2361 Unicode property support, we can use it to test the other case of
2362 high-valued characters. */
2363
2364 case OP_CHARNC:
2365 #ifdef SUPPORT_UTF8
2366 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2367 #endif
2368 if (item == next) return FALSE;
2369 #ifdef SUPPORT_UTF8
2370 if (utf8)
2371 {
2372 unsigned int othercase;
2373 if (next < 128) othercase = cd->fcc[next]; else
2374 #ifdef SUPPORT_UCP
2375 othercase = UCD_OTHERCASE((unsigned int)next);
2376 #else
2377 othercase = NOTACHAR;
2378 #endif
2379 return (unsigned int)item != othercase;
2380 }
2381 else
2382 #endif /* SUPPORT_UTF8 */
2383 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2384
2385 /* For OP_NOT, "item" must be a single-byte character. */
2386
2387 case OP_NOT:
2388 if (item == next) return TRUE;
2389 if ((options & PCRE_CASELESS) == 0) return FALSE;
2390 #ifdef SUPPORT_UTF8
2391 if (utf8)
2392 {
2393 unsigned int othercase;
2394 if (next < 128) othercase = cd->fcc[next]; else
2395 #ifdef SUPPORT_UCP
2396 othercase = UCD_OTHERCASE(next);
2397 #else
2398 othercase = NOTACHAR;
2399 #endif
2400 return (unsigned int)item == othercase;
2401 }
2402 else
2403 #endif /* SUPPORT_UTF8 */
2404 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2405
2406 case OP_DIGIT:
2407 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2408
2409 case OP_NOT_DIGIT:
2410 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2411
2412 case OP_WHITESPACE:
2413 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2414
2415 case OP_NOT_WHITESPACE:
2416 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2417
2418 case OP_WORDCHAR:
2419 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2420
2421 case OP_NOT_WORDCHAR:
2422 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2423
2424 case OP_HSPACE:
2425 case OP_NOT_HSPACE:
2426 switch(next)
2427 {
2428 case 0x09:
2429 case 0x20:
2430 case 0xa0:
2431 case 0x1680:
2432 case 0x180e:
2433 case 0x2000:
2434 case 0x2001:
2435 case 0x2002:
2436 case 0x2003:
2437 case 0x2004:
2438 case 0x2005:
2439 case 0x2006:
2440 case 0x2007:
2441 case 0x2008:
2442 case 0x2009:
2443 case 0x200A:
2444 case 0x202f:
2445 case 0x205f:
2446 case 0x3000:
2447 return op_code != OP_HSPACE;
2448 default:
2449 return op_code == OP_HSPACE;
2450 }
2451
2452 case OP_VSPACE:
2453 case OP_NOT_VSPACE:
2454 switch(next)
2455 {
2456 case 0x0a:
2457 case 0x0b:
2458 case 0x0c:
2459 case 0x0d:
2460 case 0x85:
2461 case 0x2028:
2462 case 0x2029:
2463 return op_code != OP_VSPACE;
2464 default:
2465 return op_code == OP_VSPACE;
2466 }
2467
2468 default:
2469 return FALSE;
2470 }
2471
2472
2473 /* Handle the case when the next item is \d, \s, etc. */
2474
2475 switch(op_code)
2476 {
2477 case OP_CHAR:
2478 case OP_CHARNC:
2479 #ifdef SUPPORT_UTF8
2480 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2481 #endif
2482 switch(-next)
2483 {
2484 case ESC_d:
2485 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2486
2487 case ESC_D:
2488 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2489
2490 case ESC_s:
2491 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2492
2493 case ESC_S:
2494 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2495
2496 case ESC_w:
2497 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2498
2499 case ESC_W:
2500 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2501
2502 case ESC_h:
2503 case ESC_H:
2504 switch(item)
2505 {
2506 case 0x09:
2507 case 0x20:
2508 case 0xa0:
2509 case 0x1680:
2510 case 0x180e:
2511 case 0x2000:
2512 case 0x2001:
2513 case 0x2002:
2514 case 0x2003:
2515 case 0x2004:
2516 case 0x2005:
2517 case 0x2006:
2518 case 0x2007:
2519 case 0x2008:
2520 case 0x2009:
2521 case 0x200A:
2522 case 0x202f:
2523 case 0x205f:
2524 case 0x3000:
2525 return -next != ESC_h;
2526 default:
2527 return -next == ESC_h;
2528 }
2529
2530 case ESC_v:
2531 case ESC_V:
2532 switch(item)
2533 {
2534 case 0x0a:
2535 case 0x0b:
2536 case 0x0c:
2537 case 0x0d:
2538 case 0x85:
2539 case 0x2028:
2540 case 0x2029:
2541 return -next != ESC_v;
2542 default:
2543 return -next == ESC_v;
2544 }
2545
2546 default:
2547 return FALSE;
2548 }
2549
2550 case OP_DIGIT:
2551 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2552 next == -ESC_h || next == -ESC_v;
2553
2554 case OP_NOT_DIGIT:
2555 return next == -ESC_d;
2556
2557 case OP_WHITESPACE:
2558 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2559
2560 case OP_NOT_WHITESPACE:
2561 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2562
2563 case OP_HSPACE:
2564 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2565
2566 case OP_NOT_HSPACE:
2567 return next == -ESC_h;
2568
2569 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2570 case OP_VSPACE:
2571 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2572
2573 case OP_NOT_VSPACE:
2574 return next == -ESC_v;
2575
2576 case OP_WORDCHAR:
2577 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2578
2579 case OP_NOT_WORDCHAR:
2580 return next == -ESC_w || next == -ESC_d;
2581
2582 default:
2583 return FALSE;
2584 }
2585
2586 /* Control does not reach here */
2587 }
2588
2589
2590
2591 /*************************************************
2592 * Compile one branch *
2593 *************************************************/
2594
2595 /* Scan the pattern, compiling it into a vector. If the options are
2596 changed during the branch, the pointer is used to change the external options
2597 bits. This function is used during the pre-compile phase when we are trying
2598 to find out the amount of memory needed, as well as during the real compile
2599 phase. The value of lengthptr distinguishes the two phases.
2600
2601 Arguments:
2602 optionsptr pointer to the option bits
2603 codeptr points to the pointer to the current code point
2604 ptrptr points to the current pattern pointer
2605 errorcodeptr points to error code variable
2606 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2607 reqbyteptr set to the last literal character required, else < 0
2608 bcptr points to current branch chain
2609 cd contains pointers to tables etc.
2610 lengthptr NULL during the real compile phase
2611 points to length accumulator during pre-compile phase
2612
2613 Returns: TRUE on success
2614 FALSE, with *errorcodeptr set non-zero on error
2615 */
2616
2617 static BOOL
2618 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2619 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2620 compile_data *cd, int *lengthptr)
2621 {
2622 int repeat_type, op_type;
2623 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2624 int bravalue = 0;
2625 int greedy_default, greedy_non_default;
2626 int firstbyte, reqbyte;
2627 int zeroreqbyte, zerofirstbyte;
2628 int req_caseopt, reqvary, tempreqvary;
2629 int options = *optionsptr;
2630 int after_manual_callout = 0;
2631 int length_prevgroup = 0;
2632 register int c;
2633 register uschar *code = *codeptr;
2634 uschar *last_code = code;
2635 uschar *orig_code = code;
2636 uschar *tempcode;
2637 BOOL inescq = FALSE;
2638 BOOL groupsetfirstbyte = FALSE;
2639 const uschar *ptr = *ptrptr;
2640 const uschar *tempptr;
2641 uschar *previous = NULL;
2642 uschar *previous_callout = NULL;
2643 uschar *save_hwm = NULL;
2644 uschar classbits[32];
2645
2646 #ifdef SUPPORT_UTF8
2647 BOOL class_utf8;
2648 BOOL utf8 = (options & PCRE_UTF8) != 0;
2649 uschar *class_utf8data;
2650 uschar *class_utf8data_base;
2651 uschar utf8_char[6];
2652 #else
2653 BOOL utf8 = FALSE;
2654 uschar *utf8_char = NULL;
2655 #endif
2656
2657 #ifdef DEBUG
2658 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2659 #endif
2660
2661 /* Set up the default and non-default settings for greediness */
2662
2663 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2664 greedy_non_default = greedy_default ^ 1;
2665
2666 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2667 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2668 matches a non-fixed char first char; reqbyte just remains unset if we never
2669 find one.
2670
2671 When we hit a repeat whose minimum is zero, we may have to adjust these values
2672 to take the zero repeat into account. This is implemented by setting them to
2673 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2674 item types that can be repeated set these backoff variables appropriately. */
2675
2676 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2677
2678 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2679 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2680 value > 255. It is added into the firstbyte or reqbyte variables to record the
2681 case status of the value. This is used only for ASCII characters. */
2682
2683 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2684
2685 /* Switch on next character until the end of the branch */
2686
2687 for (;; ptr++)
2688 {
2689 BOOL negate_class;
2690 BOOL should_flip_negation;
2691 BOOL possessive_quantifier;
2692 BOOL is_quantifier;
2693 BOOL is_recurse;
2694 BOOL reset_bracount;
2695 int class_charcount;
2696 int class_lastchar;
2697 int newoptions;
2698 int recno;
2699 int refsign;
2700 int skipbytes;
2701 int subreqbyte;
2702 int subfirstbyte;
2703 int terminator;
2704 int mclength;
2705 uschar mcbuffer[8];
2706
2707 /* Get next byte in the pattern */
2708
2709 c = *ptr;
2710
2711 /* If we are in the pre-compile phase, accumulate the length used for the
2712 previous cycle of this loop. */
2713
2714 if (lengthptr != NULL)
2715 {
2716 #ifdef DEBUG
2717 if (code > cd->hwm) cd->hwm = code; /* High water info */
2718 #endif
2719 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2720 {
2721 *errorcodeptr = ERR52;
2722 goto FAILED;
2723 }
2724
2725 /* There is at least one situation where code goes backwards: this is the
2726 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2727 the class is simply eliminated. However, it is created first, so we have to
2728 allow memory for it. Therefore, don't ever reduce the length at this point.
2729 */
2730
2731 if (code < last_code) code = last_code;
2732
2733 /* Paranoid check for integer overflow */
2734
2735 if (OFLOW_MAX - *lengthptr < code - last_code)
2736 {
2737 *errorcodeptr = ERR20;
2738 goto FAILED;
2739 }
2740
2741 *lengthptr += code - last_code;
2742 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2743
2744 /* If "previous" is set and it is not at the start of the work space, move
2745 it back to there, in order to avoid filling up the work space. Otherwise,
2746 if "previous" is NULL, reset the current code pointer to the start. */
2747
2748 if (previous != NULL)
2749 {
2750 if (previous > orig_code)
2751 {
2752 memmove(orig_code, previous, code - previous);
2753 code -= previous - orig_code;
2754 previous = orig_code;
2755 }
2756 }
2757 else code = orig_code;
2758
2759 /* Remember where this code item starts so we can pick up the length
2760 next time round. */
2761
2762 last_code = code;
2763 }
2764
2765 /* In the real compile phase, just check the workspace used by the forward
2766 reference list. */
2767
2768 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2769 {
2770 *errorcodeptr = ERR52;
2771 goto FAILED;
2772 }
2773
2774 /* If in \Q...\E, check for the end; if not, we have a literal */
2775
2776 if (inescq && c != 0)
2777 {
2778 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
2779 {
2780 inescq = FALSE;
2781 ptr++;
2782 continue;
2783 }
2784 else
2785 {
2786 if (previous_callout != NULL)
2787 {
2788 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2789 complete_callout(previous_callout, ptr, cd);
2790 previous_callout = NULL;
2791 }
2792 if ((options & PCRE_AUTO_CALLOUT) != 0)
2793 {
2794 previous_callout = code;
2795 code = auto_callout(code, ptr, cd);
2796 }
2797 goto NORMAL_CHAR;
2798 }
2799 }
2800
2801 /* Fill in length of a previous callout, except when the next thing is
2802 a quantifier. */
2803
2804 is_quantifier =
2805 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2806 (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2807
2808 if (!is_quantifier && previous_callout != NULL &&
2809 after_manual_callout-- <= 0)
2810 {
2811 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2812 complete_callout(previous_callout, ptr, cd);
2813 previous_callout = NULL;
2814 }
2815
2816 /* In extended mode, skip white space and comments */
2817
2818 if ((options & PCRE_EXTENDED) != 0)
2819 {
2820 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2821 if (c == CHAR_NUMBER_SIGN)
2822 {
2823 while (*(++ptr) != 0)
2824 {
2825 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2826 }
2827 if (*ptr != 0) continue;
2828
2829 /* Else fall through to handle end of string */
2830 c = 0;
2831 }
2832 }
2833
2834 /* No auto callout for quantifiers. */
2835
2836 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2837 {
2838 previous_callout = code;
2839 code = auto_callout(code, ptr, cd);
2840 }
2841
2842 switch(c)
2843 {
2844 /* ===================================================================*/
2845 case 0: /* The branch terminates at string end */
2846 case CHAR_VERTICAL_LINE: /* or | or ) */
2847 case CHAR_RIGHT_PARENTHESIS:
2848 *firstbyteptr = firstbyte;
2849 *reqbyteptr = reqbyte;
2850 *codeptr = code;
2851 *ptrptr = ptr;
2852 if (lengthptr != NULL)
2853 {
2854 if (OFLOW_MAX - *lengthptr < code - last_code)
2855 {
2856 *errorcodeptr = ERR20;
2857 goto FAILED;
2858 }
2859 *lengthptr += code - last_code; /* To include callout length */
2860 DPRINTF((">> end branch\n"));
2861 }
2862 return TRUE;
2863
2864
2865 /* ===================================================================*/
2866 /* Handle single-character metacharacters. In multiline mode, ^ disables
2867 the setting of any following char as a first character. */
2868
2869 case CHAR_CIRCUMFLEX_ACCENT:
2870 if ((options & PCRE_MULTILINE) != 0)
2871 {
2872 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2873 }
2874 previous = NULL;
2875 *code++ = OP_CIRC;
2876 break;
2877
2878 case CHAR_DOLLAR_SIGN:
2879 previous = NULL;
2880 *code++ = OP_DOLL;
2881 break;
2882
2883 /* There can never be a first char if '.' is first, whatever happens about
2884 repeats. The value of reqbyte doesn't change either. */
2885
2886 case CHAR_DOT:
2887 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2888 zerofirstbyte = firstbyte;
2889 zeroreqbyte = reqbyte;
2890 previous = code;
2891 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2892 break;
2893
2894
2895 /* ===================================================================*/
2896 /* Character classes. If the included characters are all < 256, we build a
2897 32-byte bitmap of the permitted characters, except in the special case
2898 where there is only one such character. For negated classes, we build the
2899 map as usual, then invert it at the end. However, we use a different opcode
2900 so that data characters > 255 can be handled correctly.
2901
2902 If the class contains characters outside the 0-255 range, a different
2903 opcode is compiled. It may optionally have a bit map for characters < 256,
2904 but those above are explicitly listed afterwards. A flag byte tells
2905 whether the bitmap is present, and whether this is a negated class or not.
2906
2907 In JavaScript compatibility mode, an isolated ']' causes an error. In
2908 default (Perl) mode, it is treated as a data character. */
2909
2910 case CHAR_RIGHT_SQUARE_BRACKET:
2911 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2912 {
2913 *errorcodeptr = ERR64;
2914 goto FAILED;
2915 }
2916 goto NORMAL_CHAR;
2917
2918 case CHAR_LEFT_SQUARE_BRACKET:
2919 previous = code;
2920
2921 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2922 they are encountered at the top level, so we'll do that too. */
2923
2924 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2925 ptr[1] == CHAR_EQUALS_SIGN) &&
2926 check_posix_syntax(ptr, &tempptr))
2927 {
2928 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
2929 goto FAILED;
2930 }
2931
2932 /* If the first character is '^', set the negation flag and skip it. Also,
2933 if the first few characters (either before or after ^) are \Q\E or \E we
2934 skip them too. This makes for compatibility with Perl. */
2935
2936 negate_class = FALSE;
2937 for (;;)
2938 {
2939 c = *(++ptr);
2940 if (c == CHAR_BACKSLASH)
2941 {
2942 if (ptr[1] == CHAR_E)
2943 ptr++;
2944 else if (strncmp((const char *)ptr+1,
2945 STR_Q STR_BACKSLASH STR_E, 3) == 0)
2946 ptr += 3;
2947 else
2948 break;
2949 }
2950 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
2951 negate_class = TRUE;
2952 else break;
2953 }
2954
2955 /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2956 an initial ']' is taken as a data character -- the code below handles
2957 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2958 [^] must match any character, so generate OP_ALLANY. */
2959
2960 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
2961 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2962 {
2963 *code++ = negate_class? OP_ALLANY : OP_FAIL;
2964 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2965 zerofirstbyte = firstbyte;
2966 break;
2967 }
2968
2969 /* If a class contains a negative special such as \S, we need to flip the
2970 negation flag at the end, so that support for characters > 255 works
2971 correctly (they are all included in the class). */
2972
2973 should_flip_negation = FALSE;
2974
2975 /* Keep a count of chars with values < 256 so that we can optimize the case
2976 of just a single character (as long as it's < 256). However, for higher
2977 valued UTF-8 characters, we don't yet do any optimization. */
2978
2979 class_charcount = 0;
2980 class_lastchar = -1;
2981
2982 /* Initialize the 32-char bit map to all zeros. We build the map in a
2983 temporary bit of memory, in case the class contains only 1 character (less
2984 than 256), because in that case the compiled code doesn't use the bit map.
2985 */
2986
2987 memset(classbits, 0, 32 * sizeof(uschar));
2988
2989 #ifdef SUPPORT_UTF8
2990 class_utf8 = FALSE; /* No chars >= 256 */
2991 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2992 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
2993 #endif
2994
2995 /* Process characters until ] is reached. By writing this as a "do" it
2996 means that an initial ] is taken as a data character. At the start of the
2997 loop, c contains the first byte of the character. */
2998
2999 if (c != 0) do
3000 {
3001 const uschar *oldptr;
3002
3003 #ifdef SUPPORT_UTF8
3004 if (utf8 && c > 127)
3005 { /* Braces are required because the */
3006 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3007 }
3008
3009 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3010 data and reset the pointer. This is so that very large classes that
3011 contain a zillion UTF-8 characters no longer overwrite the work space
3012 (which is on the stack). */
3013
3014 if (lengthptr != NULL)
3015 {
3016 *lengthptr += class_utf8data - class_utf8data_base;
3017 class_utf8data = class_utf8data_base;
3018 }
3019
3020 #endif
3021
3022 /* Inside \Q...\E everything is literal except \E */
3023
3024 if (inescq)
3025 {
3026 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3027 {
3028 inescq = FALSE; /* Reset literal state */
3029 ptr++; /* Skip the 'E' */
3030 continue; /* Carry on with next */
3031 }
3032 goto CHECK_RANGE; /* Could be range if \E follows */
3033 }
3034
3035 /* Handle POSIX class names. Perl allows a negation extension of the
3036 form [:^name:]. A square bracket that doesn't match the syntax is
3037 treated as a literal. We also recognize the POSIX constructions
3038 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3039 5.6 and 5.8 do. */
3040
3041 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3042 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3043 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3044 {
3045 BOOL local_negate = FALSE;
3046 int posix_class, taboffset, tabopt;
3047 register const uschar *cbits = cd->cbits;
3048 uschar pbits[32];
3049
3050 if (ptr[1] != CHAR_COLON)
3051 {
3052 *errorcodeptr = ERR31;
3053 goto FAILED;
3054 }
3055
3056 ptr += 2;
3057 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3058 {
3059 local_negate = TRUE;
3060 should_flip_negation = TRUE; /* Note negative special */
3061 ptr++;
3062 }
3063
3064 posix_class = check_posix_name(ptr, tempptr - ptr);
3065 if (posix_class < 0)
3066 {
3067 *errorcodeptr = ERR30;
3068 goto FAILED;
3069 }
3070
3071 /* If matching is caseless, upper and lower are converted to
3072 alpha. This relies on the fact that the class table starts with
3073 alpha, lower, upper as the first 3 entries. */
3074
3075 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3076 posix_class = 0;
3077
3078 /* We build the bit map for the POSIX class in a chunk of local store
3079 because we may be adding and subtracting from it, and we don't want to
3080 subtract bits that may be in the main map already. At the end we or the
3081 result into the bit map that is being built. */
3082
3083 posix_class *= 3;
3084
3085 /* Copy in the first table (always present) */
3086
3087 memcpy(pbits, cbits + posix_class_maps[posix_class],
3088 32 * sizeof(uschar));
3089
3090 /* If there is a second table, add or remove it as required. */
3091
3092 taboffset = posix_class_maps[posix_class + 1];
3093 tabopt = posix_class_maps[posix_class + 2];
3094
3095 if (taboffset >= 0)
3096 {
3097 if (tabopt >= 0)
3098 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3099 else
3100 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3101 }
3102
3103 /* Now see if we need to remove any special characters. An option
3104 value of 1 removes vertical space and 2 removes underscore. */
3105
3106 if (tabopt < 0) tabopt = -tabopt;
3107 if (tabopt == 1) pbits[1] &= ~0x3c;
3108 else if (tabopt == 2) pbits[11] &= 0x7f;
3109
3110 /* Add the POSIX table or its complement into the main table that is
3111 being built and we are done. */
3112
3113 if (local_negate)
3114 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3115 else
3116 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3117
3118 ptr = tempptr + 1;
3119 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3120 continue; /* End of POSIX syntax handling */
3121 }
3122
3123 /* Backslash may introduce a single character, or it may introduce one
3124 of the specials, which just set a flag. The sequence \b is a special
3125 case. Inside a class (and only there) it is treated as backspace.
3126 Elsewhere it marks a word boundary. Other escapes have preset maps ready
3127 to 'or' into the one we are building. We assume they have more than one
3128 character in them, so set class_charcount bigger than one. */
3129
3130 if (c == CHAR_BACKSLASH)
3131 {
3132 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3133 if (*errorcodeptr != 0) goto FAILED;
3134
3135 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3136 else if (-c == ESC_X) c = CHAR_X; /* \X is literal X in a class */
3137 else if (-c == ESC_R) c = CHAR_R; /* \R is literal R in a class */
3138 else if (-c == ESC_Q) /* Handle start of quoted string */
3139 {
3140 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3141 {
3142 ptr += 2; /* avoid empty string */
3143 }
3144 else inescq = TRUE;
3145 continue;
3146 }
3147 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3148
3149 if (c < 0)
3150 {
3151 register const uschar *cbits = cd->cbits;
3152 class_charcount += 2; /* Greater than 1 is what matters */
3153
3154 /* Save time by not doing this in the pre-compile phase. */
3155
3156 if (lengthptr == NULL) switch (-c)
3157 {
3158 case ESC_d:
3159 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3160 continue;
3161
3162 case ESC_D:
3163 should_flip_negation = TRUE;
3164 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3165 continue;
3166
3167 case ESC_w:
3168 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3169 continue;
3170
3171 case ESC_W:
3172 should_flip_negation = TRUE;
3173 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3174 continue;
3175
3176 case ESC_s:
3177 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3178 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
3179 continue;
3180
3181 case ESC_S:
3182 should_flip_negation = TRUE;
3183 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3184 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3185 continue;
3186
3187 default: /* Not recognized; fall through */
3188 break; /* Need "default" setting to stop compiler warning. */
3189 }
3190
3191 /* In the pre-compile phase, just do the recognition. */
3192
3193 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
3194 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
3195
3196 /* We need to deal with \H, \h, \V, and \v in both phases because
3197 they use extra memory. */
3198
3199 if (-c == ESC_h)
3200 {
3201 SETBIT(classbits, 0x09); /* VT */
3202 SETBIT(classbits, 0x20); /* SPACE */
3203 SETBIT(classbits, 0xa0); /* NSBP */
3204 #ifdef SUPPORT_UTF8
3205 if (utf8)
3206 {
3207 class_utf8 = TRUE;
3208 *class_utf8data++ = XCL_SINGLE;
3209 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3210 *class_utf8data++ = XCL_SINGLE;
3211 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3212 *class_utf8data++ = XCL_RANGE;
3213 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3214 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3215 *class_utf8data++ = XCL_SINGLE;
3216 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3217 *class_utf8data++ = XCL_SINGLE;
3218 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3219 *class_utf8data++ = XCL_SINGLE;
3220 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3221 }
3222 #endif
3223 continue;
3224 }
3225
3226 if (-c == ESC_H)
3227 {
3228 for (c = 0; c < 32; c++)
3229 {
3230 int x = 0xff;
3231 switch (c)
3232 {
3233 case 0x09/8: x ^= 1 << (0x09%8); break;
3234 case 0x20/8: x ^= 1 << (0x20%8); break;
3235 case 0xa0/8: x ^= 1 << (0xa0%8); break;
3236 default: break;
3237 }
3238 classbits[c] |= x;
3239 }
3240
3241 #ifdef SUPPORT_UTF8
3242 if (utf8)
3243 {
3244 class_utf8 = TRUE;
3245 *class_utf8data++ = XCL_RANGE;
3246 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3247 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3248 *class_utf8data++ = XCL_RANGE;
3249 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3250 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3251 *class_utf8data++ = XCL_RANGE;
3252 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3253 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3254 *class_utf8data++ = XCL_RANGE;
3255 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3256 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3257 *class_utf8data++ = XCL_RANGE;
3258 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3259 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3260 *class_utf8data++ = XCL_RANGE;
3261 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3262 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3263 *class_utf8data++ = XCL_RANGE;
3264 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3265 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3266 }
3267 #endif
3268 continue;
3269 }
3270
3271 if (-c == ESC_v)
3272 {
3273 SETBIT(classbits, 0x0a); /* LF */
3274 SETBIT(classbits, 0x0b); /* VT */
3275 SETBIT(classbits, 0x0c); /* FF */
3276 SETBIT(classbits, 0x0d); /* CR */
3277 SETBIT(classbits, 0x85); /* NEL */
3278 #ifdef SUPPORT_UTF8
3279 if (utf8)
3280 {
3281 class_utf8 = TRUE;
3282 *class_utf8data++ = XCL_RANGE;
3283 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3284 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3285 }
3286 #endif
3287 continue;
3288 }
3289
3290 if (-c == ESC_V)
3291 {
3292 for (c = 0; c < 32; c++)
3293 {
3294 int x = 0xff;
3295 switch (c)
3296 {
3297 case 0x0a/8: x ^= 1 << (0x0a%8);
3298 x ^= 1 << (0x0b%8);
3299 x ^= 1 << (0x0c%8);
3300 x ^= 1 << (0x0d%8);
3301 break;
3302 case 0x85/8: x ^= 1 << (0x85%8); break;
3303 default: break;
3304 }
3305 classbits[c] |= x;
3306 }
3307
3308 #ifdef SUPPORT_UTF8
3309 if (utf8)
3310 {
3311 class_utf8 = TRUE;
3312 *class_utf8data++ = XCL_RANGE;
3313 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3314 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3315 *class_utf8data++ = XCL_RANGE;
3316 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3317 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3318 }
3319 #endif
3320 continue;
3321 }
3322
3323 /* We need to deal with \P and \p in both phases. */
3324
3325 #ifdef SUPPORT_UCP
3326 if (-c == ESC_p || -c == ESC_P)
3327 {
3328 BOOL negated;
3329 int pdata;
3330 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3331 if (ptype < 0) goto FAILED;
3332 class_utf8 = TRUE;
3333 *class_utf8data++ = ((-c == ESC_p) != negated)?
3334 XCL_PROP : XCL_NOTPROP;
3335 *class_utf8data++ = ptype;
3336 *class_utf8data++ = pdata;
3337 class_charcount -= 2; /* Not a < 256 character */
3338 continue;
3339 }
3340 #endif
3341 /* Unrecognized escapes are faulted if PCRE is running in its
3342 strict mode. By default, for compatibility with Perl, they are
3343 treated as literals. */
3344
3345 if ((options & PCRE_EXTRA) != 0)
3346 {
3347 *errorcodeptr = ERR7;
3348 goto FAILED;
3349 }
3350
3351 class_charcount -= 2; /* Undo the default count from above */
3352 c = *ptr; /* Get the final character and fall through */
3353 }
3354
3355 /* Fall through if we have a single character (c >= 0). This may be
3356 greater than 256 in UTF-8 mode. */
3357
3358 } /* End of backslash handling */
3359
3360 /* A single character may be followed by '-' to form a range. However,
3361 Perl does not permit ']' to be the end of the range. A '-' character
3362 at the end is treated as a literal. Perl ignores orphaned \E sequences
3363 entirely. The code for handling \Q and \E is messy. */
3364
3365 CHECK_RANGE:
3366 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3367 {
3368 inescq = FALSE;
3369 ptr += 2;
3370 }
3371
3372 oldptr = ptr;
3373
3374 /* Remember \r or \n */
3375
3376 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3377
3378 /* Check for range */
3379
3380 if (!inescq && ptr[1] == CHAR_MINUS)
3381 {
3382 int d;
3383 ptr += 2;
3384 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3385
3386 /* If we hit \Q (not followed by \E) at this point, go into escaped
3387 mode. */
3388
3389 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3390 {
3391 ptr += 2;
3392 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3393 { ptr += 2; continue; }
3394 inescq = TRUE;
3395 break;
3396 }
3397
3398 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3399 {
3400 ptr = oldptr;
3401 goto LONE_SINGLE_CHARACTER;
3402 }
3403
3404 #ifdef SUPPORT_UTF8
3405 if (utf8)
3406 { /* Braces are required because the */
3407 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3408 }
3409 else
3410 #endif
3411 d = *ptr; /* Not UTF-8 mode */
3412
3413 /* The second part of a range can be a single-character escape, but
3414 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3415 in such circumstances. */
3416
3417 if (!inescq && d == CHAR_BACKSLASH)
3418 {
3419 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3420 if (*errorcodeptr != 0) goto FAILED;
3421
3422 /* \b is backspace; \X is literal X; \R is literal R; any other
3423 special means the '-' was literal */
3424
3425 if (d < 0)
3426 {
3427 if (d == -ESC_b) d = CHAR_BS;
3428 else if (d == -ESC_X) d = CHAR_X;
3429 else if (d == -ESC_R) d = CHAR_R; else
3430 {
3431 ptr = oldptr;
3432 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3433 }
3434 }
3435 }
3436
3437 /* Check that the two values are in the correct order. Optimize
3438 one-character ranges */
3439
3440 if (d < c)
3441 {
3442 *errorcodeptr = ERR8;
3443 goto FAILED;
3444 }
3445
3446 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3447
3448 /* Remember \r or \n */
3449
3450 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3451
3452 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3453 matching, we have to use an XCLASS with extra data items. Caseless
3454 matching for characters > 127 is available only if UCP support is
3455 available. */
3456
3457 #ifdef SUPPORT_UTF8
3458 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3459 {
3460 class_utf8 = TRUE;
3461
3462 /* With UCP support, we can find the other case equivalents of
3463 the relevant characters. There may be several ranges. Optimize how
3464 they fit with the basic range. */
3465
3466 #ifdef SUPPORT_UCP
3467 if ((options & PCRE_CASELESS) != 0)
3468 {
3469 unsigned int occ, ocd;
3470 unsigned int cc = c;
3471 unsigned int origd = d;
3472 while (get_othercase_range(&cc, origd, &occ, &ocd))
3473 {
3474 if (occ >= (unsigned int)c &&
3475 ocd <= (unsigned int)d)
3476 continue; /* Skip embedded ranges */
3477
3478 if (occ < (unsigned int)c &&
3479 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3480 { /* if there is overlap, */
3481 c = occ; /* noting that if occ < c */
3482 continue; /* we can't have ocd > d */
3483 } /* because a subrange is */
3484 if (ocd > (unsigned int)d &&
3485 occ <= (unsigned int)d + 1) /* always shorter than */
3486 { /* the basic range. */
3487 d = ocd;
3488 continue;
3489 }
3490
3491 if (occ == ocd)
3492 {
3493 *class_utf8data++ = XCL_SINGLE;
3494 }
3495 else
3496 {
3497 *class_utf8data++ = XCL_RANGE;
3498 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3499 }
3500 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3501 }
3502 }
3503 #endif /* SUPPORT_UCP */
3504
3505 /* Now record the original range, possibly modified for UCP caseless
3506 overlapping ranges. */
3507
3508 *class_utf8data++ = XCL_RANGE;
3509 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3510 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3511
3512 /* With UCP support, we are done. Without UCP support, there is no
3513 caseless matching for UTF-8 characters > 127; we can use the bit map
3514 for the smaller ones. */
3515
3516 #ifdef SUPPORT_UCP
3517 continue; /* With next character in the class */
3518 #else
3519 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3520
3521 /* Adjust upper limit and fall through to set up the map */
3522
3523 d = 127;
3524
3525 #endif /* SUPPORT_UCP */
3526 }
3527 #endif /* SUPPORT_UTF8 */
3528
3529 /* We use the bit map for all cases when not in UTF-8 mode; else
3530 ranges that lie entirely within 0-127 when there is UCP support; else
3531 for partial ranges without UCP support. */
3532
3533 class_charcount += d - c + 1;
3534 class_lastchar = d;
3535
3536 /* We can save a bit of time by skipping this in the pre-compile. */
3537
3538 if (lengthptr == NULL) for (; c <= d; c++)
3539 {
3540 classbits[c/8] |= (1 << (c&7));
3541 if ((options & PCRE_CASELESS) != 0)
3542 {
3543 int uc = cd->fcc[c]; /* flip case */
3544 classbits[uc/8] |= (1 << (uc&7));
3545 }
3546 }
3547
3548 continue; /* Go get the next char in the class */
3549 }
3550
3551 /* Handle a lone single character - we can get here for a normal
3552 non-escape char, or after \ that introduces a single character or for an
3553 apparent range that isn't. */
3554
3555 LONE_SINGLE_CHARACTER:
3556
3557 /* Handle a character that cannot go in the bit map */
3558
3559 #ifdef SUPPORT_UTF8
3560 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3561 {
3562 class_utf8 = TRUE;
3563 *class_utf8data++ = XCL_SINGLE;
3564 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3565
3566 #ifdef SUPPORT_UCP
3567 if ((options & PCRE_CASELESS) != 0)
3568 {
3569 unsigned int othercase;
3570 if ((othercase = UCD_OTHERCASE(c)) != c)
3571 {
3572 *class_utf8data++ = XCL_SINGLE;
3573 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3574 }
3575 }
3576 #endif /* SUPPORT_UCP */
3577
3578 }
3579 else
3580 #endif /* SUPPORT_UTF8 */
3581
3582 /* Handle a single-byte character */
3583 {
3584 classbits[c/8] |= (1 << (c&7));
3585 if ((options & PCRE_CASELESS) != 0)
3586 {
3587 c = cd->fcc[c]; /* flip case */
3588 classbits[c/8] |= (1 << (c&7));
3589 }
3590 class_charcount++;
3591 class_lastchar = c;
3592 }
3593 }
3594
3595 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3596
3597 while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3598
3599 if (c == 0) /* Missing terminating ']' */
3600 {
3601 *errorcodeptr = ERR6;
3602 goto FAILED;
3603 }
3604
3605
3606 /* This code has been disabled because it would mean that \s counts as
3607 an explicit \r or \n reference, and that's not really what is wanted. Now
3608 we set the flag only if there is a literal "\r" or "\n" in the class. */
3609
3610 #if 0
3611 /* Remember whether \r or \n are in this class */
3612
3613 if (negate_class)
3614 {
3615 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3616 }
3617 else
3618 {
3619 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3620 }
3621 #endif
3622
3623
3624 /* If class_charcount is 1, we saw precisely one character whose value is
3625 less than 256. As long as there were no characters >= 128 and there was no
3626 use of \p or \P, in other words, no use of any XCLASS features, we can
3627 optimize.
3628
3629 In UTF-8 mode, we can optimize the negative case only if there were no
3630 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3631 operate on single-bytes only. This is an historical hangover. Maybe one day
3632 we can tidy these opcodes to handle multi-byte characters.
3633
3634 The optimization throws away the bit map. We turn the item into a
3635 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3636 that OP_NOT does not support multibyte characters. In the positive case, it
3637 can cause firstbyte to be set. Otherwise, there can be no first char if
3638 this item is first, whatever repeat count may follow. In the case of
3639 reqbyte, save the previous value for reinstating. */
3640
3641 #ifdef SUPPORT_UTF8
3642 if (class_charcount == 1 && !class_utf8 &&
3643 (!utf8 || !negate_class || class_lastchar < 128))
3644 #else
3645 if (class_charcount == 1)
3646 #endif
3647 {
3648 zeroreqbyte = reqbyte;
3649
3650 /* The OP_NOT opcode works on one-byte characters only. */
3651
3652 if (negate_class)
3653 {
3654 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3655 zerofirstbyte = firstbyte;
3656 *code++ = OP_NOT;
3657 *code++ = class_lastchar;
3658 break;
3659 }
3660
3661 /* For a single, positive character, get the value into mcbuffer, and
3662 then we can handle this with the normal one-character code. */
3663
3664 #ifdef SUPPORT_UTF8
3665 if (utf8 && class_lastchar > 127)
3666 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3667 else
3668 #endif
3669 {
3670 mcbuffer[0] = class_lastchar;
3671 mclength = 1;
3672 }
3673 goto ONE_CHAR;
3674 } /* End of 1-char optimization */
3675
3676 /* The general case - not the one-char optimization. If this is the first
3677 thing in the branch, there can be no first char setting, whatever the
3678 repeat count. Any reqbyte setting must remain unchanged after any kind of
3679 repeat. */
3680
3681 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3682 zerofirstbyte = firstbyte;
3683 zeroreqbyte = reqbyte;
3684
3685 /* If there are characters with values > 255, we have to compile an
3686 extended class, with its own opcode, unless there was a negated special
3687 such as \S in the class, because in that case all characters > 255 are in
3688 the class, so any that were explicitly given as well can be ignored. If
3689 (when there are explicit characters > 255 that must be listed) there are no
3690 characters < 256, we can omit the bitmap in the actual compiled code. */
3691
3692 #ifdef SUPPORT_UTF8
3693 if (class_utf8 && !should_flip_negation)
3694 {
3695 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3696 *code++ = OP_XCLASS;
3697 code += LINK_SIZE;
3698 *code = negate_class? XCL_NOT : 0;
3699
3700 /* If the map is required, move up the extra data to make room for it;
3701 otherwise just move the code pointer to the end of the extra data. */
3702
3703 if (class_charcount > 0)
3704 {
3705 *code++ |= XCL_MAP;
3706 memmove(code + 32, code, class_utf8data - code);
3707 memcpy(code, classbits, 32);
3708 code = class_utf8data + 32;
3709 }
3710 else code = class_utf8data;
3711
3712 /* Now fill in the complete length of the item */
3713
3714 PUT(previous, 1, code - previous);
3715 break; /* End of class handling */
3716 }
3717 #endif
3718
3719 /* If there are no characters > 255, set the opcode to OP_CLASS or
3720 OP_NCLASS, depending on whether the whole class was negated and whether
3721 there were negative specials such as \S in the class. Then copy the 32-byte
3722 map into the code vector, negating it if necessary. */
3723
3724 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3725 if (negate_class)
3726 {
3727 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3728 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3729 }
3730 else
3731 {
3732 memcpy(code, classbits, 32);
3733 }
3734 code += 32;
3735 break;
3736
3737
3738 /* ===================================================================*/
3739 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3740 has been tested above. */
3741
3742 case CHAR_LEFT_CURLY_BRACKET:
3743 if (!is_quantifier) goto NORMAL_CHAR;
3744 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3745 if (*errorcodeptr != 0) goto FAILED;
3746 goto REPEAT;
3747
3748 case CHAR_ASTERISK:
3749 repeat_min = 0;
3750 repeat_max = -1;
3751 goto REPEAT;
3752
3753 case CHAR_PLUS:
3754 repeat_min = 1;
3755 repeat_max = -1;
3756 goto REPEAT;
3757
3758 case CHAR_QUESTION_MARK:
3759 repeat_min = 0;
3760 repeat_max = 1;
3761
3762 REPEAT:
3763 if (previous == NULL)
3764 {
3765 *errorcodeptr = ERR9;
3766 goto FAILED;
3767 }
3768
3769 if (repeat_min == 0)
3770 {
3771 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3772 reqbyte = zeroreqbyte; /* Ditto */
3773 }
3774
3775 /* Remember whether this is a variable length repeat */
3776
3777 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3778
3779 op_type = 0; /* Default single-char op codes */
3780 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3781
3782 /* Save start of previous item, in case we have to move it up to make space
3783 for an inserted OP_ONCE for the additional '+' extension. */
3784
3785 tempcode = previous;
3786
3787 /* If the next character is '+', we have a possessive quantifier. This
3788 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3789 If the next character is '?' this is a minimizing repeat, by default,
3790 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3791 repeat type to the non-default. */
3792
3793 if (ptr[1] == CHAR_PLUS)
3794 {
3795 repeat_type = 0; /* Force greedy */
3796 possessive_quantifier = TRUE;
3797 ptr++;
3798 }
3799 else if (ptr[1] == CHAR_QUESTION_MARK)
3800 {
3801 repeat_type = greedy_non_default;
3802 ptr++;
3803 }
3804 else repeat_type = greedy_default;
3805
3806 /* If previous was a character match, abolish the item and generate a
3807 repeat item instead. If a char item has a minimum of more than one, ensure
3808 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3809 the first thing in a branch because the x will have gone into firstbyte
3810 instead. */
3811
3812 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3813 {
3814 /* Deal with UTF-8 characters that take up more than one byte. It's
3815 easier to write this out separately than try to macrify it. Use c to
3816 hold the length of the character in bytes, plus 0x80 to flag that it's a
3817 length rather than a small character. */
3818
3819 #ifdef SUPPORT_UTF8
3820 if (utf8 && (code[-1] & 0x80) != 0)
3821 {
3822 uschar *lastchar = code - 1;
3823 while((*lastchar & 0xc0) == 0x80) lastchar--;
3824 c = code - lastchar; /* Length of UTF-8 character */
3825 memcpy(utf8_char, lastchar, c); /* Save the char */
3826 c |= 0x80; /* Flag c as a length */
3827 }
3828 else
3829 #endif
3830
3831 /* Handle the case of a single byte - either with no UTF8 support, or
3832 with UTF-8 disabled, or for a UTF-8 character < 128. */
3833
3834 {
3835 c = code[-1];
3836 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3837 }
3838
3839 /* If the repetition is unlimited, it pays to see if the next thing on
3840 the line is something that cannot possibly match this character. If so,
3841 automatically possessifying this item gains some performance in the case
3842 where the match fails. */
3843
3844 if (!possessive_quantifier &&
3845 repeat_max < 0 &&
3846 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3847 options, cd))
3848 {
3849 repeat_type = 0; /* Force greedy */
3850 possessive_quantifier = TRUE;
3851 }
3852
3853 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3854 }
3855
3856 /* If previous was a single negated character ([^a] or similar), we use
3857 one of the special opcodes, replacing it. The code is shared with single-
3858 character repeats by setting op_type to add a suitable offset into
3859 repeat_type. We can also test for auto-possessification. OP_NOT is
3860 currently used only for single-byte chars. */
3861
3862 else if (*previous == OP_NOT)
3863 {
3864 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3865 c = previous[1];
3866 if (!possessive_quantifier &&
3867 repeat_max < 0 &&
3868 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3869 {
3870 repeat_type = 0; /* Force greedy */
3871 possessive_quantifier = TRUE;
3872 }
3873 goto OUTPUT_SINGLE_REPEAT;
3874 }
3875
3876 /* If previous was a character type match (\d or similar), abolish it and
3877 create a suitable repeat item. The code is shared with single-character
3878 repeats by setting op_type to add a suitable offset into repeat_type. Note
3879 that the Unicode property types will be present only when SUPPORT_UCP is
3880 defined, but we don't wrap the little bits of code here because it just
3881 makes it horribly messy. */
3882
3883 else if (*previous < OP_EODN)
3884 {
3885 uschar *oldcode;
3886 int prop_type, prop_value;
3887 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3888 c = *previous;
3889
3890 if (!possessive_quantifier &&
3891 repeat_max < 0 &&
3892 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3893 {
3894 repeat_type = 0; /* Force greedy */
3895 possessive_quantifier = TRUE;
3896 }
3897
3898 OUTPUT_SINGLE_REPEAT:
3899 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3900 {
3901 prop_type = previous[1];
3902 prop_value = previous[2];
3903 }
3904 else prop_type = prop_value = -1;
3905
3906 oldcode = code;
3907 code = previous; /* Usually overwrite previous item */
3908
3909 /* If the maximum is zero then the minimum must also be zero; Perl allows
3910 this case, so we do too - by simply omitting the item altogether. */
3911
3912 if (repeat_max == 0) goto END_REPEAT;
3913
3914 /*--------------------------------------------------------------------*/
3915 /* This code is obsolete from release 8.00; the restriction was finally
3916 removed: */
3917
3918 /* All real repeats make it impossible to handle partial matching (maybe
3919 one day we will be able to remove this restriction). */
3920
3921 /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
3922 /*--------------------------------------------------------------------*/
3923
3924 /* Combine the op_type with the repeat_type */
3925
3926 repeat_type += op_type;
3927
3928 /* A minimum of zero is handled either as the special case * or ?, or as
3929 an UPTO, with the maximum given. */
3930
3931 if (repeat_min == 0)
3932 {
3933 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3934 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3935 else
3936 {
3937 *code++ = OP_UPTO + repeat_type;
3938 PUT2INC(code, 0, repeat_max);
3939 }
3940 }
3941
3942 /* A repeat minimum of 1 is optimized into some special cases. If the
3943 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3944 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3945 one less than the maximum. */
3946
3947 else if (repeat_min == 1)
3948 {
3949 if (repeat_max == -1)
3950 *code++ = OP_PLUS + repeat_type;
3951 else
3952 {
3953 code = oldcode; /* leave previous item in place */
3954 if (repeat_max == 1) goto END_REPEAT;
3955 *code++ = OP_UPTO + repeat_type;
3956 PUT2INC(code, 0, repeat_max - 1);
3957 }
3958 }
3959
3960 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3961 handled as an EXACT followed by an UPTO. */
3962
3963 else
3964 {
3965 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3966 PUT2INC(code, 0, repeat_min);
3967
3968 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3969 we have to insert the character for the previous code. For a repeated
3970 Unicode property match, there are two extra bytes that define the
3971 required property. In UTF-8 mode, long characters have their length in
3972 c, with the 0x80 bit as a flag. */
3973
3974 if (repeat_max < 0)
3975 {
3976 #ifdef SUPPORT_UTF8
3977 if (utf8 && c >= 128)
3978 {
3979 memcpy(code, utf8_char, c & 7);
3980 code += c & 7;
3981 }
3982 else
3983 #endif
3984 {
3985 *code++ = c;
3986 if (prop_type >= 0)
3987 {
3988 *code++ = prop_type;
3989 *code++ = prop_value;
3990 }
3991 }
3992 *code++ = OP_STAR + repeat_type;
3993 }
3994
3995 /* Else insert an UPTO if the max is greater than the min, again
3996 preceded by the character, for the previously inserted code. If the
3997 UPTO is just for 1 instance, we can use QUERY instead. */
3998
3999 else if (repeat_max != repeat_min)
4000 {
4001 #ifdef SUPPORT_UTF8
4002 if (utf8 && c >= 128)
4003 {
4004 memcpy(code, utf8_char, c & 7);
4005 code += c & 7;
4006 }
4007 else
4008 #endif
4009 *code++ = c;
4010 if (prop_type >= 0)
4011 {
4012 *code++ = prop_type;
4013 *code++ = prop_value;
4014 }
4015 repeat_max -= repeat_min;
4016
4017 if (repeat_max == 1)
4018 {
4019 *code++ = OP_QUERY + repeat_type;
4020 }
4021 else
4022 {
4023 *code++ = OP_UPTO + repeat_type;
4024 PUT2INC(code, 0, repeat_max);
4025 }
4026 }
4027 }
4028
4029 /* The character or character type itself comes last in all cases. */
4030
4031 #ifdef SUPPORT_UTF8
4032 if (utf8 && c >= 128)
4033 {
4034 memcpy(code, utf8_char, c & 7);
4035 code += c & 7;
4036 }
4037 else
4038 #endif
4039 *code++ = c;
4040
4041 /* For a repeated Unicode property match, there are two extra bytes that
4042 define the required property. */
4043
4044 #ifdef SUPPORT_UCP
4045 if (prop_type >= 0)
4046 {
4047 *code++ = prop_type;
4048 *code++ = prop_value;
4049 }
4050 #endif
4051 }
4052
4053 /* If previous was a character class or a back reference, we put the repeat
4054 stuff after it, but just skip the item if the repeat was {0,0}. */
4055
4056 else if (*previous == OP_CLASS ||
4057 *previous == OP_NCLASS ||
4058 #ifdef SUPPORT_UTF8
4059 *previous == OP_XCLASS ||
4060 #endif
4061 *previous == OP_REF)
4062 {
4063 if (repeat_max == 0)
4064 {
4065 code = previous;
4066 goto END_REPEAT;
4067 }
4068
4069 /*--------------------------------------------------------------------*/
4070 /* This code is obsolete from release 8.00; the restriction was finally
4071 removed: */
4072
4073 /* All real repeats make it impossible to handle partial matching (maybe
4074 one day we will be able to remove this restriction). */
4075
4076 /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4077 /*--------------------------------------------------------------------*/
4078
4079 if (repeat_min == 0 && repeat_max == -1)
4080 *code++ = OP_CRSTAR + repeat_type;
4081 else if (repeat_min == 1 && repeat_max == -1)
4082 *code++ = OP_CRPLUS + repeat_type;
4083 else if (repeat_min == 0 && repeat_max == 1)
4084 *code++ = OP_CRQUERY + repeat_type;
4085 else
4086 {
4087 *code++ = OP_CRRANGE + repeat_type;
4088 PUT2INC(code, 0, repeat_min);
4089 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
4090 PUT2INC(code, 0, repeat_max);
4091 }
4092 }
4093
4094 /* If previous was a bracket group, we may have to replicate it in certain
4095 cases. */
4096
4097 else if (*previous == OP_BRA || *previous == OP_CBRA ||
4098 *previous == OP_ONCE || *previous == OP_COND)
4099 {
4100 register int i;
4101 int ketoffset = 0;
4102 int len = code - previous;
4103 uschar *bralink = NULL;
4104
4105 /* Repeating a DEFINE group is pointless */
4106
4107 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
4108 {
4109 *errorcodeptr = ERR55;
4110 goto FAILED;
4111 }
4112
4113 /* If the maximum repeat count is unlimited, find the end of the bracket
4114 by scanning through from the start, and compute the offset back to it
4115 from the current code pointer. There may be an OP_OPT setting following
4116 the final KET, so we can't find the end just by going back from the code
4117 pointer. */
4118
4119 if (repeat_max == -1)
4120 {
4121 register uschar *ket = previous;
4122 do ket += GET(ket, 1); while (*ket != OP_KET);
4123 ketoffset = code - ket;
4124 }
4125
4126 /* The case of a zero minimum is special because of the need to stick
4127 OP_BRAZERO in front of it, and because the group appears once in the
4128 data, whereas in other cases it appears the minimum number of times. For
4129 this reason, it is simplest to treat this case separately, as otherwise
4130 the code gets far too messy. There are several special subcases when the
4131 minimum is zero. */
4132
4133 if (repeat_min == 0)
4134 {
4135 /* If the maximum is also zero, we used to just omit the group from the
4136 output altogether, like this:
4137
4138 ** if (repeat_max == 0)
4139 ** {
4140 ** code = previous;
4141 ** goto END_REPEAT;
4142 ** }
4143
4144 However, that fails when a group is referenced as a subroutine from
4145 elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
4146 so that it is skipped on execution. As we don't have a list of which
4147 groups are referenced, we cannot do this selectively.
4148
4149 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
4150 and do no more at this point. However, we do need to adjust any
4151 OP_RECURSE calls inside the group that refer to the group itself or any
4152 internal or forward referenced group, because the offset is from the
4153 start of the whole regex. Temporarily terminate the pattern while doing
4154 this. */
4155
4156 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
4157 {
4158 *code = OP_END;
4159 adjust_recurse(previous, 1, utf8, cd, save_hwm);
4160 memmove(previous+1, previous, len);
4161 code++;
4162 if (repeat_max == 0)
4163 {
4164 *previous++ = OP_SKIPZERO;
4165 goto END_REPEAT;
4166 }
4167 *previous++ = OP_BRAZERO + repeat_type;
4168 }
4169
4170 /* If the maximum is greater than 1 and limited, we have to replicate
4171 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
4172 The first one has to be handled carefully because it's the original
4173 copy, which has to be moved up. The remainder can be handled by code
4174 that is common with the non-zero minimum case below. We have to
4175 adjust the value of repeat_max, since one less copy is required. Once
4176 again, we may have to adjust any OP_RECURSE calls inside the group. */
4177
4178 else
4179 {
4180 int offset;
4181 *code = OP_END;
4182 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
4183 memmove(previous + 2 + LINK_SIZE, previous, len);
4184 code += 2 + LINK_SIZE;
4185 *previous++ = OP_BRAZERO + repeat_type;
4186 *previous++ = OP_BRA;
4187
4188 /* We chain together the bracket offset fields that have to be
4189 filled in later when the ends of the brackets are reached. */
4190
4191 offset = (bralink == NULL)? 0 : previous - bralink;
4192 bralink = previous;
4193 PUTINC(previous, 0, offset);
4194 }
4195
4196 repeat_max--;
4197 }
4198
4199 /* If the minimum is greater than zero, replicate the group as many
4200 times as necessary, and adjust the maximum to the number of subsequent
4201 copies that we need. If we set a first char from the group, and didn't
4202 set a required char, copy the latter from the former. If there are any
4203 forward reference subroutine calls in the group, there will be entries on
4204 the workspace list; replicate these with an appropriate increment. */
4205
4206 else
4207 {
4208 if (repeat_min > 1)
4209 {
4210 /* In the pre-compile phase, we don't actually do the replication. We
4211 just adjust the length as if we had. Do some paranoid checks for
4212 potential integer overflow. */
4213
4214 if (lengthptr != NULL)
4215 {
4216 int delta = (repeat_min - 1)*length_prevgroup;
4217 if ((double)(repeat_min - 1)*(double)length_prevgroup >
4218 (double)INT_MAX ||
4219 OFLOW_MAX - *lengthptr < delta)
4220 {
4221 *errorcodeptr = ERR20;
4222 goto FAILED;
4223 }
4224 *lengthptr += delta;
4225 }
4226
4227 /* This is compiling for real */
4228
4229 else
4230 {
4231 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
4232 for (i = 1; i < repeat_min; i++)
4233 {
4234 uschar *hc;
4235 uschar *this_hwm = cd->hwm;
4236 memcpy(code, previous, len);
4237 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4238 {
4239 PUT(cd->hwm, 0, GET(hc, 0) + len);
4240 cd->hwm += LINK_SIZE;
4241 }
4242 save_hwm = this_hwm;
4243 code += len;
4244 }
4245 }
4246 }
4247
4248 if (repeat_max > 0) repeat_max -= repeat_min;
4249 }
4250
4251 /* This code is common to both the zero and non-zero minimum cases. If
4252 the maximum is limited, it replicates the group in a nested fashion,
4253 remembering the bracket starts on a stack. In the case of a zero minimum,
4254 the first one was set up above. In all cases the repeat_max now specifies
4255 the number of additional copies needed. Again, we must remember to
4256 replicate entries on the forward reference list. */
4257
4258 if (repeat_max >= 0)
4259 {
4260 /* In the pre-compile phase, we don't actually do the replication. We
4261 just adjust the length as if we had. For each repetition we must add 1
4262 to the length for BRAZERO and for all but the last repetition we must
4263 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
4264 paranoid checks to avoid integer overflow. */
4265
4266 if (lengthptr != NULL && repeat_max > 0)
4267 {
4268 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4269 2 - 2*LINK_SIZE; /* Last one doesn't nest */
4270 if ((double)repeat_max *
4271 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4272 > (double)INT_MAX ||
4273 OFLOW_MAX - *lengthptr < delta)
4274 {
4275 *errorcodeptr = ERR20;
4276 goto FAILED;
4277 }
4278 *lengthptr += delta;
4279 }
4280
4281 /* This is compiling for real */
4282
4283 else for (i = repeat_max - 1; i >= 0; i--)
4284 {
4285 uschar *hc;
4286 uschar *this_hwm = cd->hwm;
4287
4288 *code++ = OP_BRAZERO + repeat_type;
4289
4290 /* All but the final copy start a new nesting, maintaining the
4291 chain of brackets outstanding. */
4292
4293 if (i != 0)
4294 {
4295 int offset;
4296 *code++ = OP_BRA;
4297 offset = (bralink == NULL)? 0 : code - bralink;
4298 bralink = code;
4299 PUTINC(code, 0, offset);
4300 }
4301
4302 memcpy(code, previous, len);
4303 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4304 {
4305 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4306 cd->hwm += LINK_SIZE;
4307 }
4308 save_hwm = this_hwm;
4309 code += len;
4310 }
4311
4312 /* Now chain through the pending brackets, and fill in their length
4313 fields (which are holding the chain links pro tem). */
4314
4315 while (bralink != NULL)
4316 {
4317 int oldlinkoffset;
4318 int offset = code - bralink + 1;
4319 uschar *bra = code - offset;
4320 oldlinkoffset = GET(bra, 1);
4321 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4322 *code++ = OP_KET;
4323 PUTINC(code, 0, offset);
4324 PUT(bra, 1, offset);
4325 }
4326 }
4327
4328 /* If the maximum is unlimited, set a repeater in the final copy. We
4329 can't just offset backwards from the current code point, because we
4330 don't know if there's been an options resetting after the ket. The
4331 correct offset was computed above.
4332
4333 Then, when we are doing the actual compile phase, check to see whether
4334 this group is a non-atomic one that could match an empty string. If so,
4335 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4336 that runtime checking can be done. [This check is also applied to
4337 atomic groups at runtime, but in a different way.] */
4338
4339 else
4340 {
4341 uschar *ketcode = code - ketoffset;
4342 uschar *bracode = ketcode - GET(ketcode, 1);
4343 *ketcode = OP_KETRMAX + repeat_type;
4344 if (lengthptr == NULL && *bracode != OP_ONCE)
4345 {
4346 uschar *scode = bracode;
4347 do
4348 {
4349 if (could_be_empty_branch(scode, ketcode, utf8))
4350 {
4351 *bracode += OP_SBRA - OP_BRA;
4352 break;
4353 }
4354 scode += GET(scode, 1);
4355 }
4356 while (*scode == OP_ALT);
4357 }
4358 }
4359 }
4360
4361 /* If previous is OP_FAIL, it was generated by an empty class [] in
4362 JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4363 by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4364 error above. We can just ignore the repeat in JS case. */
4365
4366 else if (*previous == OP_FAIL) goto END_REPEAT;
4367
4368 /* Else there's some kind of shambles */
4369
4370 else
4371 {
4372 *errorcodeptr = ERR11;
4373 goto FAILED;
4374 }
4375
4376 /* If the character following a repeat is '+', or if certain optimization
4377 tests above succeeded, possessive_quantifier is TRUE. For some of the
4378 simpler opcodes, there is a special alternative opcode for this. For
4379 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4380 The '+' notation is just syntactic sugar, taken from Sun's Java package,
4381 but the special opcodes can optimize it a bit. The repeated item starts at
4382 tempcode, not at previous, which might be the first part of a string whose
4383 (former) last char we repeated.
4384
4385 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4386 an 'upto' may follow. We skip over an 'exact' item, and then test the
4387 length of what remains before proceeding. */
4388
4389 if (possessive_quantifier)
4390 {
4391 int len;
4392
4393 if (*tempcode == OP_TYPEEXACT)
4394 tempcode += _pcre_OP_lengths[*tempcode] +
4395 ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);
4396
4397 else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
4398 {
4399 tempcode += _pcre_OP_lengths[*tempcode];
4400 #ifdef SUPPORT_UTF8
4401 if (utf8 && tempcode[-1] >= 0xc0)
4402 tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];
4403 #endif
4404 }
4405
4406 len = code - tempcode;
4407 if (len > 0) switch (*tempcode)
4408 {
4409 case OP_STAR: *tempcode = OP_POSSTAR; break;
4410 case OP_PLUS: *tempcode = OP_POSPLUS; break;
4411 case OP_QUERY: *tempcode = OP_POSQUERY; break;
4412 case OP_UPTO: *tempcode = OP_POSUPTO; break;
4413
4414 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4415 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4416 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4417 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4418
4419 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4420 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4421 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4422 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4423
4424 default:
4425 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4426 code += 1 + LINK_SIZE;
4427 len += 1 + LINK_SIZE;
4428 tempcode[0] = OP_ONCE;
4429 *code++ = OP_KET;
4430 PUTINC(code, 0, len);
4431 PUT(tempcode, 1, len);
4432 break;
4433 }
4434 }
4435
4436 /* In all cases we no longer have a previous item. We also set the
4437 "follows varying string" flag for subsequently encountered reqbytes if
4438 it isn't already set and we have just passed a varying length item. */
4439
4440 END_REPEAT:
4441 previous = NULL;
4442 cd->req_varyopt |= reqvary;
4443 break;
4444
4445
4446 /* ===================================================================*/
4447 /* Start of nested parenthesized sub-expression, or comment or lookahead or
4448 lookbehind or option setting or condition or all the other extended
4449 parenthesis forms. */
4450
4451 case CHAR_LEFT_PARENTHESIS:
4452 newoptions = options;
4453 skipbytes = 0;
4454 bravalue = OP_CBRA;
4455 save_hwm = cd->hwm;
4456 reset_bracount = FALSE;
4457
4458 /* First deal with various "verbs" that can be introduced by '*'. */
4459
4460 if (*(++ptr) == CHAR_ASTERISK && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4461 {
4462 int i, namelen;
4463 const char *vn = verbnames;
4464 const uschar *name = ++ptr;
4465 previous = NULL;
4466 while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
4467 if (*ptr == CHAR_COLON)
4468 {
4469 *errorcodeptr = ERR59; /* Not supported */
4470 goto FAILED;
4471 }
4472 if (*ptr != CHAR_RIGHT_PARENTHESIS)
4473 {
4474 *errorcodeptr = ERR60;
4475 goto FAILED;
4476 }
4477 namelen = ptr - name;
4478 for (i = 0; i < verbcount; i++)
4479 {
4480 if (namelen == verbs[i].len &&
4481 strncmp((char *)name, vn, namelen) == 0)
4482 {
4483 /* Check for open captures before ACCEPT */
4484
4485 if (verbs[i].op == OP_ACCEPT)
4486 {
4487 open_capitem *oc;
4488 cd->had_accept = TRUE;
4489 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
4490 {
4491 *code++ = OP_CLOSE;
4492 PUT2INC(code, 0, oc->number);
4493 }
4494 }
4495 *code++ = verbs[i].op;
4496 break;
4497 }
4498 vn += verbs[i].len + 1;
4499 }
4500 if (i < verbcount) continue;
4501 *errorcodeptr = ERR60;
4502 goto FAILED;
4503 }
4504
4505 /* Deal with the extended parentheses; all are introduced by '?', and the
4506 appearance of any of them means that this is not a capturing group. */
4507
4508 else if (*ptr == CHAR_QUESTION_MARK)
4509 {
4510 int i, set, unset, namelen;
4511 int *optset;
4512 const uschar *name;
4513 uschar *slot;
4514
4515 switch (*(++ptr))
4516 {
4517 case CHAR_NUMBER_SIGN: /* Comment; skip to ket */
4518 ptr++;
4519 while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4520 if (*ptr == 0)
4521 {
4522 *errorcodeptr = ERR18;
4523 goto FAILED;
4524 }
4525 continue;
4526
4527
4528 /* ------------------------------------------------------------ */
4529 case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
4530 reset_bracount = TRUE;
4531 /* Fall through */
4532
4533 /* ------------------------------------------------------------ */
4534 case CHAR_COLON: /* Non-capturing bracket */
4535 bravalue = OP_BRA;
4536 ptr++;
4537 break;
4538
4539
4540 /* ------------------------------------------------------------ */
4541 case CHAR_LEFT_PARENTHESIS:
4542 bravalue = OP_COND; /* Conditional group */
4543
4544 /* A condition can be an assertion, a number (referring to a numbered
4545 group), a name (referring to a named group), or 'R', referring to
4546 recursion. R<digits> and R&name are also permitted for recursion tests.
4547
4548 There are several syntaxes for testing a named group: (?(name)) is used
4549 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4550
4551 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4552 be the recursive thing or the name 'R' (and similarly for 'R' followed
4553 by digits), and (b) a number could be a name that consists of digits.
4554 In both cases, we look for a name first; if not found, we try the other
4555 cases. */
4556
4557 /* For conditions that are assertions, check the syntax, and then exit
4558 the switch. This will take control down to where bracketed groups,
4559 including assertions, are processed. */
4560
4561 if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
4562 ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
4563 break;
4564
4565 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4566 below), and all need to skip 3 bytes at the start of the group. */
4567
4568 code[1+LINK_SIZE] = OP_CREF;
4569 skipbytes = 3;
4570 refsign = -1;
4571
4572 /* Check for a test for recursion in a named group. */
4573
4574 if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
4575 {
4576 terminator = -1;
4577 ptr += 2;
4578 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4579 }
4580
4581 /* Check for a test for a named group's having been set, using the Perl
4582 syntax (?(<name>) or (?('name') */
4583
4584 else if (ptr[1] == CHAR_LESS_THAN_SIGN)
4585 {
4586 terminator = CHAR_GREATER_THAN_SIGN;
4587 ptr++;
4588 }
4589 else if (ptr[1] == CHAR_APOSTROPHE)
4590 {
4591 terminator = CHAR_APOSTROPHE;
4592 ptr++;
4593 }
4594 else
4595 {
4596 terminator = 0;
4597 if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
4598 }
4599
4600 /* We now expect to read a name; anything else is an error */
4601
4602 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4603 {
4604 ptr += 1; /* To get the right offset */
4605 *errorcodeptr = ERR28;
4606 goto FAILED;
4607 }
4608
4609 /* Read the name, but also get it as a number if it's all digits */
4610
4611 recno = 0;
4612 name = ++ptr;
4613 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4614 {
4615 if (recno >= 0)
4616 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4617 recno * 10 + *ptr - CHAR_0 : -1;
4618 ptr++;
4619 }
4620 namelen = ptr - name;
4621
4622 if ((terminator > 0 && *ptr++ != terminator) ||
4623 *ptr++ != CHAR_RIGHT_PARENTHESIS)
4624 {
4625 ptr--; /* Error offset */
4626 *errorcodeptr = ERR26;
4627 goto FAILED;
4628 }
4629
4630 /* Do no further checking in the pre-compile phase. */
4631
4632 if (lengthptr != NULL) break;
4633
4634 /* In the real compile we do the work of looking for the actual
4635 reference. If the string started with "+" or "-" we require the rest to
4636 be digits, in which case recno will be set. */
4637
4638 if (refsign > 0)
4639 {
4640 if (recno <= 0)
4641 {
4642 *errorcodeptr = ERR58;
4643 goto FAILED;
4644 }
4645 recno = (refsign == CHAR_MINUS)?
4646 cd->bracount - recno + 1 : recno +cd->bracount;
4647 if (recno <= 0 || recno > cd->final_bracount)
4648 {
4649 *errorcodeptr = ERR15;
4650 goto FAILED;
4651 }
4652 PUT2(code, 2+LINK_SIZE, recno);
4653 break;
4654 }
4655
4656 /* Otherwise (did not start with "+" or "-"), start by looking for the
4657 name. */
4658
4659 slot = cd->name_table;
4660 for (i = 0; i < cd->names_found; i++)
4661 {
4662 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4663 slot += cd->name_entry_size;
4664 }
4665
4666 /* Found a previous named subpattern */
4667
4668 if (i < cd->names_found)
4669 {
4670 recno = GET2(slot, 0);
4671 PUT2(code, 2+LINK_SIZE, recno);
4672 }
4673
4674 /* Search the pattern for a forward reference */
4675
4676 else if ((i = find_parens(cd, name, namelen,
4677 (options & PCRE_EXTENDED) != 0)) > 0)
4678 {
4679 PUT2(code, 2+LINK_SIZE, i);
4680 }
4681
4682 /* If terminator == 0 it means that the name followed directly after
4683 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4684 some further alternatives to try. For the cases where terminator != 0
4685 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4686 now checked all the possibilities, so give an error. */
4687
4688 else if (terminator != 0)
4689 {
4690 *errorcodeptr = ERR15;
4691 goto FAILED;
4692 }
4693
4694 /* Check for (?(R) for recursion. Allow digits after R to specify a
4695 specific group number. */
4696
4697 else if (*name == CHAR_R)
4698 {
4699 recno = 0;
4700 for (i = 1; i < namelen; i++)
4701 {
4702 if ((digitab[name[i]] & ctype_digit) == 0)
4703 {
4704 *errorcodeptr = ERR15;
4705 goto FAILED;
4706 }
4707 recno = recno * 10 + name[i] - CHAR_0;
4708 }
4709 if (recno == 0) recno = RREF_ANY;
4710 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4711 PUT2(code, 2+LINK_SIZE, recno);
4712 }
4713
4714 /* Similarly, check for the (?(DEFINE) "condition", which is always
4715 false. */
4716
4717 else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)
4718 {
4719 code[1+LINK_SIZE] = OP_DEF;
4720 skipbytes = 1;
4721 }
4722
4723 /* Check for the "name" actually being a subpattern number. We are
4724 in the second pass here, so final_bracount is set. */
4725
4726 else if (recno > 0 && recno <= cd->final_bracount)
4727 {
4728 PUT2(code, 2+LINK_SIZE, recno);
4729 }
4730
4731 /* Either an unidentified subpattern, or a reference to (?(0) */
4732
4733 else
4734 {
4735 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4736 goto FAILED;
4737 }
4738 break;
4739
4740
4741 /* ------------------------------------------------------------ */
4742 case CHAR_EQUALS_SIGN: /* Positive lookahead */
4743 bravalue = OP_ASSERT;
4744 ptr++;
4745 break;
4746
4747
4748 /* ------------------------------------------------------------ */
4749 case CHAR_EXCLAMATION_MARK: /* Negative lookahead */
4750 ptr++;
4751 if (*ptr == CHAR_RIGHT_PARENTHESIS) /* Optimize (?!) */
4752 {
4753 *code++ = OP_FAIL;
4754 previous = NULL;
4755 continue;
4756 }
4757 bravalue = OP_ASSERT_NOT;
4758 break;
4759
4760
4761 /* ------------------------------------------------------------ */
4762 case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */
4763 switch (ptr[1])
4764 {
4765 case CHAR_EQUALS_SIGN: /* Positive lookbehind */
4766 bravalue = OP_ASSERTBACK;
4767 ptr += 2;
4768 break;
4769
4770 case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */
4771 bravalue = OP_ASSERTBACK_NOT;
4772 ptr += 2;
4773 break;
4774
4775 default: /* Could be name define, else bad */
4776 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4777 ptr++; /* Correct offset for error */
4778 *errorcodeptr = ERR24;
4779 goto FAILED;
4780 }
4781 break;
4782
4783
4784 /* ------------------------------------------------------------ */
4785 case CHAR_GREATER_THAN_SIGN: /* One-time brackets */
4786 bravalue = OP_ONCE;
4787 ptr++;
4788 break;
4789
4790
4791 /* ------------------------------------------------------------ */
4792 case CHAR_C: /* Callout - may be followed by digits; */
4793 previous_callout = code; /* Save for later completion */
4794 after_manual_callout = 1; /* Skip one item before completing */
4795 *code++ = OP_CALLOUT;
4796 {
4797 int n = 0;
4798 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4799 n = n * 10 + *ptr - CHAR_0;
4800 if (*ptr != CHAR_RIGHT_PARENTHESIS)
4801 {
4802 *errorcodeptr = ERR39;
4803 goto FAILED;
4804 }
4805 if (n > 255)
4806 {
4807 *errorcodeptr = ERR38;
4808 goto FAILED;
4809 }
4810 *code++ = n;
4811 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4812 PUT(code, LINK_SIZE, 0); /* Default length */
4813 code += 2 * LINK_SIZE;
4814 }
4815 previous = NULL;
4816 continue;
4817
4818
4819 /* ------------------------------------------------------------ */
4820 case CHAR_P: /* Python-style named subpattern handling */
4821 if (*(++ptr) == CHAR_EQUALS_SIGN ||
4822 *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */
4823 {
4824 is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
4825 terminator = CHAR_RIGHT_PARENTHESIS;
4826 goto NAMED_REF_OR_RECURSE;
4827 }
4828 else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */
4829 {
4830 *errorcodeptr = ERR41;
4831 goto FAILED;
4832 }
4833 /* Fall through to handle (?P< as (?< is handled */
4834
4835
4836 /* ------------------------------------------------------------ */
4837 DEFINE_NAME: /* Come here from (?< handling */
4838 case CHAR_APOSTROPHE:
4839 {
4840 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
4841 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
4842 name = ++ptr;
4843
4844 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4845 namelen = ptr - name;
4846
4847 /* In the pre-compile phase, just do a syntax check. */
4848
4849 if (lengthptr != NULL)
4850 {
4851 if (*ptr != terminator)
4852 {
4853 *errorcodeptr = ERR42;
4854 goto FAILED;
4855 }
4856 if (cd->names_found >= MAX_NAME_COUNT)
4857 {
4858 *errorcodeptr = ERR49;
4859 goto FAILED;
4860 }
4861 if (namelen + 3 > cd->name_entry_size)
4862 {
4863 cd->name_entry_size = namelen + 3;
4864 if (namelen > MAX_NAME_SIZE)
4865 {
4866 *errorcodeptr = ERR48;
4867 goto FAILED;
4868 }
4869 }
4870 }
4871
4872 /* In the real compile, create the entry in the table, maintaining
4873 alphabetical order. Duplicate names for different numbers are
4874 permitted only if PCRE_DUPNAMES is set. Duplicate names for the same
4875 number are always OK. (An existing number can be re-used if (?|
4876 appears in the pattern.) In either event, a duplicate name results in
4877 a duplicate entry in the table, even if the number is the same. This
4878 is because the number of names, and hence the table size, is computed
4879 in the pre-compile, and it affects various numbers and pointers which
4880 would all have to be modified, and the compiled code moved down, if
4881 duplicates with the same number were omitted from the table. This
4882 doesn't seem worth the hassle. However, *different* names for the
4883 same number are not permitted. */
4884
4885 else
4886 {
4887 BOOL dupname = FALSE;
4888 slot = cd->name_table;
4889
4890 for (i = 0; i < cd->names_found; i++)
4891 {
4892 int crc = memcmp(name, slot+2, namelen);
4893 if (crc == 0)
4894 {
4895 if (slot[2+namelen] == 0)
4896 {
4897 if (GET2(slot, 0) != cd->bracount + 1 &&
4898 (options & PCRE_DUPNAMES) == 0)
4899 {
4900 *errorcodeptr = ERR43;
4901 goto FAILED;
4902 }
4903 else dupname = TRUE;
4904 }
4905 else crc = -1; /* Current name is a substring */
4906 }
4907
4908 /* Make space in the table and break the loop for an earlier
4909 name. For a duplicate or later name, carry on. We do this for
4910 duplicates so that in the simple case (when (?| is not used) they
4911 are in order of their numbers. */
4912
4913 if (crc < 0)
4914 {
4915 memmove(slot + cd->name_entry_size, slot,
4916 (cd->names_found - i) * cd->name_entry_size);
4917 break;
4918 }
4919
4920 /* Continue the loop for a later or duplicate name */
4921
4922 slot += cd->name_entry_size;
4923 }
4924
4925 /* For non-duplicate names, check for a duplicate number before
4926 adding the new name. */
4927
4928 if (!dupname)
4929 {
4930 uschar *cslot = cd->name_table;
4931 for (i = 0; i < cd->names_found; i++)
4932 {
4933 if (cslot != slot)
4934 {
4935 if (GET2(cslot, 0) == cd->bracount + 1)
4936 {
4937 *errorcodeptr = ERR65;
4938 goto FAILED;
4939 }
4940 }
4941 else i--;
4942 cslot += cd->name_entry_size;
4943 }
4944 }
4945
4946 PUT2(slot, 0, cd->bracount + 1);
4947 memcpy(slot + 2, name, namelen);
4948 slot[2+namelen] = 0;
4949 }
4950 }
4951
4952 /* In both pre-compile and compile, count the number of names we've
4953 encountered. */
4954
4955 cd->names_found++;
4956 ptr++; /* Move past > or ' */
4957 goto NUMBERED_GROUP;
4958
4959
4960 /* ------------------------------------------------------------ */
4961 case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */
4962 terminator = CHAR_RIGHT_PARENTHESIS;
4963 is_recurse = TRUE;
4964 /* Fall through */
4965
4966 /* We come here from the Python syntax above that handles both
4967 references (?P=name) and recursion (?P>name), as well as falling
4968 through from the Perl recursion syntax (?&name). We also come here from
4969 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4970 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
4971
4972 NAMED_REF_OR_RECURSE:
4973 name = ++ptr;
4974 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4975 namelen = ptr - name;
4976
4977 /* In the pre-compile phase, do a syntax check and set a dummy
4978 reference number. */
4979
4980 if (lengthptr != NULL)
4981 {
4982 if (namelen == 0)
4983 {
4984 *errorcodeptr = ERR62;
4985 goto FAILED;
4986 }
4987 if (*ptr != terminator)
4988 {
4989 *errorcodeptr = ERR42;
4990 goto FAILED;
4991 }
4992 if (namelen > MAX_NAME_SIZE)
4993 {
4994 *errorcodeptr = ERR48;
4995 goto FAILED;
4996 }
4997 recno = 0;
4998 }
4999
5000 /* In the real compile, seek the name in the table. We check the name
5001 first, and then check that we have reached the end of the name in the
5002 table. That way, if the name is longer than any in the table,
5003 the comparison will fail without reading beyond the table entry. */
5004
5005 else
5006 {
5007 slot = cd->name_table;
5008 for (i = 0; i < cd->names_found; i++)
5009 {
5010 if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
5011 slot[2+namelen] == 0)
5012 break;
5013 slot += cd->name_entry_size;
5014 }
5015
5016 if (i < cd->names_found) /* Back reference */
5017 {
5018 recno = GET2(slot, 0);
5019 }
5020 else if ((recno = /* Forward back reference */
5021 find_parens(cd, name, namelen,
5022 (options & PCRE_EXTENDED) != 0)) <= 0)
5023 {
5024 *errorcodeptr = ERR15;
5025 goto FAILED;
5026 }
5027 }
5028
5029 /* In both phases, we can now go to the code that handles numerical
5030 recursion or backreferences. */
5031
5032 if (is_recurse) goto HANDLE_RECURSION;
5033 else goto HANDLE_REFERENCE;
5034
5035
5036 /* ------------------------------------------------------------ */
5037 case CHAR_R: /* Recursion */
5038 ptr++; /* Same as (?0) */
5039 /* Fall through */
5040
5041
5042 /* ------------------------------------------------------------ */
5043 case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */
5044 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
5045 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
5046 {
5047 const uschar *called;
5048 terminator = CHAR_RIGHT_PARENTHESIS;
5049
5050 /* Come here from the \g<...> and \g'...' code (Oniguruma
5051 compatibility). However, the syntax has been checked to ensure that
5052 the ... are a (signed) number, so that neither ERR63 nor ERR29 will
5053 be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
5054 ever be taken. */
5055
5056 HANDLE_NUMERICAL_RECURSION:
5057
5058 if ((refsign = *ptr) == CHAR_PLUS)
5059 {
5060 ptr++;
5061 if ((digitab[*ptr] & ctype_digit) == 0)
5062 {
5063 *errorcodeptr = ERR63;
5064 goto FAILED;
5065 }
5066 }
5067 else if (refsign == CHAR_MINUS)
5068 {
5069 if ((digitab[ptr[1]] & ctype_digit) == 0)
5070 goto OTHER_CHAR_AFTER_QUERY;
5071 ptr++;
5072 }
5073
5074 recno = 0;
5075 while((digitab[*ptr] & ctype_digit) != 0)
5076 recno = recno * 10 + *ptr++ - CHAR_0;
5077
5078 if (*ptr != terminator)
5079 {
5080 *errorcodeptr = ERR29;
5081 goto FAILED;
5082 }
5083
5084 if (refsign == CHAR_MINUS)
5085 {
5086 if (recno == 0)
5087 {
5088 *errorcodeptr = ERR58;
5089 goto FAILED;
5090 }
5091 recno = cd->bracount - recno + 1;
5092 if (recno <= 0)
5093 {
5094 *errorcodeptr = ERR15;
5095 goto FAILED;
5096 }
5097 }
5098 else if (refsign == CHAR_PLUS)
5099 {
5100 if (recno == 0)
5101 {
5102 *errorcodeptr = ERR58;
5103 goto FAILED;
5104 }
5105 recno += cd->bracount;
5106 }
5107
5108 /* Come here from code above that handles a named recursion */
5109
5110 HANDLE_RECURSION:
5111
5112 previous = code;
5113 called = cd->start_code;
5114
5115 /* When we are actually compiling, find the bracket that is being
5116 referenced. Temporarily end the regex in case it doesn't exist before
5117 this point. If we end up with a forward reference, first check that
5118 the bracket does occur later so we can give the error (and position)
5119 now. Then remember this forward reference in the workspace so it can
5120 be filled in at the end. */
5121
5122 if (lengthptr == NULL)
5123 {
5124 *code = OP_END;
5125 if (recno != 0)
5126 called = _pcre_find_bracket(cd->start_code, utf8, recno);
5127
5128 /* Forward reference */
5129
5130 if (called == NULL)
5131 {
5132 if (find_parens(cd, NULL, recno,
5133 (options & PCRE_EXTENDED) != 0) < 0)
5134 {
5135 *errorcodeptr = ERR15;
5136 goto FAILED;
5137 }
5138 called = cd->start_code + recno;
5139 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
5140 }
5141
5142 /* If not a forward reference, and the subpattern is still open,
5143 this is a recursive call. We check to see if this is a left
5144 recursion that could loop for ever, and diagnose that case. */
5145
5146 else if (GET(called, 1) == 0 &&
5147 could_be_empty(called, code, bcptr, utf8))
5148 {
5149 *errorcodeptr = ERR40;
5150 goto FAILED;
5151 }
5152 }
5153
5154 /* Insert the recursion/subroutine item, automatically wrapped inside
5155 "once" brackets. Set up a "previous group" length so that a
5156 subsequent quantifier will work. */
5157
5158 *code = OP_ONCE;
5159 PUT(code, 1, 2 + 2*LINK_SIZE);
5160 code += 1 + LINK_SIZE;
5161
5162 *code = OP_RECURSE;
5163 PUT(code, 1, called - cd->start_code);
5164 code += 1 + LINK_SIZE;
5165
5166 *code = OP_KET;
5167 PUT(code, 1, 2 + 2*LINK_SIZE);
5168 code += 1 + LINK_SIZE;
5169
5170 length_prevgroup = 3 + 3*LINK_SIZE;
5171 }
5172
5173 /* Can't determine a first byte now */
5174
5175 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5176 continue;
5177
5178
5179 /* ------------------------------------------------------------ */
5180 default: /* Other characters: check option setting */
5181 OTHER_CHAR_AFTER_QUERY:
5182 set = unset = 0;
5183 optset = &set;
5184
5185 while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
5186 {
5187 switch (*ptr++)
5188 {
5189 case CHAR_MINUS: optset = &unset; break;
5190
5191 case CHAR_J: /* Record that it changed in the external options */
5192 *optset |= PCRE_DUPNAMES;
5193 cd->external_flags |= PCRE_JCHANGED;
5194 break;
5195
5196 case CHAR_i: *optset |= PCRE_CASELESS; break;
5197 case CHAR_m: *optset |= PCRE_MULTILINE; break;
5198 case CHAR_s: *optset |= PCRE_DOTALL; break;
5199 case CHAR_x: *optset |= PCRE_EXTENDED; break;
5200 case CHAR_U: *optset |= PCRE_UNGREEDY; break;
5201 case CHAR_X: *optset |= PCRE_EXTRA; break;
5202
5203 default: *errorcodeptr = ERR12;
5204 ptr--; /* Correct the offset */
5205 goto FAILED;
5206 }
5207 }
5208
5209 /* Set up the changed option bits, but don't change anything yet. */
5210
5211 newoptions = (options | set) & (~unset);
5212
5213 /* If the options ended with ')' this is not the start of a nested
5214 group with option changes, so the options change at this level. If this
5215 item is right at the start of the pattern, the options can be
5216 abstracted and made external in the pre-compile phase, and ignored in
5217 the compile phase. This can be helpful when matching -- for instance in
5218 caseless checking of required bytes.
5219
5220 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
5221 definitely *not* at the start of the pattern because something has been
5222 compiled. In the pre-compile phase, however, the code pointer can have
5223 that value after the start, because it gets reset as code is discarded
5224 during the pre-compile. However, this can happen only at top level - if
5225 we are within parentheses, the starting BRA will still be present. At
5226 any parenthesis level, the length value can be used to test if anything
5227 has been compiled at that level. Thus, a test for both these conditions
5228 is necessary to ensure we correctly detect the start of the pattern in
5229 both phases.
5230
5231 If we are not at the pattern start, compile code to change the ims
5232 options if this setting actually changes any of them, and reset the
5233 greedy defaults and the case value for firstbyte and reqbyte. */
5234
5235 if (*ptr == CHAR_RIGHT_PARENTHESIS)
5236 {
5237 if (code == cd->start_code + 1 + LINK_SIZE &&
5238 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
5239 {
5240 cd->external_options = newoptions;
5241 }
5242 else
5243 {
5244 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
5245 {
5246 *code++ = OP_OPT;
5247 *code++ = newoptions & PCRE_IMS;
5248 }
5249 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
5250 greedy_non_default = greedy_default ^ 1;
5251 req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
5252 }
5253
5254 /* Change options at this level, and pass them back for use
5255 in subsequent branches. When not at the start of the pattern, this
5256 information is also necessary so that a resetting item can be
5257 compiled at the end of a group (if we are in a group). */
5258
5259 *optionsptr = options = newoptions;
5260 previous = NULL; /* This item can't be repeated */
5261 continue; /* It is complete */
5262 }
5263
5264 /* If the options ended with ':' we are heading into a nested group
5265 with possible change of options. Such groups are non-capturing and are
5266 not assertions of any kind. All we need to do is skip over the ':';
5267 the newoptions value is handled below. */
5268
5269 bravalue = OP_BRA;
5270 ptr++;
5271 } /* End of switch for character following (? */
5272 } /* End of (? handling */
5273
5274 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
5275 all unadorned brackets become non-capturing and behave like (?:...)
5276 brackets. */
5277
5278 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
5279 {
5280 bravalue = OP_BRA;
5281 }
5282
5283 /* Else we have a capturing group. */
5284
5285 else
5286 {
5287 NUMBERED_GROUP:
5288 cd->bracount += 1;
5289 PUT2(code, 1+LINK_SIZE, cd->bracount);
5290 skipbytes = 2;
5291 }
5292
5293 /* Process nested bracketed regex. Assertions may not be repeated, but
5294 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
5295 non-register variable in order to be able to pass its address because some
5296 compilers complain otherwise. Pass in a new setting for the ims options if
5297 they have changed. */
5298
5299 previous = (bravalue >= OP_ONCE)? code : NULL;
5300 *code = bravalue;
5301 tempcode = code;
5302 tempreqvary = cd->req_varyopt; /* Save value before bracket */
5303 length_prevgroup = 0; /* Initialize for pre-compile phase */
5304
5305 if (!compile_regex(
5306 newoptions, /* The complete new option state */
5307 options & PCRE_IMS, /* The previous ims option state */
5308 &tempcode, /* Where to put code (updated) */
5309 &ptr, /* Input pointer (updated) */
5310 errorcodeptr, /* Where to put an error message */
5311 (bravalue == OP_ASSERTBACK ||
5312 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
5313 reset_bracount, /* True if (?| group */
5314 skipbytes, /* Skip over bracket number */
5315 &subfirstbyte, /* For possible first char */
5316 &subreqbyte, /* For possible last char */
5317 bcptr, /* Current branch chain */
5318 cd, /* Tables block */
5319 (lengthptr == NULL)? NULL : /* Actual compile phase */
5320 &length_prevgroup /* Pre-compile phase */
5321 ))
5322 goto FAILED;
5323
5324 /* At the end of compiling, code is still pointing to the start of the
5325 group, while tempcode has been updated to point past the end of the group
5326 and any option resetting that may follow it. The pattern pointer (ptr)
5327 is on the bracket. */
5328
5329 /* If this is a conditional bracket, check that there are no more than
5330 two branches in the group, or just one if it's a DEFINE group. We do this
5331 in the real compile phase, not in the pre-pass, where the whole group may
5332 not be available. */
5333
5334 if (bravalue == OP_COND && lengthptr == NULL)
5335 {
5336 uschar *tc = code;
5337 int condcount = 0;
5338
5339 do {
5340 condcount++;
5341 tc += GET(tc,1);
5342 }
5343 while (*tc != OP_KET);
5344
5345 /* A DEFINE group is never obeyed inline (the "condition" is always
5346 false). It must have only one branch. */
5347
5348 if (code[LINK_SIZE+1] == OP_DEF)
5349 {
5350 if (condcount > 1)
5351 {
5352 *errorcodeptr = ERR54;
5353 goto FAILED;
5354 }
5355 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
5356 }
5357
5358 /* A "normal" conditional group. If there is just one branch, we must not
5359 make use of its firstbyte or reqbyte, because this is equivalent to an
5360 empty second branch. */
5361
5362 else
5363 {
5364 if (condcount > 2)
5365 {
5366 *errorcodeptr = ERR27;
5367 goto FAILED;
5368 }
5369 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
5370 }
5371 }
5372
5373 /* Error if hit end of pattern */
5374
5375 if (*ptr != CHAR_RIGHT_PARENTHESIS)
5376 {
5377 *errorcodeptr = ERR14;
5378 goto FAILED;
5379 }
5380
5381 /* In the pre-compile phase, update the length by the length of the group,
5382 less the brackets at either end. Then reduce the compiled code to just a
5383 set of non-capturing brackets so that it doesn't use much memory if it is
5384 duplicated by a quantifier.*/
5385
5386 if (lengthptr != NULL)
5387 {
5388 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
5389 {
5390 *errorcodeptr = ERR20;
5391 goto FAILED;
5392 }
5393 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5394 *code++ = OP_BRA;
5395 PUTINC(code, 0, 1 + LINK_SIZE);
5396 *code++ = OP_KET;
5397 PUTINC(code, 0, 1 + LINK_SIZE);
5398 break; /* No need to waste time with special character handling */
5399 }
5400
5401 /* Otherwise update the main code pointer to the end of the group. */
5402
5403 code = tempcode;
5404
5405 /* For a DEFINE group, required and first character settings are not
5406 relevant. */
5407
5408 if (bravalue == OP_DEF) break;
5409
5410 /* Handle updating of the required and first characters for other types of
5411 group. Update for normal brackets of all kinds, and conditions with two
5412 branches (see code above). If the bracket is followed by a quantifier with
5413 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5414 zerofirstbyte outside the main loop so that they can be accessed for the
5415 back off. */
5416
5417 zeroreqbyte = reqbyte;
5418 zerofirstbyte = firstbyte;
5419 groupsetfirstbyte = FALSE;
5420
5421 if (bravalue >= OP_ONCE)
5422 {
5423 /* If we have not yet set a firstbyte in this branch, take it from the
5424 subpattern, remembering that it was set here so that a repeat of more
5425 than one can replicate it as reqbyte if necessary. If the subpattern has
5426 no firstbyte, set "none" for the whole branch. In both cases, a zero
5427 repeat forces firstbyte to "none". */
5428
5429 if (firstbyte == REQ_UNSET)
5430 {
5431 if (subfirstbyte >= 0)
5432 {
5433 firstbyte = subfirstbyte;
5434 groupsetfirstbyte = TRUE;
5435 }
5436 else firstbyte = REQ_NONE;
5437 zerofirstbyte = REQ_NONE;
5438 }
5439
5440 /* If firstbyte was previously set, convert the subpattern's firstbyte
5441 into reqbyte if there wasn't one, using the vary flag that was in
5442 existence beforehand. */
5443
5444 else if (subfirstbyte >= 0 && subreqbyte < 0)
5445 subreqbyte = subfirstbyte | tempreqvary;
5446
5447 /* If the subpattern set a required byte (or set a first byte that isn't
5448 really the first byte - see above), set it. */
5449
5450 if (subreqbyte >= 0) reqbyte = subreqbyte;
5451 }
5452
5453 /* For a forward assertion, we take the reqbyte, if set. This can be
5454 helpful if the pattern that follows the assertion doesn't set a different
5455 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5456 for an assertion, however because it leads to incorrect effect for patterns
5457 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5458 of a firstbyte. This is overcome by a scan at the end if there's no
5459 firstbyte, looking for an asserted first char. */
5460
5461 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5462 break; /* End of processing '(' */
5463
5464
5465 /* ===================================================================*/
5466 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5467 are arranged to be the negation of the corresponding OP_values. For the
5468 back references, the values are ESC_REF plus the reference number. Only
5469 back references and those types that consume a character may be repeated.
5470 We can test for values between ESC_b and ESC_Z for the latter; this may
5471 have to change if any new ones are ever created. */
5472
5473 case CHAR_BACKSLASH:
5474 tempptr = ptr;
5475 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5476 if (*errorcodeptr != 0) goto FAILED;
5477
5478 if (c < 0)
5479 {
5480 if (-c == ESC_Q) /* Handle start of quoted string */
5481 {
5482 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5483 ptr += 2; /* avoid empty string */
5484 else inescq = TRUE;
5485 continue;
5486 }
5487
5488 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
5489
5490 /* For metasequences that actually match a character, we disable the
5491 setting of a first character if it hasn't already been set. */
5492
5493 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5494 firstbyte = REQ_NONE;
5495
5496 /* Set values to reset to if this is followed by a zero repeat. */
5497
5498 zerofirstbyte = firstbyte;
5499 zeroreqbyte = reqbyte;
5500
5501 /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5502 is a subroutine call by number (Oniguruma syntax). In fact, the value
5503 -ESC_g is returned only for these cases. So we don't need to check for <
5504 or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5505 -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5506 that is a synonym for a named back reference). */
5507
5508 if (-c == ESC_g)
5509 {
5510 const uschar *p;
5511 save_hwm = cd->hwm; /* Normally this is set when '(' is read */
5512 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5513 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5514
5515 /* These two statements stop the compiler from warning about possibly
5516 unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5517 fact, because we actually check for a number below, the paths that
5518 would actually be in error are never taken. */
5519
5520 skipbytes = 0;
5521 reset_bracount = FALSE;
5522
5523 /* Test for a name */
5524
5525 if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
5526 {
5527 BOOL isnumber = TRUE;
5528 for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5529 {
5530 if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5531 if ((cd->ctypes[*p] & ctype_word) == 0) break;
5532 }
5533 if (*p != terminator)
5534 {
5535 *errorcodeptr = ERR57;
5536 break;
5537 }
5538 if (isnumber)
5539 {
5540 ptr++;
5541 goto HANDLE_NUMERICAL_RECURSION;
5542 }
5543 is_recurse = TRUE;
5544 goto NAMED_REF_OR_RECURSE;
5545 }
5546
5547 /* Test a signed number in angle brackets or quotes. */
5548
5549 p = ptr + 2;
5550 while ((digitab[*p] & ctype_digit) != 0) p++;
5551 if (*p != terminator)
5552 {
5553 *errorcodeptr = ERR57;
5554 break;
5555 }
5556 ptr++;
5557 goto HANDLE_NUMERICAL_RECURSION;
5558 }
5559
5560 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5561 We also support \k{name} (.NET syntax) */
5562
5563 if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||
5564 ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))
5565 {
5566 is_recurse = FALSE;
5567 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5568 CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
5569 CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
5570 goto NAMED_REF_OR_RECURSE;
5571 }
5572
5573 /* Back references are handled specially; must disable firstbyte if
5574 not set to cope with cases like (?=(\w+))\1: which would otherwise set
5575 ':' later. */
5576
5577 if (-c >= ESC_REF)
5578 {
5579 recno = -c - ESC_REF;
5580
5581 HANDLE_REFERENCE: /* Come here from named backref handling */
5582 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5583 previous = code;
5584 *code++ = OP_REF;
5585 PUT2INC(code, 0, recno);
5586 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5587 if (recno > cd->top_backref) cd->top_backref = recno;
5588 }
5589
5590 /* So are Unicode property matches, if supported. */
5591
5592 #ifdef SUPPORT_UCP
5593 else if (-c == ESC_P || -c == ESC_p)
5594 {
5595 BOOL negated;
5596 int pdata;
5597 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5598 if (ptype < 0) goto FAILED;
5599 previous = code;
5600 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5601 *code++ = ptype;
5602 *code++ = pdata;
5603 }
5604 #else
5605
5606 /* If Unicode properties are not supported, \X, \P, and \p are not
5607 allowed. */
5608
5609 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5610 {
5611 *errorcodeptr = ERR45;
5612 goto FAILED;
5613 }
5614 #endif
5615
5616 /* For the rest (including \X when Unicode properties are supported), we
5617 can obtain the OP value by negating the escape value. */
5618
5619 else
5620 {
5621 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5622 *code++ = -c;
5623 }
5624 continue;
5625 }
5626
5627 /* We have a data character whose value is in c. In UTF-8 mode it may have
5628 a value > 127. We set its representation in the length/buffer, and then
5629 handle it as a data character. */
5630
5631 #ifdef SUPPORT_UTF8
5632 if (utf8 && c > 127)
5633 mclength = _pcre_ord2utf8(c, mcbuffer);
5634 else
5635 #endif
5636
5637 {
5638 mcbuffer[0] = c;
5639 mclength = 1;
5640 }
5641 goto ONE_CHAR;
5642
5643
5644 /* ===================================================================*/
5645 /* Handle a literal character. It is guaranteed not to be whitespace or #
5646 when the extended flag is set. If we are in UTF-8 mode, it may be a
5647 multi-byte literal character. */
5648
5649 default:
5650 NORMAL_CHAR:
5651 mclength = 1;
5652 mcbuffer[0] = c;
5653
5654 #ifdef SUPPORT_UTF8
5655 if (utf8 && c >= 0xc0)
5656 {
5657 while ((ptr[1] & 0xc0) == 0x80)
5658 mcbuffer[mclength++] = *(++ptr);
5659 }
5660 #endif
5661
5662 /* At this point we have the character's bytes in mcbuffer, and the length
5663 in mclength. When not in UTF-8 mode, the length is always 1. */
5664
5665 ONE_CHAR:
5666 previous = code;
5667 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5668 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5669
5670 /* Remember if \r or \n were seen */
5671
5672 if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
5673 cd->external_flags |= PCRE_HASCRORLF;
5674
5675 /* Set the first and required bytes appropriately. If no previous first
5676 byte, set it from this character, but revert to none on a zero repeat.
5677 Otherwise, leave the firstbyte value alone, and don't change it on a zero
5678 repeat. */
5679
5680 if (firstbyte == REQ_UNSET)
5681 {
5682 zerofirstbyte = REQ_NONE;
5683 zeroreqbyte = reqbyte;
5684
5685 /* If the character is more than one byte long, we can set firstbyte
5686 only if it is not to be matched caselessly. */
5687
5688 if (mclength == 1 || req_caseopt == 0)
5689 {
5690 firstbyte = mcbuffer[0] | req_caseopt;
5691 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5692 }
5693 else firstbyte = reqbyte = REQ_NONE;
5694 }
5695
5696 /* firstbyte was previously set; we can set reqbyte only if the length is
5697 1 or the matching is caseful. */
5698
5699 else
5700 {
5701 zerofirstbyte = firstbyte;
5702 zeroreqbyte = reqbyte;
5703 if (mclength == 1 || req_caseopt == 0)
5704 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5705 }
5706
5707 break; /* End of literal character handling */
5708 }
5709 } /* end of big loop */
5710
5711
5712 /* Control never reaches here by falling through, only by a goto for all the
5713 error states. Pass back the position in the pattern so that it can be displayed
5714 to the user for diagnosing the error. */
5715
5716 FAILED:
5717 *ptrptr = ptr;
5718 return FALSE;
5719 }
5720
5721
5722
5723
5724 /*************************************************
5725 * Compile sequence of alternatives *
5726 *************************************************/
5727
5728 /* On entry, ptr is pointing past the bracket character, but on return it
5729 points to the closing bracket, or vertical bar, or end of string. The code
5730 variable is pointing at the byte into which the BRA operator has been stored.
5731 If the ims options are changed at the start (for a (?ims: group) or during any
5732 branch, we need to insert an OP_OPT item at the start of every following branch
5733 to ensure they get set correctly at run time, and also pass the new options
5734 into every subsequent branch compile.
5735
5736 This function is used during the pre-compile phase when we are trying to find
5737 out the amount of memory needed, as well as during the real compile phase. The
5738 value of lengthptr distinguishes the two phases.
5739
5740 Arguments:
5741 options option bits, including any changes for this subpattern
5742 oldims previous settings of ims option bits
5743 codeptr -> the address of the current code pointer
5744 ptrptr -> the address of the current pattern pointer
5745 errorcodeptr -> pointer to error code variable
5746 lookbehind TRUE if this is a lookbehind assertion
5747 reset_bracount TRUE to reset the count for each branch
5748 skipbytes skip this many bytes at start (for brackets and OP_COND)
5749 firstbyteptr place to put the first required character, or a negative number
5750 reqbyteptr place to put the last required character, or a negative number
5751 bcptr pointer to the chain of currently open branches
5752 cd points to the data block with tables pointers etc.
5753 lengthptr NULL during the real compile phase
5754 points to length accumulator during pre-compile phase
5755
5756 Returns: TRUE on success
5757 */
5758
5759 static BOOL
5760 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5761 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5762 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5763 int *lengthptr)
5764 {
5765 const uschar *ptr = *ptrptr;
5766 uschar *code = *codeptr;
5767 uschar *last_branch = code;
5768 uschar *start_bracket = code;
5769 uschar *reverse_count = NULL;
5770 open_capitem capitem;
5771 int capnumber = 0;
5772 int firstbyte, reqbyte;
5773 int branchfirstbyte, branchreqbyte;
5774 int length;
5775 int orig_bracount;
5776 int max_bracount;
5777 branch_chain bc;
5778
5779 bc.outer = bcptr;
5780 bc.current = code;
5781
5782 firstbyte = reqbyte = REQ_UNSET;
5783
5784 /* Accumulate the length for use in the pre-compile phase. Start with the
5785 length of the BRA and KET and any extra bytes that are required at the
5786 beginning. We accumulate in a local variable to save frequent testing of
5787 lenthptr for NULL. We cannot do this by looking at the value of code at the
5788 start and end of each alternative, because compiled items are discarded during
5789 the pre-compile phase so that the work space is not exceeded. */
5790
5791 length = 2 + 2*LINK_SIZE + skipbytes;
5792
5793 /* WARNING: If the above line is changed for any reason, you must also change
5794 the code that abstracts option settings at the start of the pattern and makes
5795 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5796 pre-compile phase to find out whether anything has yet been compiled or not. */
5797
5798 /* If this is a capturing subpattern, add to the chain of open capturing items
5799 so that we can detect them if (*ACCEPT) is encountered. */
5800
5801 if (*code == OP_CBRA)
5802 {
5803 capnumber = GET2(code, 1 + LINK_SIZE);
5804 capitem.number = capnumber;
5805 capitem.next = cd->open_caps;
5806 cd->open_caps = &capitem;
5807 }
5808
5809 /* Offset is set zero to mark that this bracket is still open */
5810
5811 PUT(code, 1, 0);
5812 code += 1 + LINK_SIZE + skipbytes;
5813
5814 /* Loop for each alternative branch */
5815
5816 orig_bracount = max_bracount = cd->bracount;
5817 for (;;)
5818 {
5819 /* For a (?| group, reset the capturing bracket count so that each branch
5820 uses the same numbers. */
5821
5822 if (reset_bracount) cd->bracount = orig_bracount;
5823
5824 /* Handle a change of ims options at the start of the branch */
5825
5826 if ((options & PCRE_IMS) != oldims)
5827 {
5828 *code++ = OP_OPT;
5829 *code++ = options & PCRE_IMS;
5830 length += 2;
5831 }
5832
5833 /* Set up dummy OP_REVERSE if lookbehind assertion */
5834
5835 if (lookbehind)
5836 {
5837 *code++ = OP_REVERSE;
5838 reverse_count = code;
5839 PUTINC(code, 0, 0);
5840 length += 1 + LINK_SIZE;
5841 }
5842
5843 /* Now compile the branch; in the pre-compile phase its length gets added
5844 into the length. */
5845
5846 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5847 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5848 {
5849 *ptrptr = ptr;
5850 return FALSE;
5851 }
5852
5853 /* Keep the highest bracket count in case (?| was used and some branch
5854 has fewer than the rest. */
5855
5856 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5857
5858 /* In the real compile phase, there is some post-processing to be done. */
5859
5860 if (lengthptr == NULL)
5861 {
5862 /* If this is the first branch, the firstbyte and reqbyte values for the
5863 branch become the values for the regex. */
5864
5865 if (*last_branch != OP_ALT)
5866 {
5867 firstbyte = branchfirstbyte;
5868 reqbyte = branchreqbyte;
5869 }
5870
5871 /* If this is not the first branch, the first char and reqbyte have to
5872 match the values from all the previous branches, except that if the
5873 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5874 and we set REQ_VARY for the regex. */
5875
5876 else
5877 {
5878 /* If we previously had a firstbyte, but it doesn't match the new branch,
5879 we have to abandon the firstbyte for the regex, but if there was
5880 previously no reqbyte, it takes on the value of the old firstbyte. */
5881
5882 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5883 {
5884 if (reqbyte < 0) reqbyte = firstbyte;
5885 firstbyte = REQ_NONE;
5886 }
5887
5888 /* If we (now or from before) have no firstbyte, a firstbyte from the
5889 branch becomes a reqbyte if there isn't a branch reqbyte. */
5890
5891 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5892 branchreqbyte = branchfirstbyte;
5893
5894 /* Now ensure that the reqbytes match */
5895
5896 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5897 reqbyte = REQ_NONE;
5898 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5899 }
5900
5901 /* If lookbehind, check that this branch matches a fixed-length string, and
5902 put the length into the OP_REVERSE item. Temporarily mark the end of the
5903 branch with OP_END. If the branch contains OP_RECURSE, the result is -3
5904 because there may be forward references that we can't check here. Set a
5905 flag to cause another lookbehind check at the end. Why not do it all at the
5906 end? Because common, erroneous checks are picked up here and the offset of
5907 the problem can be shown. */
5908
5909 if (lookbehind)
5910 {
5911 int fixed_length;
5912 *code = OP_END;
5913 fixed_length = find_fixedlength(last_branch, options, FALSE, cd);
5914 DPRINTF(("fixed length = %d\n", fixed_length));
5915 if (fixed_length == -3)
5916 {
5917 cd->check_lookbehind = TRUE;
5918 }
5919 else if (fixed_length < 0)
5920 {
5921 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5922 *ptrptr = ptr;
5923 return FALSE;
5924 }
5925 else { PUT(reverse_count, 0, fixed_length); }
5926 }
5927 }
5928
5929 /* Reached end of expression, either ')' or end of pattern. In the real
5930 compile phase, go back through the alternative branches and reverse the chain
5931 of offsets, with the field in the BRA item now becoming an offset to the
5932 first alternative. If there are no alternatives, it points to the end of the
5933 group. The length in the terminating ket is always the length of the whole
5934 bracketed item. If any of the ims options were changed inside the group,
5935 compile a resetting op-code following, except at the very end of the pattern.
5936 Return leaving the pointer at the terminating char. */
5937
5938 if (*ptr != CHAR_VERTICAL_LINE)
5939 {
5940 if (lengthptr == NULL)
5941 {
5942 int branch_length = code - last_branch;
5943 do
5944 {
5945 int prev_length = GET(last_branch, 1);
5946 PUT(last_branch, 1, branch_length);
5947 branch_length = prev_length;
5948 last_branch -= branch_length;
5949 }
5950 while (branch_length > 0);
5951 }
5952
5953 /* If it was a capturing subpattern, remove it from the chain. */
5954
5955 if (capnumber > 0) cd->open_caps = cd->open_caps->next;
5956
5957 /* Fill in the ket */
5958
5959 *code = OP_KET;
5960 PUT(code, 1, code - start_bracket);
5961 code += 1 + LINK_SIZE;
5962
5963 /* Resetting option if needed */
5964
5965 if ((options & PCRE_IMS) != oldims && *ptr == CHAR_RIGHT_PARENTHESIS)
5966 {
5967 *code++ = OP_OPT;
5968 *code++ = oldims;
5969 length += 2;
5970 }
5971
5972 /* Retain the highest bracket number, in case resetting was used. */
5973
5974 cd->bracount = max_bracount;
5975
5976 /* Set values to pass back */
5977
5978 *codeptr = code;
5979 *ptrptr = ptr;
5980 *firstbyteptr = firstbyte;
5981 *reqbyteptr = reqbyte;
5982 if (lengthptr != NULL)
5983 {
5984 if (OFLOW_MAX - *lengthptr < length)
5985 {
5986 *errorcodeptr = ERR20;
5987 return FALSE;
5988 }
5989 *lengthptr += length;
5990 }
5991 return TRUE;
5992 }
5993
5994 /* Another branch follows. In the pre-compile phase, we can move the code
5995 pointer back to where it was for the start of the first branch. (That is,
5996 pretend that each branch is the only one.)
5997
5998 In the real compile phase, insert an ALT node. Its length field points back
5999 to the previous branch while the bracket remains open. At the end the chain
6000 is reversed. It's done like this so that the start of the bracket has a
6001 zero offset until it is closed, making it possible to detect recursion. */
6002
6003 if (lengthptr != NULL)
6004 {
6005 code = *codeptr + 1 + LINK_SIZE + skipbytes;
6006 length += 1 + LINK_SIZE;
6007 }
6008 else
6009 {
6010 *code = OP_ALT;
6011 PUT(code, 1, code - last_branch);
6012 bc.current = last_branch = code;
6013 code += 1 + LINK_SIZE;
6014 }
6015
6016 ptr++;
6017 }
6018 /* Control never reaches here */
6019 }
6020
6021
6022
6023
6024 /*************************************************
6025 * Check for anchored expression *
6026 *************************************************/
6027
6028 /* Try to find out if this is an anchored regular expression. Consider each
6029 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
6030 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
6031 it's anchored. However, if this is a multiline pattern, then only OP_SOD
6032 counts, since OP_CIRC can match in the middle.
6033
6034 We can also consider a regex to be anchored if OP_SOM starts all its branches.
6035 This is the code for \G, which means "match at start of match position, taking
6036 into account the match offset".
6037
6038 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
6039 because that will try the rest of the pattern at all possible matching points,
6040 so there is no point trying again.... er ....
6041
6042 .... except when the .* appears inside capturing parentheses, and there is a
6043 subsequent back reference to those parentheses. We haven't enough information
6044 to catch that case precisely.
6045
6046 At first, the best we could do was to detect when .* was in capturing brackets
6047 and the highest back reference was greater than or equal to that level.
6048 However, by keeping a bitmap of the first 31 back references, we can catch some
6049 of the more common cases more precisely.
6050
6051 Arguments:
6052 code points to start of expression (the bracket)
6053 options points to the options setting
6054 bracket_map a bitmap of which brackets we are inside while testing; this
6055 handles up to substring 31; after that we just have to take
6056 the less precise approach
6057 backref_map the back reference bitmap
6058
6059 Returns: TRUE or FALSE
6060 */
6061
6062 static BOOL
6063 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
6064 unsigned int backref_map)
6065 {
6066 do {
6067 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6068 options, PCRE_MULTILINE, FALSE);
6069 register int op = *scode;
6070
6071 /* Non-capturing brackets */
6072
6073 if (op == OP_BRA)
6074 {
6075 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
6076 }
6077
6078 /* Capturing brackets */
6079
6080 else if (op == OP_CBRA)
6081 {
6082 int n = GET2(scode, 1+LINK_SIZE);
6083 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
6084 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
6085 }
6086
6087 /* Other brackets */
6088
6089 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
6090 {
6091 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
6092 }
6093
6094 /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
6095 it isn't in brackets that are or may be referenced. */
6096
6097 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
6098 op == OP_TYPEPOSSTAR))
6099 {
6100 if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
6101 return FALSE;
6102 }
6103
6104 /* Check for explicit anchoring */
6105
6106 else if (op != OP_SOD && op != OP_SOM &&
6107 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
6108 return FALSE;
6109 code += GET(code, 1);
6110 }
6111 while (*code == OP_ALT); /* Loop for each alternative */
6112 return TRUE;
6113 }
6114
6115
6116
6117 /*************************************************
6118 * Check for starting with ^ or .* *
6119 *************************************************/
6120
6121 /* This is called to find out if every branch starts with ^ or .* so that
6122 "first char" processing can be done to speed things up in multiline
6123 matching and for non-DOTALL patterns that start with .* (which must start at
6124 the beginning or after \n). As in the case of is_anchored() (see above), we
6125 have to take account of back references to capturing brackets that contain .*
6126 because in that case we can't make the assumption.
6127
6128 Arguments:
6129 code points to start of expression (the bracket)
6130 bracket_map a bitmap of which brackets we are inside while testing; this
6131 handles up to substring 31; after that we just have to take
6132 the less precise approach
6133 backref_map the back reference bitmap
6134
6135 Returns: TRUE or FALSE
6136 */
6137
6138 static BOOL
6139 is_startline(const uschar *code, unsigned int bracket_map,
6140 unsigned int backref_map)
6141 {
6142 do {
6143 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6144 NULL, 0, FALSE);
6145 register int op = *scode;
6146
6147 /* If we are at the start of a conditional assertion group, *both* the
6148 conditional assertion *and* what follows the condition must satisfy the test
6149 for start of line. Other kinds of condition fail. Note that there may be an
6150 auto-callout at the start of a condition. */
6151
6152 if (op == OP_COND)
6153 {
6154 scode += 1 + LINK_SIZE;
6155 if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT];
6156 switch (*scode)
6157 {
6158 case OP_CREF:
6159 case OP_RREF:
6160 case OP_DEF:
6161 return FALSE;
6162
6163 default: /* Assertion */
6164 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6165 do scode += GET(scode, 1); while (*scode == OP_ALT);
6166 scode += 1 + LINK_SIZE;
6167 break;
6168 }
6169 scode = first_significant_code(scode, NULL, 0, FALSE);
6170 op = *scode;
6171 }
6172
6173 /* Non-capturing brackets */
6174
6175 if (op == OP_BRA)
6176 {
6177 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6178 }
6179
6180 /* Capturing brackets */
6181
6182 else if (op == OP_CBRA)
6183 {
6184 int n = GET2(scode, 1+LINK_SIZE);
6185 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
6186 if (!is_startline(scode, new_map, backref_map)) return FALSE;
6187 }
6188
6189 /* Other brackets */
6190
6191 else if (op == OP_ASSERT || op == OP_ONCE)
6192 {
6193 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6194 }
6195
6196 /* .* means "start at start or after \n" if it isn't in brackets that
6197 may be referenced. */
6198
6199 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
6200 {
6201 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
6202 }
6203
6204 /* Check for explicit circumflex */
6205
6206 else if (op != OP_CIRC) return FALSE;
6207
6208 /* Move on to the next alternative */
6209
6210 code += GET(code, 1);
6211 }
6212 while (*code == OP_ALT); /* Loop for each alternative */
6213 return TRUE;
6214 }
6215
6216
6217
6218 /*************************************************
6219 * Check for asserted fixed first char *
6220 *************************************************/
6221
6222 /* During compilation, the "first char" settings from forward assertions are
6223 discarded, because they can cause conflicts with actual literals that follow.
6224 However, if we end up without a first char setting for an unanchored pattern,
6225 it is worth scanning the regex to see if there is an initial asserted first
6226 char. If all branches start with the same asserted char, or with a bracket all
6227 of whose alternatives start with the same asserted char (recurse ad lib), then
6228 we return that char, otherwise -1.
6229
6230 Arguments:
6231 code points to start of expression (the bracket)
6232 options pointer to the options (used to check casing changes)
6233 inassert TRUE if in an assertion
6234
6235 Returns: -1 or the fixed first char
6236 */
6237
6238 static int
6239 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
6240 {
6241 register int c = -1;
6242 do {
6243 int d;
6244 const uschar *scode =
6245 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
6246 register int op = *scode;
6247
6248 switch(op)
6249 {
6250 default:
6251 return -1;
6252
6253 case OP_BRA:
6254 case OP_CBRA:
6255 case OP_ASSERT:
6256 case OP_ONCE:
6257 case OP_COND:
6258 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
6259 return -1;
6260 if (c < 0) c = d; else if (c != d) return -1;
6261 break;
6262
6263 case OP_EXACT: /* Fall through */
6264 scode += 2;
6265
6266 case OP_CHAR:
6267 case OP_CHARNC:
6268 case OP_PLUS:
6269 case OP_MINPLUS:
6270 case OP_POSPLUS:
6271 if (!inassert) return -1;
6272 if (c < 0)
6273 {
6274 c = scode[1];
6275 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
6276 }
6277 else if (c != scode[1]) return -1;
6278 break;
6279 }
6280
6281 code += GET(code, 1);
6282 }
6283 while (*code == OP_ALT);
6284 return c;
6285 }
6286
6287
6288
6289 /*************************************************
6290 * Compile a Regular Expression *
6291 *************************************************/
6292
6293 /* This function takes a string and returns a pointer to a block of store
6294 holding a compiled version of the expression. The original API for this
6295 function had no error code return variable; it is retained for backwards
6296 compatibility. The new function is given a new name.
6297
6298 Arguments:
6299 pattern the regular expression
6300 options various option bits
6301 errorcodeptr pointer to error code variable (pcre_compile2() only)
6302 can be NULL if you don't want a code value
6303 errorptr pointer to pointer to error text
6304 erroroffset ptr offset in pattern where error was detected
6305 tables pointer to character tables or NULL
6306
6307 Returns: pointer to compiled data block, or NULL on error,
6308 with errorptr and erroroffset set
6309 */
6310
6311 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6312 pcre_compile(const char *pattern, int options, const char **errorptr,
6313 int *erroroffset, const unsigned char *tables)
6314 {
6315 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
6316 }
6317
6318
6319 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6320 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
6321 const char **errorptr, int *erroroffset, const unsigned char *tables)
6322 {
6323 real_pcre *re;
6324 int length = 1; /* For final END opcode */
6325 int firstbyte, reqbyte, newline;
6326 int errorcode = 0;
6327 int skipatstart = 0;
6328 BOOL utf8 = (options & PCRE_UTF8) != 0;
6329 size_t size;
6330 uschar *code;
6331 const uschar *codestart;
6332 const uschar *ptr;
6333 compile_data compile_block;
6334 compile_data *cd = &compile_block;
6335
6336 /* This space is used for "compiling" into during the first phase, when we are
6337 computing the amount of memory that is needed. Compiled items are thrown away
6338 as soon as possible, so that a fairly large buffer should be sufficient for
6339 this purpose. The same space is used in the second phase for remembering where
6340 to fill in forward references to subpatterns. */
6341
6342 uschar cworkspace[COMPILE_WORK_SIZE];
6343
6344 /* Set this early so that early errors get offset 0. */
6345
6346 ptr = (const uschar *)pattern;
6347
6348 /* We can't pass back an error message if errorptr is NULL; I guess the best we
6349 can do is just return NULL, but we can set a code value if there is a code
6350 pointer. */
6351
6352 if (errorptr == NULL)
6353 {
6354 if (errorcodeptr != NULL) *errorcodeptr = 99;
6355 return NULL;
6356 }
6357
6358 *errorptr = NULL;
6359 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
6360
6361 /* However, we can give a message for this error */
6362
6363 if (erroroffset == NULL)
6364 {
6365 errorcode = ERR16;
6366 goto PCRE_EARLY_ERROR_RETURN2;
6367 }
6368
6369 *erroroffset = 0;
6370
6371 /* Set up pointers to the individual character tables */
6372
6373 if (tables == NULL) tables = _pcre_default_tables;
6374 cd->lcc = tables + lcc_offset;
6375 cd->fcc = tables + fcc_offset;
6376 cd->cbits = tables + cbits_offset;
6377 cd->ctypes = tables + ctypes_offset;
6378
6379 /* Check that all undefined public option bits are zero */
6380
6381 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
6382 {
6383 errorcode = ERR17;
6384 goto PCRE_EARLY_ERROR_RETURN;
6385 }
6386
6387 /* Check for global one-time settings at the start of the pattern, and remember
6388 the offset for later. */
6389
6390 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
6391 ptr[skipatstart+1] == CHAR_ASTERISK)
6392 {
6393 int newnl = 0;
6394 int newbsr = 0;
6395
6396 if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)
6397 { skipatstart += 7; options |= PCRE_UTF8; continue; }
6398
6399 if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
6400 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
6401 else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3) == 0)
6402 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
6403 else if (strncmp((char *)(ptr+skipatstart+2), STRING_CRLF_RIGHTPAR, 5) == 0)
6404 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
6405 else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANY_RIGHTPAR, 4) == 0)
6406 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
6407 else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANYCRLF_RIGHTPAR, 8) == 0)
6408 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
6409
6410 else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
6411 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
6412 else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
6413 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
6414
6415 if (newnl != 0)
6416 options = (options & ~PCRE_NEWLINE_BITS) | newnl;
6417 else if (newbsr != 0)
6418 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
6419 else break;
6420 }
6421
6422 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
6423
6424 #ifdef SUPPORT_UTF8
6425 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
6426 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
6427 {
6428 errorcode = ERR44;
6429 goto PCRE_EARLY_ERROR_RETURN2;
6430 }
6431 #else
6432 if (utf8)
6433 {
6434 errorcode = ERR32;
6435 goto PCRE_EARLY_ERROR_RETURN;
6436 }
6437 #endif
6438
6439 /* Check validity of \R options. */
6440
6441 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6442 {
6443 case 0:
6444 case PCRE_BSR_ANYCRLF:
6445 case PCRE_BSR_UNICODE:
6446 break;
6447 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6448 }
6449
6450 /* Handle different types of newline. The three bits give seven cases. The
6451 current code allows for fixed one- or two-byte sequences, plus "any" and
6452 "anycrlf". */
6453
6454 switch (options & PCRE_NEWLINE_BITS)
6455 {
6456 case 0: newline = NEWLINE; break; /* Build-time default */
6457 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6458 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6459 case PCRE_NEWLINE_CR+
6460 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6461 case PCRE_NEWLINE_ANY: newline = -1; break;
6462 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6463 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6464 }
6465
6466 if (newline == -2)
6467 {
6468 cd->nltype = NLTYPE_ANYCRLF;
6469 }
6470 else if (newline < 0)
6471 {
6472 cd->nltype = NLTYPE_ANY;
6473 }
6474 else
6475 {
6476 cd->nltype = NLTYPE_FIXED;
6477 if (newline > 255)
6478 {
6479 cd->nllen = 2;
6480 cd->nl[0] = (newline >> 8) & 255;
6481 cd->nl[1] = newline & 255;
6482 }
6483 else
6484 {
6485 cd->nllen = 1;
6486 cd->nl[0] = newline;
6487 }
6488 }
6489
6490 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
6491 references to help in deciding whether (.*) can be treated as anchored or not.
6492 */
6493
6494 cd->top_backref = 0;
6495 cd->backref_map = 0;
6496
6497 /* Reflect pattern for debugging output */
6498
6499 DPRINTF(("------------------------------------------------------------------\n"));
6500 DPRINTF(("%s\n", pattern));
6501
6502 /* Pretend to compile the pattern while actually just accumulating the length
6503 of memory required. This behaviour is triggered by passing a non-NULL final
6504 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
6505 to compile parts of the pattern into; the compiled code is discarded when it is
6506 no longer needed, so hopefully this workspace will never overflow, though there
6507 is a test for its doing so. */
6508
6509 cd->bracount = cd->final_bracount = 0;
6510 cd->names_found = 0;
6511 cd->name_entry_size = 0;
6512 cd->name_table = NULL;
6513 cd->start_workspace = cworkspace;
6514 cd->start_code = cworkspace;
6515 cd->hwm = cworkspace;
6516 cd->start_pattern = (const uschar *)pattern;
6517 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
6518 cd->req_varyopt = 0;
6519 cd->external_options = options;
6520 cd->external_flags = 0;
6521 cd->open_caps = NULL;
6522
6523 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
6524 don't need to look at the result of the function here. The initial options have
6525 been put into the cd block so that they can be changed if an option setting is
6526 found within the regex right at the beginning. Bringing initial option settings
6527 outside can help speed up starting point checks. */
6528
6529 ptr += skipatstart;
6530 code = cworkspace;
6531 *code = OP_BRA;
6532 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
6533 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
6534 &length);
6535 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
6536
6537 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
6538 cd->hwm - cworkspace));
6539
6540 if (length > MAX_PATTERN_SIZE)
6541 {
6542 errorcode = ERR20;
6543 goto PCRE_EARLY_ERROR_RETURN;
6544 }
6545
6546 /* Compute the size of data block needed and get it, either from malloc or
6547 externally provided function. Integer overflow should no longer be possible
6548 because nowadays we limit the maximum value of cd->names_found and
6549 cd->name_entry_size. */
6550
6551 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
6552 re = (real_pcre *)(pcre_malloc)(size);
6553
6554 if (re == NULL)
6555 {
6556 errorcode = ERR21;
6557 goto PCRE_EARLY_ERROR_RETURN;
6558 }
6559
6560 /* Put in the magic number, and save the sizes, initial options, internal
6561 flags, and character table pointer. NULL is used for the default character
6562 tables. The nullpad field is at the end; it's there to help in the case when a
6563 regex compiled on a system with 4-byte pointers is run on another with 8-byte
6564 pointers. */
6565
6566 re->magic_number = MAGIC_NUMBER;
6567 re->size = size;
6568 re->options = cd->external_options;
6569 re->flags = cd->external_flags;
6570 re->dummy1 = 0;
6571 re->first_byte = 0;
6572 re->req_byte = 0;
6573 re->name_table_offset = sizeof(real_pcre);
6574 re->name_entry_size = cd->name_entry_size;
6575 re->name_count = cd->names_found;
6576 re->ref_count = 0;
6577 re->tables = (tables == _pcre_default_tables)? NULL : tables;
6578 re->nullpad = NULL;
6579
6580 /* The starting points of the name/number translation table and of the code are
6581 passed around in the compile data block. The start/end pattern and initial
6582 options are already set from the pre-compile phase, as is the name_entry_size
6583 field. Reset the bracket count and the names_found field. Also reset the hwm
6584 field; this time it's used for remembering forward references to subpatterns.
6585 */
6586
6587 cd->final_bracount = cd->bracount; /* Save for checking forward references */
6588 cd->bracount = 0;
6589 cd->names_found = 0;
6590 cd->name_table = (uschar *)re + re->name_table_offset;
6591 codestart = cd->name_table + re->name_entry_size * re->name_count;
6592 cd->start_code = codestart;
6593 cd->hwm = cworkspace;
6594 cd->req_varyopt = 0;
6595 cd->had_accept = FALSE;
6596 cd->check_lookbehind = FALSE;
6597 cd->open_caps = NULL;
6598
6599 /* Set up a starting, non-extracting bracket, then compile the expression. On
6600 error, errorcode will be set non-zero, so we don't need to look at the result
6601 of the function here. */
6602
6603 ptr = (const uschar *)pattern + skipatstart;
6604 code = (uschar *)codestart;
6605 *code = OP_BRA;
6606 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6607 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6608 re->top_bracket = cd->bracount;
6609 re->top_backref = cd->top_backref;
6610 re->flags = cd->external_flags;
6611
6612 if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
6613
6614 /* If not reached end of pattern on success, there's an excess bracket. */
6615
6616 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6617
6618 /* Fill in the terminating state and check for disastrous overflow, but
6619 if debugging, leave the test till after things are printed out. */
6620
6621 *code++ = OP_END;