/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 340 - (show annotations)
Fri Apr 18 20:00:21 2008 UTC (7 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 201856 byte(s)
Error occurred while calculating annotation data.
Fix incorrect error for patterns like /(?2)[]a()b](abc)/
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2008 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57 used by pcretest. DEBUG is not defined when building a production library. */
58
59 #ifdef DEBUG
60 #include "pcre_printint.src"
61 #endif
62
63
/* Macro for setting individual bits in class bitmaps: a is the bitmap (an
array of bytes) and b is the bit number. Both arguments and the whole
expansion are parenthesized so that expression arguments such as
SETBIT(map, c + 1) address the intended byte and bit. Note that b is evaluated
twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67
68 /* Maximum length value to check against when making sure that the integer that
69 holds the compiled pattern length does not overflow. We make it a bit less than
70 INT_MAX to allow for adding in group terminating bytes, so that we don't have
71 to check them every time. */
72
73 #define OFLOW_MAX (INT_MAX - 20)
74
75
76 /*************************************************
77 * Code parameters and static tables *
78 *************************************************/
79
80 /* This value specifies the size of stack workspace that is used during the
81 first pre-compile phase that determines how much memory is required. The regex
82 is partly compiled into this space, but the compiled parts are discarded as
83 soon as they can be, so that hopefully there will never be an overrun. The code
84 does, however, check for an overrun. The largest amount I've seen used is 218,
85 so this number is very generous.
86
87 The same workspace is used during the second, actual compile phase for
88 remembering forward references to groups so that they can be filled in at the
89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90 is 4 there is plenty of room. */
91
92 #define COMPILE_WORK_SIZE (4096)
93
94
95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96 are simple data values; negative values are for special things like \d and so
97 on. Zero means further processing is needed (for things like \x), or the escape
98 is invalid. */
99
100 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 static const short int escapes[] = {
102 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 0, 0, -ESC_z /* x - z */
112 };
113
114 #else /* This is the "abnormal" table for EBCDIC systems */
115 static const short int escapes[] = {
116 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139 };
140 #endif
141
142
143 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144 searched linearly. Put all the names into a single string, in order to reduce
145 the number of relocations when a shared library is dynamically linked. */
146
147 typedef struct verbitem {
148 int len;
149 int op;
150 } verbitem;
151
152 static const char verbnames[] =
153 "ACCEPT\0"
154 "COMMIT\0"
155 "F\0"
156 "FAIL\0"
157 "PRUNE\0"
158 "SKIP\0"
159 "THEN";
160
161 static const verbitem verbs[] = {
162 { 6, OP_ACCEPT },
163 { 6, OP_COMMIT },
164 { 1, OP_FAIL },
165 { 4, OP_FAIL },
166 { 5, OP_PRUNE },
167 { 4, OP_SKIP },
168 { 4, OP_THEN }
169 };
170
171 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
172
173
174 /* Tables of names of POSIX character classes and their lengths. The names are
175 now all in a single string, to reduce the number of relocations when a shared
176 library is dynamically loaded. The list of lengths is terminated by a zero
177 length entry. The first three must be alpha, lower, upper, as this is assumed
178 for handling case independence. */
179
180 static const char posix_names[] =
181 "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0"
182 "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0"
183 "word\0" "xdigit";
184
185 static const uschar posix_name_lengths[] = {
186 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
187
188 /* Table of class bit maps for each POSIX class. Each class is formed from a
189 base map, with an optional addition or removal of another map. Then, for some
190 classes, there is some additional tweaking: for [:blank:] the vertical space
191 characters are removed, and for [:alpha:] and [:alnum:] the underscore
192 character is removed. The triples in the table consist of the base map offset,
193 second map offset or -1 if no second map, and a non-negative value for map
194 addition or a negative value for map subtraction (if there are two maps). The
195 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
196 remove vertical space characters, 2 => remove underscore. */
197
198 static const int posix_class_maps[] = {
199 cbit_word, cbit_digit, -2, /* alpha */
200 cbit_lower, -1, 0, /* lower */
201 cbit_upper, -1, 0, /* upper */
202 cbit_word, -1, 2, /* alnum - word without underscore */
203 cbit_print, cbit_cntrl, 0, /* ascii */
204 cbit_space, -1, 1, /* blank - a GNU extension */
205 cbit_cntrl, -1, 0, /* cntrl */
206 cbit_digit, -1, 0, /* digit */
207 cbit_graph, -1, 0, /* graph */
208 cbit_print, -1, 0, /* print */
209 cbit_punct, -1, 0, /* punct */
210 cbit_space, -1, 0, /* space */
211 cbit_word, -1, 0, /* word - a Perl extension */
212 cbit_xdigit,-1, 0 /* xdigit */
213 };
214
215
216 #define STRING(a) # a
217 #define XSTRING(s) STRING(s)
218
219 /* The texts of compile-time error messages. These are "char *" because they
220 are passed to the outside world. Do not ever re-use any error number, because
221 they are documented. Always add a new error instead. Messages marked DEAD below
222 are no longer used. This used to be a table of strings, but in order to reduce
223 the number of relocations needed when a shared library is loaded dynamically,
224 it is now one long string. We cannot use a table of offsets, because the
225 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
226 simply count through to the one we want - this isn't a performance issue
227 because these strings are used only when there is a compilation error. */
228
229 static const char error_texts[] =
230 "no error\0"
231 "\\ at end of pattern\0"
232 "\\c at end of pattern\0"
233 "unrecognized character follows \\\0"
234 "numbers out of order in {} quantifier\0"
235 /* 5 */
236 "number too big in {} quantifier\0"
237 "missing terminating ] for character class\0"
238 "invalid escape sequence in character class\0"
239 "range out of order in character class\0"
240 "nothing to repeat\0"
241 /* 10 */
242 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
243 "internal error: unexpected repeat\0"
244 "unrecognized character after (? or (?-\0"
245 "POSIX named classes are supported only within a class\0"
246 "missing )\0"
247 /* 15 */
248 "reference to non-existent subpattern\0"
249 "erroffset passed as NULL\0"
250 "unknown option bit(s) set\0"
251 "missing ) after comment\0"
252 "parentheses nested too deeply\0" /** DEAD **/
253 /* 20 */
254 "regular expression is too large\0"
255 "failed to get memory\0"
256 "unmatched parentheses\0"
257 "internal error: code overflow\0"
258 "unrecognized character after (?<\0"
259 /* 25 */
260 "lookbehind assertion is not fixed length\0"
261 "malformed number or name after (?(\0"
262 "conditional group contains more than two branches\0"
263 "assertion expected after (?(\0"
264 "(?R or (?[+-]digits must be followed by )\0"
265 /* 30 */
266 "unknown POSIX class name\0"
267 "POSIX collating elements are not supported\0"
268 "this version of PCRE is not compiled with PCRE_UTF8 support\0"
269 "spare error\0" /** DEAD **/
270 "character value in \\x{...} sequence is too large\0"
271 /* 35 */
272 "invalid condition (?(0)\0"
273 "\\C not allowed in lookbehind assertion\0"
274 "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
275 "number after (?C is > 255\0"
276 "closing ) for (?C expected\0"
277 /* 40 */
278 "recursive call could loop indefinitely\0"
279 "unrecognized character after (?P\0"
280 "syntax error in subpattern name (missing terminator)\0"
281 "two named subpatterns have the same name\0"
282 "invalid UTF-8 string\0"
283 /* 45 */
284 "support for \\P, \\p, and \\X has not been compiled\0"
285 "malformed \\P or \\p sequence\0"
286 "unknown property name after \\P or \\p\0"
287 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
288 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
289 /* 50 */
290 "repeated subpattern is too long\0" /** DEAD **/
291 "octal value is greater than \\377 (not in UTF-8 mode)\0"
292 "internal error: overran compiling workspace\0"
293 "internal error: previously-checked referenced subpattern not found\0"
294 "DEFINE group contains more than one branch\0"
295 /* 55 */
296 "repeating a DEFINE group is not allowed\0"
297 "inconsistent NEWLINE options\0"
298 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
299 "a numbered reference must not be zero\0"
300 "(*VERB) with an argument is not supported\0"
301 /* 60 */
302 "(*VERB) not recognized\0"
303 "number is too big\0"
304 "subpattern name expected\0"
305 "digit expected after (?+\0"
306 "] is an invalid data character in JavaScript compatibility mode";
307
308
309 /* Table to identify digits and hex digits. This is used when compiling
310 patterns. Note that the tables in chartables are dependent on the locale, and
311 may mark arbitrary characters as digits - but the PCRE compiling code expects
312 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
313 a private table here. It costs 256 bytes, but it is a lot faster than doing
314 character value tests (at least in some simple cases I timed), and in some
315 applications one wants PCRE to compile efficiently as well as match
316 efficiently.
317
318 For convenience, we use the same bit definitions as in chartables:
319
320 0x04 decimal digit
321 0x08 hexadecimal digit
322
323 Then we can use ctype_digit and ctype_xdigit in the code. */
324
325 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
326 static const unsigned char digitab[] =
327 {
328 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
329 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
330 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
331 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
332 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
333 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
334 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
335 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
336 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
337 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
338 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
339 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
340 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
341 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
342 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
343 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
344 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
345 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
346 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
347 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
348 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
349 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
350 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
351 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
352 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
353 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
354 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
355 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
356 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
357 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
358 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
359 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
360
361 #else /* This is the "abnormal" case, for EBCDIC systems */
362 static const unsigned char digitab[] =
363 {
364 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
365 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
366 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
367 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
368 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
369 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
370 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
371 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
372 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
373 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
374 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
375 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
376 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
377 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
378 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
379 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
380 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
381 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
382 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
383 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
384 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
385 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
386 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
387 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
388 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
389 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
390 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
391 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
392 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
393 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
394 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
395 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
396
397 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
398 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
399 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
400 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
401 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
402 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
403 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
404 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
405 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
406 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
407 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
408 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
409 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
410 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
411 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
412 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
413 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
414 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
415 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
416 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
417 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
418 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
419 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
420 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
421 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
422 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
423 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
424 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
425 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
426 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
427 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
428 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
429 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
430 #endif
431
432
433 /* Definition to allow mutual recursion */
434
435 static BOOL
436 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
437 int *, int *, branch_chain *, compile_data *, int *);
438
439
440
441 /*************************************************
442 * Find an error text *
443 *************************************************/
444
445 /* The error texts are now all in one long string, to save on relocations. As
446 some of the text is of unknown length, we can't use a table of offsets.
447 Instead, just count through the strings. This is not a performance issue
448 because it happens only when there has been a compilation error.
449
450 Argument: the error number
451 Returns: pointer to the error string
452 */
453
454 static const char *
455 find_error_text(int n)
456 {
457 const char *s = error_texts;
458 for (; n > 0; n--) while (*s++ != 0);
459 return s;
460 }
461
462
463 /*************************************************
464 * Handle escapes *
465 *************************************************/
466
467 /* This function is called when a \ has been encountered. It either returns a
468 positive value for a simple escape such as \n, or a negative value which
469 encodes one of the more complicated things such as \d. A backreference to group
470 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
471 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
472 ptr is pointing at the \. On exit, it is on the final character of the escape
473 sequence.
474
475 Arguments:
476 ptrptr points to the pattern position pointer
477 errorcodeptr points to the errorcode variable
478 bracount number of previous extracting brackets
479 options the options bits
480 isclass TRUE if inside a character class
481
482 Returns: zero or positive => a data character
483 negative => a special escape sequence
484 on error, errorcodeptr is set
485 */
486
487 static int
488 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
489 int options, BOOL isclass)
490 {
491 BOOL utf8 = (options & PCRE_UTF8) != 0;
492 const uschar *ptr = *ptrptr + 1;
493 int c, i;
494
495 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
496 ptr--; /* Set pointer back to the last byte */
497
498 /* If backslash is at the end of the pattern, it's an error. */
499
500 if (c == 0) *errorcodeptr = ERR1;
501
502 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
503 in a table. A non-zero result is something that can be returned immediately.
504 Otherwise further processing may be required. */
505
506 #ifndef EBCDIC /* ASCII coding */
507 else if (c < '0' || c > 'z') {} /* Not alphanumeric */
508 else if ((i = escapes[c - '0']) != 0) c = i;
509
510 #else /* EBCDIC coding */
511 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
512 else if ((i = escapes[c - 0x48]) != 0) c = i;
513 #endif
514
515 /* Escapes that need further processing, or are illegal. */
516
517 else
518 {
519 const uschar *oldptr;
520 BOOL braced, negated;
521
522 switch (c)
523 {
524 /* A number of Perl escapes are not handled by PCRE. We give an explicit
525 error. */
526
527 case 'l':
528 case 'L':
529 case 'N':
530 case 'u':
531 case 'U':
532 *errorcodeptr = ERR37;
533 break;
534
535 /* \g must be followed by one of a number of specific things:
536
537 (1) A number, either plain or braced. If positive, it is an absolute
538 backreference. If negative, it is a relative backreference. This is a Perl
539 5.10 feature.
540
541 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
542 is part of Perl's movement towards a unified syntax for back references. As
543 this is synonymous with \k{name}, we fudge it up by pretending it really
544 was \k.
545
546 (3) For Oniguruma compatibility we also support \g followed by a name or a
547 number either in angle brackets or in single quotes. However, these are
548 (possibly recursive) subroutine calls, _not_ backreferences. Just return
549 the -ESC_g code (cf \k). */
550
551 case 'g':
552 if (ptr[1] == '<' || ptr[1] == '\'')
553 {
554 c = -ESC_g;
555 break;
556 }
557
558 /* Handle the Perl-compatible cases */
559
560 if (ptr[1] == '{')
561 {
562 const uschar *p;
563 for (p = ptr+2; *p != 0 && *p != '}'; p++)
564 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
565 if (*p != 0 && *p != '}')
566 {
567 c = -ESC_k;
568 break;
569 }
570 braced = TRUE;
571 ptr++;
572 }
573 else braced = FALSE;
574
575 if (ptr[1] == '-')
576 {
577 negated = TRUE;
578 ptr++;
579 }
580 else negated = FALSE;
581
582 c = 0;
583 while ((digitab[ptr[1]] & ctype_digit) != 0)
584 c = c * 10 + *(++ptr) - '0';
585
586 if (c < 0) /* Integer overflow */
587 {
588 *errorcodeptr = ERR61;
589 break;
590 }
591
592 if (braced && *(++ptr) != '}')
593 {
594 *errorcodeptr = ERR57;
595 break;
596 }
597
598 if (c == 0)
599 {
600 *errorcodeptr = ERR58;
601 break;
602 }
603
604 if (negated)
605 {
606 if (c > bracount)
607 {
608 *errorcodeptr = ERR15;
609 break;
610 }
611 c = bracount - (c - 1);
612 }
613
614 c = -(ESC_REF + c);
615 break;
616
617 /* The handling of escape sequences consisting of a string of digits
618 starting with one that is not zero is not straightforward. By experiment,
619 the way Perl works seems to be as follows:
620
621 Outside a character class, the digits are read as a decimal number. If the
622 number is less than 10, or if there are that many previous extracting
623 left brackets, then it is a back reference. Otherwise, up to three octal
624 digits are read to form an escaped byte. Thus \123 is likely to be octal
625 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
626 value is greater than 377, the least significant 8 bits are taken. Inside a
627 character class, \ followed by a digit is always an octal number. */
628
629 case '1': case '2': case '3': case '4': case '5':
630 case '6': case '7': case '8': case '9':
631
632 if (!isclass)
633 {
634 oldptr = ptr;
635 c -= '0';
636 while ((digitab[ptr[1]] & ctype_digit) != 0)
637 c = c * 10 + *(++ptr) - '0';
638 if (c < 0) /* Integer overflow */
639 {
640 *errorcodeptr = ERR61;
641 break;
642 }
643 if (c < 10 || c <= bracount)
644 {
645 c = -(ESC_REF + c);
646 break;
647 }
648 ptr = oldptr; /* Put the pointer back and fall through */
649 }
650
651 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
652 generates a binary zero byte and treats the digit as a following literal.
653 Thus we have to pull back the pointer by one. */
654
655 if ((c = *ptr) >= '8')
656 {
657 ptr--;
658 c = 0;
659 break;
660 }
661
662 /* \0 always starts an octal number, but we may drop through to here with a
663 larger first octal digit. The original code used just to take the least
664 significant 8 bits of octal numbers (I think this is what early Perls used
665 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
666 than 3 octal digits. */
667
668 case '0':
669 c -= '0';
670 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
671 c = c * 8 + *(++ptr) - '0';
672 if (!utf8 && c > 255) *errorcodeptr = ERR51;
673 break;
674
675 /* \x is complicated. \x{ddd} is a character number which can be greater
676 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
677 treated as a data character. */
678
679 case 'x':
680 if (ptr[1] == '{')
681 {
682 const uschar *pt = ptr + 2;
683 int count = 0;
684
685 c = 0;
686 while ((digitab[*pt] & ctype_xdigit) != 0)
687 {
688 register int cc = *pt++;
689 if (c == 0 && cc == '0') continue; /* Leading zeroes */
690 count++;
691
692 #ifndef EBCDIC /* ASCII coding */
693 if (cc >= 'a') cc -= 32; /* Convert to upper case */
694 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
695 #else /* EBCDIC coding */
696 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
697 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
698 #endif
699 }
700
701 if (*pt == '}')
702 {
703 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
704 ptr = pt;
705 break;
706 }
707
708 /* If the sequence of hex digits does not end with '}', then we don't
709 recognize this construct; fall through to the normal \x handling. */
710 }
711
712 /* Read just a single-byte hex-defined char */
713
714 c = 0;
715 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
716 {
717 int cc; /* Some compilers don't like ++ */
718 cc = *(++ptr); /* in initializers */
719 #ifndef EBCDIC /* ASCII coding */
720 if (cc >= 'a') cc -= 32; /* Convert to upper case */
721 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
722 #else /* EBCDIC coding */
723 if (cc <= 'z') cc += 64; /* Convert to upper case */
724 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
725 #endif
726 }
727 break;
728
729 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
730 This coding is ASCII-specific, but then the whole concept of \cx is
731 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
732
733 case 'c':
734 c = *(++ptr);
735 if (c == 0)
736 {
737 *errorcodeptr = ERR2;
738 break;
739 }
740
741 #ifndef EBCDIC /* ASCII coding */
742 if (c >= 'a' && c <= 'z') c -= 32;
743 c ^= 0x40;
744 #else /* EBCDIC coding */
745 if (c >= 'a' && c <= 'z') c += 64;
746 c ^= 0xC0;
747 #endif
748 break;
749
750 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
751 other alphanumeric following \ is an error if PCRE_EXTRA was set;
752 otherwise, for Perl compatibility, it is a literal. This code looks a bit
753 odd, but there used to be some cases other than the default, and there may
754 be again in future, so I haven't "optimized" it. */
755
756 default:
757 if ((options & PCRE_EXTRA) != 0) switch(c)
758 {
759 default:
760 *errorcodeptr = ERR3;
761 break;
762 }
763 break;
764 }
765 }
766
767 *ptrptr = ptr;
768 return c;
769 }
770
771
772
773 #ifdef SUPPORT_UCP
774 /*************************************************
775 * Handle \P and \p *
776 *************************************************/
777
778 /* This function is called after \P or \p has been encountered, provided that
779 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
780 pointing at the P or p. On exit, it is pointing at the final character of the
781 escape sequence.
782
783 Argument:
784 ptrptr points to the pattern position pointer
785 negptr points to a boolean that is set TRUE for negation else FALSE
786 dptr points to an int that is set to the detailed property value
787 errorcodeptr points to the error code variable
788
789 Returns: type value from ucp_type_table, or -1 for an invalid type
790 */
791
792 static int
793 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
794 {
795 int c, i, bot, top;
796 const uschar *ptr = *ptrptr;
797 char name[32];
798
799 c = *(++ptr);
800 if (c == 0) goto ERROR_RETURN;
801
802 *negptr = FALSE;
803
804 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
805 negation. */
806
807 if (c == '{')
808 {
809 if (ptr[1] == '^')
810 {
811 *negptr = TRUE;
812 ptr++;
813 }
814 for (i = 0; i < (int)sizeof(name) - 1; i++)
815 {
816 c = *(++ptr);
817 if (c == 0) goto ERROR_RETURN;
818 if (c == '}') break;
819 name[i] = c;
820 }
821 if (c !='}') goto ERROR_RETURN;
822 name[i] = 0;
823 }
824
825 /* Otherwise there is just one following character */
826
827 else
828 {
829 name[0] = c;
830 name[1] = 0;
831 }
832
833 *ptrptr = ptr;
834
835 /* Search for a recognized property name using binary chop */
836
837 bot = 0;
838 top = _pcre_utt_size;
839
840 while (bot < top)
841 {
842 i = (bot + top) >> 1;
843 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
844 if (c == 0)
845 {
846 *dptr = _pcre_utt[i].value;
847 return _pcre_utt[i].type;
848 }
849 if (c > 0) bot = i + 1; else top = i;
850 }
851
852 *errorcodeptr = ERR47;
853 *ptrptr = ptr;
854 return -1;
855
856 ERROR_RETURN:
857 *errorcodeptr = ERR46;
858 *ptrptr = ptr;
859 return -1;
860 }
861 #endif
862
863
864
865
866 /*************************************************
867 * Check for counted repeat *
868 *************************************************/
869
870 /* This function is called when a '{' is encountered in a place where it might
871 start a quantifier. It looks ahead to see if it really is a quantifier or not.
872 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
873 where the ddds are digits.
874
875 Arguments:
876 p pointer to the first char after '{'
877
878 Returns: TRUE or FALSE
879 */
880
881 static BOOL
882 is_counted_repeat(const uschar *p)
883 {
884 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
885 while ((digitab[*p] & ctype_digit) != 0) p++;
886 if (*p == '}') return TRUE;
887
888 if (*p++ != ',') return FALSE;
889 if (*p == '}') return TRUE;
890
891 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
892 while ((digitab[*p] & ctype_digit) != 0) p++;
893
894 return (*p == '}');
895 }
896
897
898
899 /*************************************************
900 * Read repeat counts *
901 *************************************************/
902
903 /* Read an item of the form {n,m} and return the values. This is called only
904 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
905 so the syntax is guaranteed to be correct, but we need to check the values.
906
907 Arguments:
908 p pointer to first char after '{'
909 minp pointer to int for min
910 maxp pointer to int for max
911 returned as -1 if no max
912 errorcodeptr points to error code variable
913
914 Returns: pointer to '}' on success;
915 current ptr on error, with errorcodeptr set non-zero
916 */
917
918 static const uschar *
919 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
920 {
921 int min = 0;
922 int max = -1;
923
924 /* Read the minimum value and do a paranoid check: a negative value indicates
925 an integer overflow. */
926
927 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
928 if (min < 0 || min > 65535)
929 {
930 *errorcodeptr = ERR5;
931 return p;
932 }
933
934 /* Read the maximum value if there is one, and again do a paranoid on its size.
935 Also, max must not be less than min. */
936
937 if (*p == '}') max = min; else
938 {
939 if (*(++p) != '}')
940 {
941 max = 0;
942 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
943 if (max < 0 || max > 65535)
944 {
945 *errorcodeptr = ERR5;
946 return p;
947 }
948 if (max < min)
949 {
950 *errorcodeptr = ERR4;
951 return p;
952 }
953 }
954 }
955
956 /* Fill in the required variables, and pass back the pointer to the terminating
957 '}'. */
958
959 *minp = min;
960 *maxp = max;
961 return p;
962 }
963
964
965
966 /*************************************************
967 * Find forward referenced subpattern *
968 *************************************************/
969
970 /* This function scans along a pattern's text looking for capturing
971 subpatterns, and counting them. If it finds a named pattern that matches the
972 name it is given, it returns its number. Alternatively, if the name is NULL, it
973 returns when it reaches a given numbered subpattern. This is used for forward
974 references to subpatterns. We know that if (?P< is encountered, the name will
975 be terminated by '>' because that is checked in the first pass.
976
977 Arguments:
978 ptr current position in the pattern
979 count current count of capturing parens so far encountered
980 name name to seek, or NULL if seeking a numbered subpattern
981 lorn name length, or subpattern number if name is NULL
982 xmode TRUE if we are in /x mode
983
984 Returns: the number of the named subpattern, or -1 if not found
985 */
986
987 static int
988 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
989 BOOL xmode)
990 {
991 const uschar *thisname;
992
993 for (; *ptr != 0; ptr++)
994 {
995 int term;
996
997 /* Skip over backslashed characters and also entire \Q...\E */
998
999 if (*ptr == '\\')
1000 {
1001 if (*(++ptr) == 0) return -1;
1002 if (*ptr == 'Q') for (;;)
1003 {
1004 while (*(++ptr) != 0 && *ptr != '\\');
1005 if (*ptr == 0) return -1;
1006 if (*(++ptr) == 'E') break;
1007 }
1008 continue;
1009 }
1010
1011 /* Skip over character classes; this logic must be similar to the way they
1012 are handled for real. If the first character is '^', skip it. Also, if the
1013 first few characters (either before or after ^) are \Q\E or \E we skip them
1014 too. This makes for compatibility with Perl. */
1015
1016 if (*ptr == '[')
1017 {
1018 BOOL negate_class = FALSE;
1019 for (;;)
1020 {
1021 int c = *(++ptr);
1022 if (c == '\\')
1023 {
1024 if (ptr[1] == 'E') ptr++;
1025 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
1026 else break;
1027 }
1028 else if (!negate_class && c == '^')
1029 negate_class = TRUE;
1030 else break;
1031 }
1032
1033 /* If the next character is ']', it is a data character that must be
1034 skipped. */
1035
1036 if (ptr[1] == ']') ptr++;
1037
1038 while (*(++ptr) != ']')
1039 {
1040 if (*ptr == 0) return -1;
1041 if (*ptr == '\\')
1042 {
1043 if (*(++ptr) == 0) return -1;
1044 if (*ptr == 'Q') for (;;)
1045 {
1046 while (*(++ptr) != 0 && *ptr != '\\');
1047 if (*ptr == 0) return -1;
1048 if (*(++ptr) == 'E') break;
1049 }
1050 continue;
1051 }
1052 }
1053 continue;
1054 }
1055
1056 /* Skip comments in /x mode */
1057
1058 if (xmode && *ptr == '#')
1059 {
1060 while (*(++ptr) != 0 && *ptr != '\n');
1061 if (*ptr == 0) return -1;
1062 continue;
1063 }
1064
1065 /* An opening parens must now be a real metacharacter */
1066
1067 if (*ptr != '(') continue;
1068 if (ptr[1] != '?' && ptr[1] != '*')
1069 {
1070 count++;
1071 if (name == NULL && count == lorn) return count;
1072 continue;
1073 }
1074
1075 ptr += 2;
1076 if (*ptr == 'P') ptr++; /* Allow optional P */
1077
1078 /* We have to disambiguate (?<! and (?<= from (?<name> */
1079
1080 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1081 *ptr != '\'')
1082 continue;
1083
1084 count++;
1085
1086 if (name == NULL && count == lorn) return count;
1087 term = *ptr++;
1088 if (term == '<') term = '>';
1089 thisname = ptr;
1090 while (*ptr != term) ptr++;
1091 if (name != NULL && lorn == ptr - thisname &&
1092 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1093 return count;
1094 }
1095
1096 return -1;
1097 }
1098
1099
1100
1101 /*************************************************
1102 * Find first significant op code *
1103 *************************************************/
1104
1105 /* This is called by several functions that scan a compiled expression looking
1106 for a fixed first character, or an anchoring op code etc. It skips over things
1107 that do not influence this. For some calls, a change of option is important.
1108 For some calls, it makes sense to skip negative forward and all backward
1109 assertions, and also the \b assertion; for others it does not.
1110
1111 Arguments:
1112 code pointer to the start of the group
1113 options pointer to external options
1114 optbit the option bit whose changing is significant, or
1115 zero if none are
1116 skipassert TRUE if certain assertions are to be skipped
1117
1118 Returns: pointer to the first significant opcode
1119 */
1120
1121 static const uschar*
1122 first_significant_code(const uschar *code, int *options, int optbit,
1123 BOOL skipassert)
1124 {
1125 for (;;)
1126 {
1127 switch ((int)*code)
1128 {
1129 case OP_OPT:
1130 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1131 *options = (int)code[1];
1132 code += 2;
1133 break;
1134
1135 case OP_ASSERT_NOT:
1136 case OP_ASSERTBACK:
1137 case OP_ASSERTBACK_NOT:
1138 if (!skipassert) return code;
1139 do code += GET(code, 1); while (*code == OP_ALT);
1140 code += _pcre_OP_lengths[*code];
1141 break;
1142
1143 case OP_WORD_BOUNDARY:
1144 case OP_NOT_WORD_BOUNDARY:
1145 if (!skipassert) return code;
1146 /* Fall through */
1147
1148 case OP_CALLOUT:
1149 case OP_CREF:
1150 case OP_RREF:
1151 case OP_DEF:
1152 code += _pcre_OP_lengths[*code];
1153 break;
1154
1155 default:
1156 return code;
1157 }
1158 }
1159 /* Control never reaches here */
1160 }
1161
1162
1163
1164
1165 /*************************************************
1166 * Find the fixed length of a pattern *
1167 *************************************************/
1168
1169 /* Scan a pattern and compute the fixed length of subject that will match it,
1170 if the length is fixed. This is needed for dealing with backward assertions.
1171 In UTF8 mode, the result is in characters rather than bytes.
1172
1173 Arguments:
1174 code points to the start of the pattern (the bracket)
1175 options the compiling options
1176
1177 Returns: the fixed length, or -1 if there is no fixed length,
1178 or -2 if \C was encountered
1179 */
1180
1181 static int
1182 find_fixedlength(uschar *code, int options)
1183 {
1184 int length = -1;
1185
1186 register int branchlength = 0;
1187 register uschar *cc = code + 1 + LINK_SIZE;
1188
1189 /* Scan along the opcodes for this branch. If we get to the end of the
1190 branch, check the length against that of the other branches. */
1191
1192 for (;;)
1193 {
1194 int d;
1195 register int op = *cc;
1196 switch (op)
1197 {
1198 case OP_CBRA:
1199 case OP_BRA:
1200 case OP_ONCE:
1201 case OP_COND:
1202 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1203 if (d < 0) return d;
1204 branchlength += d;
1205 do cc += GET(cc, 1); while (*cc == OP_ALT);
1206 cc += 1 + LINK_SIZE;
1207 break;
1208
1209 /* Reached end of a branch; if it's a ket it is the end of a nested
1210 call. If it's ALT it is an alternation in a nested call. If it is
1211 END it's the end of the outer call. All can be handled by the same code. */
1212
1213 case OP_ALT:
1214 case OP_KET:
1215 case OP_KETRMAX:
1216 case OP_KETRMIN:
1217 case OP_END:
1218 if (length < 0) length = branchlength;
1219 else if (length != branchlength) return -1;
1220 if (*cc != OP_ALT) return length;
1221 cc += 1 + LINK_SIZE;
1222 branchlength = 0;
1223 break;
1224
1225 /* Skip over assertive subpatterns */
1226
1227 case OP_ASSERT:
1228 case OP_ASSERT_NOT:
1229 case OP_ASSERTBACK:
1230 case OP_ASSERTBACK_NOT:
1231 do cc += GET(cc, 1); while (*cc == OP_ALT);
1232 /* Fall through */
1233
1234 /* Skip over things that don't match chars */
1235
1236 case OP_REVERSE:
1237 case OP_CREF:
1238 case OP_RREF:
1239 case OP_DEF:
1240 case OP_OPT:
1241 case OP_CALLOUT:
1242 case OP_SOD:
1243 case OP_SOM:
1244 case OP_EOD:
1245 case OP_EODN:
1246 case OP_CIRC:
1247 case OP_DOLL:
1248 case OP_NOT_WORD_BOUNDARY:
1249 case OP_WORD_BOUNDARY:
1250 cc += _pcre_OP_lengths[*cc];
1251 break;
1252
1253 /* Handle literal characters */
1254
1255 case OP_CHAR:
1256 case OP_CHARNC:
1257 case OP_NOT:
1258 branchlength++;
1259 cc += 2;
1260 #ifdef SUPPORT_UTF8
1261 if ((options & PCRE_UTF8) != 0)
1262 {
1263 while ((*cc & 0xc0) == 0x80) cc++;
1264 }
1265 #endif
1266 break;
1267
1268 /* Handle exact repetitions. The count is already in characters, but we
1269 need to skip over a multibyte character in UTF8 mode. */
1270
1271 case OP_EXACT:
1272 branchlength += GET2(cc,1);
1273 cc += 4;
1274 #ifdef SUPPORT_UTF8
1275 if ((options & PCRE_UTF8) != 0)
1276 {
1277 while((*cc & 0x80) == 0x80) cc++;
1278 }
1279 #endif
1280 break;
1281
1282 case OP_TYPEEXACT:
1283 branchlength += GET2(cc,1);
1284 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1285 cc += 4;
1286 break;
1287
1288 /* Handle single-char matchers */
1289
1290 case OP_PROP:
1291 case OP_NOTPROP:
1292 cc += 2;
1293 /* Fall through */
1294
1295 case OP_NOT_DIGIT:
1296 case OP_DIGIT:
1297 case OP_NOT_WHITESPACE:
1298 case OP_WHITESPACE:
1299 case OP_NOT_WORDCHAR:
1300 case OP_WORDCHAR:
1301 case OP_ANY:
1302 branchlength++;
1303 cc++;
1304 break;
1305
1306 /* The single-byte matcher isn't allowed */
1307
1308 case OP_ANYBYTE:
1309 return -2;
1310
1311 /* Check a class for variable quantification */
1312
1313 #ifdef SUPPORT_UTF8
1314 case OP_XCLASS:
1315 cc += GET(cc, 1) - 33;
1316 /* Fall through */
1317 #endif
1318
1319 case OP_CLASS:
1320 case OP_NCLASS:
1321 cc += 33;
1322
1323 switch (*cc)
1324 {
1325 case OP_CRSTAR:
1326 case OP_CRMINSTAR:
1327 case OP_CRQUERY:
1328 case OP_CRMINQUERY:
1329 return -1;
1330
1331 case OP_CRRANGE:
1332 case OP_CRMINRANGE:
1333 if (GET2(cc,1) != GET2(cc,3)) return -1;
1334 branchlength += GET2(cc,1);
1335 cc += 5;
1336 break;
1337
1338 default:
1339 branchlength++;
1340 }
1341 break;
1342
1343 /* Anything else is variable length */
1344
1345 default:
1346 return -1;
1347 }
1348 }
1349 /* Control never gets here */
1350 }
1351
1352
1353
1354
1355 /*************************************************
1356 * Scan compiled regex for numbered bracket *
1357 *************************************************/
1358
1359 /* This little function scans through a compiled pattern until it finds a
1360 capturing bracket with the given number.
1361
1362 Arguments:
1363 code points to start of expression
1364 utf8 TRUE in UTF-8 mode
1365 number the required bracket number
1366
1367 Returns: pointer to the opcode for the bracket, or NULL if not found
1368 */
1369
1370 static const uschar *
1371 find_bracket(const uschar *code, BOOL utf8, int number)
1372 {
1373 for (;;)
1374 {
1375 register int c = *code;
1376 if (c == OP_END) return NULL;
1377
1378 /* XCLASS is used for classes that cannot be represented just by a bit
1379 map. This includes negated single high-valued characters. The length in
1380 the table is zero; the actual length is stored in the compiled code. */
1381
1382 if (c == OP_XCLASS) code += GET(code, 1);
1383
1384 /* Handle capturing bracket */
1385
1386 else if (c == OP_CBRA)
1387 {
1388 int n = GET2(code, 1+LINK_SIZE);
1389 if (n == number) return (uschar *)code;
1390 code += _pcre_OP_lengths[c];
1391 }
1392
1393 /* Otherwise, we can get the item's length from the table, except that for
1394 repeated character types, we have to test for \p and \P, which have an extra
1395 two bytes of parameters. */
1396
1397 else
1398 {
1399 switch(c)
1400 {
1401 case OP_TYPESTAR:
1402 case OP_TYPEMINSTAR:
1403 case OP_TYPEPLUS:
1404 case OP_TYPEMINPLUS:
1405 case OP_TYPEQUERY:
1406 case OP_TYPEMINQUERY:
1407 case OP_TYPEPOSSTAR:
1408 case OP_TYPEPOSPLUS:
1409 case OP_TYPEPOSQUERY:
1410 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1411 break;
1412
1413 case OP_TYPEUPTO:
1414 case OP_TYPEMINUPTO:
1415 case OP_TYPEEXACT:
1416 case OP_TYPEPOSUPTO:
1417 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1418 break;
1419 }
1420
1421 /* Add in the fixed length from the table */
1422
1423 code += _pcre_OP_lengths[c];
1424
1425 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1426 a multi-byte character. The length in the table is a minimum, so we have to
1427 arrange to skip the extra bytes. */
1428
1429 #ifdef SUPPORT_UTF8
1430 if (utf8) switch(c)
1431 {
1432 case OP_CHAR:
1433 case OP_CHARNC:
1434 case OP_EXACT:
1435 case OP_UPTO:
1436 case OP_MINUPTO:
1437 case OP_POSUPTO:
1438 case OP_STAR:
1439 case OP_MINSTAR:
1440 case OP_POSSTAR:
1441 case OP_PLUS:
1442 case OP_MINPLUS:
1443 case OP_POSPLUS:
1444 case OP_QUERY:
1445 case OP_MINQUERY:
1446 case OP_POSQUERY:
1447 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1448 break;
1449 }
1450 #endif
1451 }
1452 }
1453 }
1454
1455
1456
1457 /*************************************************
1458 * Scan compiled regex for recursion reference *
1459 *************************************************/
1460
1461 /* This little function scans through a compiled pattern until it finds an
1462 instance of OP_RECURSE.
1463
1464 Arguments:
1465 code points to start of expression
1466 utf8 TRUE in UTF-8 mode
1467
1468 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1469 */
1470
1471 static const uschar *
1472 find_recurse(const uschar *code, BOOL utf8)
1473 {
1474 for (;;)
1475 {
1476 register int c = *code;
1477 if (c == OP_END) return NULL;
1478 if (c == OP_RECURSE) return code;
1479
1480 /* XCLASS is used for classes that cannot be represented just by a bit
1481 map. This includes negated single high-valued characters. The length in
1482 the table is zero; the actual length is stored in the compiled code. */
1483
1484 if (c == OP_XCLASS) code += GET(code, 1);
1485
1486 /* Otherwise, we can get the item's length from the table, except that for
1487 repeated character types, we have to test for \p and \P, which have an extra
1488 two bytes of parameters. */
1489
1490 else
1491 {
1492 switch(c)
1493 {
1494 case OP_TYPESTAR:
1495 case OP_TYPEMINSTAR:
1496 case OP_TYPEPLUS:
1497 case OP_TYPEMINPLUS:
1498 case OP_TYPEQUERY:
1499 case OP_TYPEMINQUERY:
1500 case OP_TYPEPOSSTAR:
1501 case OP_TYPEPOSPLUS:
1502 case OP_TYPEPOSQUERY:
1503 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1504 break;
1505
1506 case OP_TYPEPOSUPTO:
1507 case OP_TYPEUPTO:
1508 case OP_TYPEMINUPTO:
1509 case OP_TYPEEXACT:
1510 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1511 break;
1512 }
1513
1514 /* Add in the fixed length from the table */
1515
1516 code += _pcre_OP_lengths[c];
1517
1518 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1519 by a multi-byte character. The length in the table is a minimum, so we have
1520 to arrange to skip the extra bytes. */
1521
1522 #ifdef SUPPORT_UTF8
1523 if (utf8) switch(c)
1524 {
1525 case OP_CHAR:
1526 case OP_CHARNC:
1527 case OP_EXACT:
1528 case OP_UPTO:
1529 case OP_MINUPTO:
1530 case OP_POSUPTO:
1531 case OP_STAR:
1532 case OP_MINSTAR:
1533 case OP_POSSTAR:
1534 case OP_PLUS:
1535 case OP_MINPLUS:
1536 case OP_POSPLUS:
1537 case OP_QUERY:
1538 case OP_MINQUERY:
1539 case OP_POSQUERY:
1540 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1541 break;
1542 }
1543 #endif
1544 }
1545 }
1546 }
1547
1548
1549
1550 /*************************************************
1551 * Scan compiled branch for non-emptiness *
1552 *************************************************/
1553
1554 /* This function scans through a branch of a compiled pattern to see whether it
1555 can match the empty string or not. It is called from could_be_empty()
1556 below and from compile_branch() when checking for an unlimited repeat of a
1557 group that can match nothing. Note that first_significant_code() skips over
1558 backward and negative forward assertions when its final argument is TRUE. If we
1559 hit an unclosed bracket, we return "empty" - this means we've struck an inner
1560 bracket whose current branch will already have been scanned.
1561
1562 Arguments:
1563 code points to start of search
1564 endcode points to where to stop
1565 utf8 TRUE if in UTF8 mode
1566
1567 Returns: TRUE if what is matched could be empty
1568 */
1569
1570 static BOOL
1571 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1572 {
1573 register int c;
1574 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1575 code < endcode;
1576 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1577 {
1578 const uschar *ccode;
1579
1580 c = *code;
1581
1582 /* Skip over forward assertions; the other assertions are skipped by
1583 first_significant_code() with a TRUE final argument. */
1584
1585 if (c == OP_ASSERT)
1586 {
1587 do code += GET(code, 1); while (*code == OP_ALT);
1588 c = *code;
1589 continue;
1590 }
1591
1592 /* Groups with zero repeats can of course be empty; skip them. */
1593
1594 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1595 {
1596 code += _pcre_OP_lengths[c];
1597 do code += GET(code, 1); while (*code == OP_ALT);
1598 c = *code;
1599 continue;
1600 }
1601
1602 /* For other groups, scan the branches. */
1603
1604 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1605 {
1606 BOOL empty_branch;
1607 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1608
1609 /* Scan a closed bracket */
1610
1611 empty_branch = FALSE;
1612 do
1613 {
1614 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1615 empty_branch = TRUE;
1616 code += GET(code, 1);
1617 }
1618 while (*code == OP_ALT);
1619 if (!empty_branch) return FALSE; /* All branches are non-empty */
1620 c = *code;
1621 continue;
1622 }
1623
1624 /* Handle the other opcodes */
1625
1626 switch (c)
1627 {
1628 /* Check for quantifiers after a class. XCLASS is used for classes that
1629 cannot be represented just by a bit map. This includes negated single
1630 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1631 actual length is stored in the compiled code, so we must update "code"
1632 here. */
1633
1634 #ifdef SUPPORT_UTF8
1635 case OP_XCLASS:
1636 ccode = code += GET(code, 1);
1637 goto CHECK_CLASS_REPEAT;
1638 #endif
1639
1640 case OP_CLASS:
1641 case OP_NCLASS:
1642 ccode = code + 33;
1643
1644 #ifdef SUPPORT_UTF8
1645 CHECK_CLASS_REPEAT:
1646 #endif
1647
1648 switch (*ccode)
1649 {
1650 case OP_CRSTAR: /* These could be empty; continue */
1651 case OP_CRMINSTAR:
1652 case OP_CRQUERY:
1653 case OP_CRMINQUERY:
1654 break;
1655
1656 default: /* Non-repeat => class must match */
1657 case OP_CRPLUS: /* These repeats aren't empty */
1658 case OP_CRMINPLUS:
1659 return FALSE;
1660
1661 case OP_CRRANGE:
1662 case OP_CRMINRANGE:
1663 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1664 break;
1665 }
1666 break;
1667
1668 /* Opcodes that must match a character */
1669
1670 case OP_PROP:
1671 case OP_NOTPROP:
1672 case OP_EXTUNI:
1673 case OP_NOT_DIGIT:
1674 case OP_DIGIT:
1675 case OP_NOT_WHITESPACE:
1676 case OP_WHITESPACE:
1677 case OP_NOT_WORDCHAR:
1678 case OP_WORDCHAR:
1679 case OP_ANY:
1680 case OP_ANYBYTE:
1681 case OP_CHAR:
1682 case OP_CHARNC:
1683 case OP_NOT:
1684 case OP_PLUS:
1685 case OP_MINPLUS:
1686 case OP_POSPLUS:
1687 case OP_EXACT:
1688 case OP_NOTPLUS:
1689 case OP_NOTMINPLUS:
1690 case OP_NOTPOSPLUS:
1691 case OP_NOTEXACT:
1692 case OP_TYPEPLUS:
1693 case OP_TYPEMINPLUS:
1694 case OP_TYPEPOSPLUS:
1695 case OP_TYPEEXACT:
1696 return FALSE;
1697
1698 /* These are going to continue, as they may be empty, but we have to
1699 fudge the length for the \p and \P cases. */
1700
1701 case OP_TYPESTAR:
1702 case OP_TYPEMINSTAR:
1703 case OP_TYPEPOSSTAR:
1704 case OP_TYPEQUERY:
1705 case OP_TYPEMINQUERY:
1706 case OP_TYPEPOSQUERY:
1707 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1708 break;
1709
1710 /* Same for these */
1711
1712 case OP_TYPEUPTO:
1713 case OP_TYPEMINUPTO:
1714 case OP_TYPEPOSUPTO:
1715 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1716 break;
1717
1718 /* End of branch */
1719
1720 case OP_KET:
1721 case OP_KETRMAX:
1722 case OP_KETRMIN:
1723 case OP_ALT:
1724 return TRUE;
1725
1726 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1727 MINUPTO, and POSUPTO may be followed by a multibyte character */
1728
1729 #ifdef SUPPORT_UTF8
1730 case OP_STAR:
1731 case OP_MINSTAR:
1732 case OP_POSSTAR:
1733 case OP_QUERY:
1734 case OP_MINQUERY:
1735 case OP_POSQUERY:
1736 case OP_UPTO:
1737 case OP_MINUPTO:
1738 case OP_POSUPTO:
1739 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1740 break;
1741 #endif
1742 }
1743 }
1744
1745 return TRUE;
1746 }
1747
1748
1749
1750 /*************************************************
1751 * Scan compiled regex for non-emptiness *
1752 *************************************************/
1753
1754 /* This function is called to check for left recursive calls. We want to check
1755 the current branch of the current pattern to see if it could match the empty
1756 string. If it could, we must look outwards for branches at other levels,
1757 stopping when we pass beyond the bracket which is the subject of the recursion.
1758
1759 Arguments:
1760 code points to start of the recursion
1761 endcode points to where to stop (current RECURSE item)
1762 bcptr points to the chain of current (unclosed) branch starts
1763 utf8 TRUE if in UTF-8 mode
1764
1765 Returns: TRUE if what is matched could be empty
1766 */
1767
1768 static BOOL
1769 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1770 BOOL utf8)
1771 {
1772 while (bcptr != NULL && bcptr->current >= code)
1773 {
1774 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1775 bcptr = bcptr->outer;
1776 }
1777 return TRUE;
1778 }
1779
1780
1781
1782 /*************************************************
1783 * Check for POSIX class syntax *
1784 *************************************************/
1785
1786 /* This function is called when the sequence "[:" or "[." or "[=" is
1787 encountered in a character class. It checks whether this is followed by a
1788 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1789 reach an unescaped ']' without the special preceding character, return FALSE.
1790
1791 Originally, this function only recognized a sequence of letters between the
1792 terminators, but it seems that Perl recognizes any sequence of characters,
1793 though of course unknown POSIX names are subsequently rejected. Perl gives an
1794 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1795 didn't consider this to be a POSIX class. Likewise for [:1234:].
1796
1797 The problem in trying to be exactly like Perl is in the handling of escapes. We
1798 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1799 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1800 below handles the special case of \], but does not try to do any other escape
1801 processing. This makes it different from Perl for cases such as [:l\ower:]
1802 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1803 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
1804 I think.
1805
1806 Arguments:
1807 ptr pointer to the initial [
1808 endptr where to return the end pointer
1809
1810 Returns: TRUE or FALSE
1811 */
1812
1813 static BOOL
1814 check_posix_syntax(const uschar *ptr, const uschar **endptr)
1815 {
1816 int terminator; /* Don't combine these lines; the Solaris cc */
1817 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1818 for (++ptr; *ptr != 0; ptr++)
1819 {
1820 if (*ptr == '\\' && ptr[1] == ']') ptr++; else
1821 {
1822 if (*ptr == ']') return FALSE;
1823 if (*ptr == terminator && ptr[1] == ']')
1824 {
1825 *endptr = ptr;
1826 return TRUE;
1827 }
1828 }
1829 }
1830 return FALSE;
1831 }
1832
1833
1834
1835
1836 /*************************************************
1837 * Check POSIX class name *
1838 *************************************************/
1839
1840 /* This function is called to check the name given in a POSIX-style class entry
1841 such as [:alnum:].
1842
1843 Arguments:
1844 ptr points to the first letter
1845 len the length of the name
1846
1847 Returns: a value representing the name, or -1 if unknown
1848 */
1849
1850 static int
1851 check_posix_name(const uschar *ptr, int len)
1852 {
1853 const char *pn = posix_names;
1854 register int yield = 0;
1855 while (posix_name_lengths[yield] != 0)
1856 {
1857 if (len == posix_name_lengths[yield] &&
1858 strncmp((const char *)ptr, pn, len) == 0) return yield;
1859 pn += posix_name_lengths[yield] + 1;
1860 yield++;
1861 }
1862 return -1;
1863 }
1864
1865
1866 /*************************************************
1867 * Adjust OP_RECURSE items in repeated group *
1868 *************************************************/
1869
1870 /* OP_RECURSE items contain an offset from the start of the regex to the group
1871 that is referenced. This means that groups can be replicated for fixed
1872 repetition simply by copying (because the recursion is allowed to refer to
1873 earlier groups that are outside the current group). However, when a group is
1874 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
1875 inserted before it, after it has been compiled. This means that any OP_RECURSE
1876 items within it that refer to the group itself or any contained groups have to
1877 have their offsets adjusted. That one of the jobs of this function. Before it
1878 is called, the partially compiled regex must be temporarily terminated with
1879 OP_END.
1880
1881 This function has been extended with the possibility of forward references for
1882 recursions and subroutine calls. It must also check the list of such references
1883 for the group we are dealing with. If it finds that one of the recursions in
1884 the current group is on this list, it adjusts the offset in the list, not the
1885 value in the reference (which is a group number).
1886
1887 Arguments:
1888 group points to the start of the group
1889 adjust the amount by which the group is to be moved
1890 utf8 TRUE in UTF-8 mode
1891 cd contains pointers to tables etc.
1892 save_hwm the hwm forward reference pointer at the start of the group
1893
1894 Returns: nothing
1895 */
1896
1897 static void
1898 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1899 uschar *save_hwm)
1900 {
1901 uschar *ptr = group;
1902
1903 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1904 {
1905 int offset;
1906 uschar *hc;
1907
1908 /* See if this recursion is on the forward reference list. If so, adjust the
1909 reference. */
1910
1911 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1912 {
1913 offset = GET(hc, 0);
1914 if (cd->start_code + offset == ptr + 1)
1915 {
1916 PUT(hc, 0, offset + adjust);
1917 break;
1918 }
1919 }
1920
1921 /* Otherwise, adjust the recursion offset if it's after the start of this
1922 group. */
1923
1924 if (hc >= cd->hwm)
1925 {
1926 offset = GET(ptr, 1);
1927 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1928 }
1929
1930 ptr += 1 + LINK_SIZE;
1931 }
1932 }
1933
1934
1935
1936 /*************************************************
1937 * Insert an automatic callout point *
1938 *************************************************/
1939
1940 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1941 callout points before each pattern item.
1942
1943 Arguments:
1944 code current code pointer
1945 ptr current pattern pointer
1946 cd pointers to tables etc
1947
1948 Returns: new code pointer
1949 */
1950
1951 static uschar *
1952 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1953 {
1954 *code++ = OP_CALLOUT;
1955 *code++ = 255;
1956 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1957 PUT(code, LINK_SIZE, 0); /* Default length */
1958 return code + 2*LINK_SIZE;
1959 }
1960
1961
1962
1963 /*************************************************
1964 * Complete a callout item *
1965 *************************************************/
1966
1967 /* A callout item contains the length of the next item in the pattern, which
1968 we can't fill in till after we have reached the relevant point. This is used
1969 for both automatic and manual callouts.
1970
1971 Arguments:
1972 previous_callout points to previous callout item
1973 ptr current pattern pointer
1974 cd pointers to tables etc
1975
1976 Returns: nothing
1977 */
1978
1979 static void
1980 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1981 {
1982 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1983 PUT(previous_callout, 2 + LINK_SIZE, length);
1984 }
1985
1986
1987
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the start and end of a class range (UTF-8 mode with UCP support), find
the next run of characters inside it whose "other case" values are consecutive.
Each call reports one such run and advances the start value, so repeated calls
walk through the whole range.

Arguments:
  cptr        points to the starting character value; on a TRUE return it is
                updated to the first character not covered by the run
  d           end value of the range (inclusive)
  ocptr       where to put the start of the othercase run
  odptr       where to put the end of the othercase run

Yield:        TRUE when a run has been returned; FALSE when no character from
                *cptr up to d has an other case (nothing is written)
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int ch = *cptr;
unsigned int first_oc = NOTACHAR;
unsigned int expected;

/* Find the first character in the range that has an other case. */

while (ch <= d)
  {
  first_oc = _pcre_ucp_othercase(ch);
  if (first_oc != NOTACHAR) break;
  ch++;
  }

if (ch > d) return FALSE;

/* Extend the run for as long as the other case values stay consecutive. */

*ocptr = first_oc;
expected = first_oc + 1;
ch++;

while (ch <= d && _pcre_ucp_othercase(ch) == expected)
  {
  ch++;
  expected++;
  }

*odptr = expected - 1;
*cptr = ch;

return TRUE;
}
#endif  /* SUPPORT_UCP */
2033
2034
2035
2036 /*************************************************
2037 * Check if auto-possessifying is possible *
2038 *************************************************/
2039
2040 /* This function is called for unlimited repeats of certain items, to see
2041 whether the next thing could possibly match the repeated item. If not, it makes
2042 sense to automatically possessify the repeated item.
2043
2044 Arguments:
2045 op_code the repeated op code
2046 this data for this item, depends on the opcode
2047 utf8 TRUE in UTF-8 mode
2048 utf8_char used for utf8 character bytes, NULL if not relevant
2049 ptr next character in pattern
2050 options options bits
2051 cd contains pointers to tables etc.
2052
2053 Returns: TRUE if possessifying is wanted
2054 */
2055
2056 static BOOL
2057 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2058 const uschar *ptr, int options, compile_data *cd)
2059 {
2060 int next;
2061
2062 /* Skip whitespace and comments in extended mode */
2063
2064 if ((options & PCRE_EXTENDED) != 0)
2065 {
2066 for (;;)
2067 {
2068 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2069 if (*ptr == '#')
2070 {
2071 while (*(++ptr) != 0)
2072 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2073 }
2074 else break;
2075 }
2076 }
2077
2078 /* If the next item is one that we can handle, get its value. A non-negative
2079 value is a character, a negative value is an escape value. */
2080
2081 if (*ptr == '\\')
2082 {
2083 int temperrorcode = 0;
2084 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2085 if (temperrorcode != 0) return FALSE;
2086 ptr++; /* Point after the escape sequence */
2087 }
2088
2089 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2090 {
2091 #ifdef SUPPORT_UTF8
2092 if (utf8) { GETCHARINC(next, ptr); } else
2093 #endif
2094 next = *ptr++;
2095 }
2096
2097 else return FALSE;
2098
2099 /* Skip whitespace and comments in extended mode */
2100
2101 if ((options & PCRE_EXTENDED) != 0)
2102 {
2103 for (;;)
2104 {
2105 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2106 if (*ptr == '#')
2107 {
2108 while (*(++ptr) != 0)
2109 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2110 }
2111 else break;
2112 }
2113 }
2114
2115 /* If the next thing is itself optional, we have to give up. */
2116
2117 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2118 return FALSE;
2119
2120 /* Now compare the next item with the previous opcode. If the previous is a
2121 positive single character match, "item" either contains the character or, if
2122 "item" is greater than 127 in utf8 mode, the character's bytes are in
2123 utf8_char. */
2124
2125
2126 /* Handle cases when the next item is a character. */
2127
2128 if (next >= 0) switch(op_code)
2129 {
2130 case OP_CHAR:
2131 #ifdef SUPPORT_UTF8
2132 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2133 #endif
2134 return item != next;
2135
2136 /* For CHARNC (caseless character) we must check the other case. If we have
2137 Unicode property support, we can use it to test the other case of
2138 high-valued characters. */
2139
2140 case OP_CHARNC:
2141 #ifdef SUPPORT_UTF8
2142 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2143 #endif
2144 if (item == next) return FALSE;
2145 #ifdef SUPPORT_UTF8
2146 if (utf8)
2147 {
2148 unsigned int othercase;
2149 if (next < 128) othercase = cd->fcc[next]; else
2150 #ifdef SUPPORT_UCP
2151 othercase = _pcre_ucp_othercase((unsigned int)next);
2152 #else
2153 othercase = NOTACHAR;
2154 #endif
2155 return (unsigned int)item != othercase;
2156 }
2157 else
2158 #endif /* SUPPORT_UTF8 */
2159 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2160
2161 /* For OP_NOT, "item" must be a single-byte character. */
2162
2163 case OP_NOT:
2164 if (item == next) return TRUE;
2165 if ((options & PCRE_CASELESS) == 0) return FALSE;
2166 #ifdef SUPPORT_UTF8
2167 if (utf8)
2168 {
2169 unsigned int othercase;
2170 if (next < 128) othercase = cd->fcc[next]; else
2171 #ifdef SUPPORT_UCP
2172 othercase = _pcre_ucp_othercase(next);
2173 #else
2174 othercase = NOTACHAR;
2175 #endif
2176 return (unsigned int)item == othercase;
2177 }
2178 else
2179 #endif /* SUPPORT_UTF8 */
2180 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2181
2182 case OP_DIGIT:
2183 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2184
2185 case OP_NOT_DIGIT:
2186 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2187
2188 case OP_WHITESPACE:
2189 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2190
2191 case OP_NOT_WHITESPACE:
2192 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2193
2194 case OP_WORDCHAR:
2195 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2196
2197 case OP_NOT_WORDCHAR:
2198 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2199
2200 case OP_HSPACE:
2201 case OP_NOT_HSPACE:
2202 switch(next)
2203 {
2204 case 0x09:
2205 case 0x20:
2206 case 0xa0:
2207 case 0x1680:
2208 case 0x180e:
2209 case 0x2000:
2210 case 0x2001:
2211 case 0x2002:
2212 case 0x2003:
2213 case 0x2004:
2214 case 0x2005:
2215 case 0x2006:
2216 case 0x2007:
2217 case 0x2008:
2218 case 0x2009:
2219 case 0x200A:
2220 case 0x202f:
2221 case 0x205f:
2222 case 0x3000:
2223 return op_code != OP_HSPACE;
2224 default:
2225 return op_code == OP_HSPACE;
2226 }
2227
2228 case OP_VSPACE:
2229 case OP_NOT_VSPACE:
2230 switch(next)
2231 {
2232 case 0x0a:
2233 case 0x0b:
2234 case 0x0c:
2235 case 0x0d:
2236 case 0x85:
2237 case 0x2028:
2238 case 0x2029:
2239 return op_code != OP_VSPACE;
2240 default:
2241 return op_code == OP_VSPACE;
2242 }
2243
2244 default:
2245 return FALSE;
2246 }
2247
2248
2249 /* Handle the case when the next item is \d, \s, etc. */
2250
2251 switch(op_code)
2252 {
2253 case OP_CHAR:
2254 case OP_CHARNC:
2255 #ifdef SUPPORT_UTF8
2256 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2257 #endif
2258 switch(-next)
2259 {
2260 case ESC_d:
2261 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2262
2263 case ESC_D:
2264 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2265
2266 case ESC_s:
2267 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2268
2269 case ESC_S:
2270 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2271
2272 case ESC_w:
2273 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2274
2275 case ESC_W:
2276 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2277
2278 case ESC_h:
2279 case ESC_H:
2280 switch(item)
2281 {
2282 case 0x09:
2283 case 0x20:
2284 case 0xa0:
2285 case 0x1680:
2286 case 0x180e:
2287 case 0x2000:
2288 case 0x2001:
2289 case 0x2002:
2290 case 0x2003:
2291 case 0x2004:
2292 case 0x2005:
2293 case 0x2006:
2294 case 0x2007:
2295 case 0x2008:
2296 case 0x2009:
2297 case 0x200A:
2298 case 0x202f:
2299 case 0x205f:
2300 case 0x3000:
2301 return -next != ESC_h;
2302 default:
2303 return -next == ESC_h;
2304 }
2305
2306 case ESC_v:
2307 case ESC_V:
2308 switch(item)
2309 {
2310 case 0x0a:
2311 case 0x0b:
2312 case 0x0c:
2313 case 0x0d:
2314 case 0x85:
2315 case 0x2028:
2316 case 0x2029:
2317 return -next != ESC_v;
2318 default:
2319 return -next == ESC_v;
2320 }
2321
2322 default:
2323 return FALSE;
2324 }
2325
2326 case OP_DIGIT:
2327 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2328 next == -ESC_h || next == -ESC_v;
2329
2330 case OP_NOT_DIGIT:
2331 return next == -ESC_d;
2332
2333 case OP_WHITESPACE:
2334 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2335
2336 case OP_NOT_WHITESPACE:
2337 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2338
2339 case OP_HSPACE:
2340 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2341
2342 case OP_NOT_HSPACE:
2343 return next == -ESC_h;
2344
2345 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2346 case OP_VSPACE:
2347 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2348
2349 case OP_NOT_VSPACE:
2350 return next == -ESC_v;
2351
2352 case OP_WORDCHAR:
2353 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2354
2355 case OP_NOT_WORDCHAR:
2356 return next == -ESC_w || next == -ESC_d;
2357
2358 default:
2359 return FALSE;
2360 }
2361
2362 /* Control does not reach here */
2363 }
2364
2365
2366
2367 /*************************************************
2368 * Compile one branch *
2369 *************************************************/
2370
1971 /* Scan the pattern, compiling it into a vector. If the options are
2372 changed during the branch, the pointer is used to change the external options
2373 bits. This function is used during the pre-compile phase when we are trying
2374 to find out the amount of memory needed, as well as during the real compile
2375 phase. The value of lengthptr distinguishes the two phases.
2376
2377 Arguments:
2378 optionsptr pointer to the option bits
2379 codeptr points to the pointer to the current code point
2380 ptrptr points to the current pattern pointer
2381 errorcodeptr points to error code variable
2382 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2383 reqbyteptr set to the last literal character required, else < 0
2384 bcptr points to current branch chain
2385 cd contains pointers to tables etc.
2386 lengthptr NULL during the real compile phase
2387 points to length accumulator during pre-compile phase
2388
2389 Returns: TRUE on success
2390 FALSE, with *errorcodeptr set non-zero on error
2391 */
2392
2393 static BOOL
2394 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2395 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2396 compile_data *cd, int *lengthptr)
2397 {
2398 int repeat_type, op_type;
2399 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2400 int bravalue = 0;
2401 int greedy_default, greedy_non_default;
2402 int firstbyte, reqbyte;
2403 int zeroreqbyte, zerofirstbyte;
2404 int req_caseopt, reqvary, tempreqvary;
2405 int options = *optionsptr;
2406 int after_manual_callout = 0;
2407 int length_prevgroup = 0;
2408 register int c;
2409 register uschar *code = *codeptr;
2410 uschar *last_code = code;
2411 uschar *orig_code = code;
2412 uschar *tempcode;
2413 BOOL inescq = FALSE;
2414 BOOL groupsetfirstbyte = FALSE;
2415 const uschar *ptr = *ptrptr;
2416 const uschar *tempptr;
2417 uschar *previous = NULL;
2418 uschar *previous_callout = NULL;
2419 uschar *save_hwm = NULL;
2420 uschar classbits[32];
2421
2422 #ifdef SUPPORT_UTF8
2423 BOOL class_utf8;
2424 BOOL utf8 = (options & PCRE_UTF8) != 0;
2425 uschar *class_utf8data;
2426 uschar *class_utf8data_base;
2427 uschar utf8_char[6];
2428 #else
2429 BOOL utf8 = FALSE;
2430 uschar *utf8_char = NULL;
2431 #endif
2432
2433 #ifdef DEBUG
2434 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2435 #endif
2436
2437 /* Set up the default and non-default settings for greediness */
2438
2439 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2440 greedy_non_default = greedy_default ^ 1;
2441
2442 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2443 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2444 matches a non-fixed char first char; reqbyte just remains unset if we never
2445 find one.
2446
2447 When we hit a repeat whose minimum is zero, we may have to adjust these values
2448 to take the zero repeat into account. This is implemented by setting them to
2449 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2450 item types that can be repeated set these backoff variables appropriately. */
2451
2452 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2453
2454 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2455 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2456 value > 255. It is added into the firstbyte or reqbyte variables to record the
2457 case status of the value. This is used only for ASCII characters. */
2458
2459 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2460
2461 /* Switch on next character until the end of the branch */
2462
2463 for (;; ptr++)
2464 {
2465 BOOL negate_class;
2466 BOOL should_flip_negation;
2467 BOOL possessive_quantifier;
2468 BOOL is_quantifier;
2469 BOOL is_recurse;
2470 BOOL reset_bracount;
2471 int class_charcount;
2472 int class_lastchar;
2473 int newoptions;
2474 int recno;
2475 int refsign;
2476 int skipbytes;
2477 int subreqbyte;
2478 int subfirstbyte;
2479 int terminator;
2480 int mclength;
2481 uschar mcbuffer[8];
2482
2483 /* Get next byte in the pattern */
2484
2485 c = *ptr;
2486
2487 /* If we are in the pre-compile phase, accumulate the length used for the
2488 previous cycle of this loop. */
2489
2490 if (lengthptr != NULL)
2491 {
2492 #ifdef DEBUG
2493 if (code > cd->hwm) cd->hwm = code; /* High water info */
2494 #endif
2495 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2496 {
2497 *errorcodeptr = ERR52;
2498 goto FAILED;
2499 }
2500
2501 /* There is at least one situation where code goes backwards: this is the
2502 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2503 the class is simply eliminated. However, it is created first, so we have to
2504 allow memory for it. Therefore, don't ever reduce the length at this point.
2505 */
2506
2507 if (code < last_code) code = last_code;
2508
2509 /* Paranoid check for integer overflow */
2510
2511 if (OFLOW_MAX - *lengthptr < code - last_code)
2512 {
2513 *errorcodeptr = ERR20;
2514 goto FAILED;
2515 }
2516
2517 *lengthptr += code - last_code;
2518 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2519
2520 /* If "previous" is set and it is not at the start of the work space, move
2521 it back to there, in order to avoid filling up the work space. Otherwise,
2522 if "previous" is NULL, reset the current code pointer to the start. */
2523
2524 if (previous != NULL)
2525 {
2526 if (previous > orig_code)
2527 {
2528 memmove(orig_code, previous, code - previous);
2529 code -= previous - orig_code;
2530 previous = orig_code;
2531 }
2532 }
2533 else code = orig_code;
2534
2535 /* Remember where this code item starts so we can pick up the length
2536 next time round. */
2537
2538 last_code = code;
2539 }
2540
2541 /* In the real compile phase, just check the workspace used by the forward
2542 reference list. */
2543
2544 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2545 {
2546 *errorcodeptr = ERR52;
2547 goto FAILED;
2548 }
2549
2550 /* If in \Q...\E, check for the end; if not, we have a literal */
2551
2552 if (inescq && c != 0)
2553 {
2554 if (c == '\\' && ptr[1] == 'E')
2555 {
2556 inescq = FALSE;
2557 ptr++;
2558 continue;
2559 }
2560 else
2561 {
2562 if (previous_callout != NULL)
2563 {
2564 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2565 complete_callout(previous_callout, ptr, cd);
2566 previous_callout = NULL;
2567 }
2568 if ((options & PCRE_AUTO_CALLOUT) != 0)
2569 {
2570 previous_callout = code;
2571 code = auto_callout(code, ptr, cd);
2572 }
2573 goto NORMAL_CHAR;
2574 }
2575 }
2576
2577 /* Fill in length of a previous callout, except when the next thing is
2578 a quantifier. */
2579
2580 is_quantifier = c == '*' || c == '+' || c == '?' ||
2581 (c == '{' && is_counted_repeat(ptr+1));
2582
2583 if (!is_quantifier && previous_callout != NULL &&
2584 after_manual_callout-- <= 0)
2585 {
2586 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2587 complete_callout(previous_callout, ptr, cd);
2588 previous_callout = NULL;
2589 }
2590
2591 /* In extended mode, skip white space and comments */
2592
2593 if ((options & PCRE_EXTENDED) != 0)
2594 {
2595 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2596 if (c == '#')
2597 {
2598 while (*(++ptr) != 0)
2599 {
2600 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2601 }
2602 if (*ptr != 0) continue;
2603
2604 /* Else fall through to handle end of string */
2605 c = 0;
2606 }
2607 }
2608
2609 /* No auto callout for quantifiers. */
2610
2611 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2612 {
2613 previous_callout = code;
2614 code = auto_callout(code, ptr, cd);
2615 }
2616
2617 switch(c)
2618 {
2619 /* ===================================================================*/
2620 case 0: /* The branch terminates at string end */
2621 case '|': /* or | or ) */
2622 case ')':
2623 *firstbyteptr = firstbyte;
2624 *reqbyteptr = reqbyte;
2625 *codeptr = code;
2626 *ptrptr = ptr;
2627 if (lengthptr != NULL)
2628 {
2629 if (OFLOW_MAX - *lengthptr < code - last_code)
2630 {
2631 *errorcodeptr = ERR20;
2632 goto FAILED;
2633 }
2634 *lengthptr += code - last_code; /* To include callout length */
2635 DPRINTF((">> end branch\n"));
2636 }
2637 return TRUE;
2638
2639
2640 /* ===================================================================*/
2641 /* Handle single-character metacharacters. In multiline mode, ^ disables
2642 the setting of any following char as a first character. */
2643
2644 case '^':
2645 if ((options & PCRE_MULTILINE) != 0)
2646 {
2647 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2648 }
2649 previous = NULL;
2650 *code++ = OP_CIRC;
2651 break;
2652
2653 case '$':
2654 previous = NULL;
2655 *code++ = OP_DOLL;
2656 break;
2657
2658 /* There can never be a first char if '.' is first, whatever happens about
2659 repeats. The value of reqbyte doesn't change either. */
2660
2661 case '.':
2662 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2663 zerofirstbyte = firstbyte;
2664 zeroreqbyte = reqbyte;
2665 previous = code;
2666 *code++ = OP_ANY;
2667 break;
2668
2669
2670 /* ===================================================================*/
2671 /* Character classes. If the included characters are all < 256, we build a
2672 32-byte bitmap of the permitted characters, except in the special case
2673 where there is only one such character. For negated classes, we build the
2674 map as usual, then invert it at the end. However, we use a different opcode
2675 so that data characters > 255 can be handled correctly.
2676
2677 If the class contains characters outside the 0-255 range, a different
2678 opcode is compiled. It may optionally have a bit map for characters < 256,
2679 but those above are explicitly listed afterwards. A flag byte tells
2680 whether the bitmap is present, and whether this is a negated class or not.
2681
2682 In JavaScript compatibility mode, an isolated ']' causes an error. In
2683 default (Perl) mode, it is treated as a data character. */
2684
2685 case ']':
2686 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2687 {
2688 *errorcodeptr = ERR64;
2689 goto FAILED;
2690 }
2691 goto NORMAL_CHAR;
2692
2693 case '[':
2694 previous = code;
2695
2696 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2697 they are encountered at the top level, so we'll do that too. */
2698
2699 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2700 check_posix_syntax(ptr, &tempptr))
2701 {
2702 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2703 goto FAILED;
2704 }
2705
2706 /* If the first character is '^', set the negation flag and skip it. Also,
2707 if the first few characters (either before or after ^) are \Q\E or \E we
2708 skip them too. This makes for compatibility with Perl. */
2709
2710 negate_class = FALSE;
2711 for (;;)
2712 {
2713 c = *(++ptr);
2714 if (c == '\\')
2715 {
2716 if (ptr[1] == 'E') ptr++;
2717 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2718 else break;
2719 }
2720 else if (!negate_class && c == '^')
2721 negate_class = TRUE;
2722 else break;
2723 }
2724
2725 /* If a class contains a negative special such as \S, we need to flip the
2726 negation flag at the end, so that support for characters > 255 works
2727 correctly (they are all included in the class). */
2728
2729 should_flip_negation = FALSE;
2730
2731 /* Keep a count of chars with values < 256 so that we can optimize the case
2732 of just a single character (as long as it's < 256). However, For higher
2733 valued UTF-8 characters, we don't yet do any optimization. */
2734
2735 class_charcount = 0;
2736 class_lastchar = -1;
2737
2738 /* Initialize the 32-char bit map to all zeros. We build the map in a
2739 temporary bit of memory, in case the class contains only 1 character (less
2740 than 256), because in that case the compiled code doesn't use the bit map.
2741 */
2742
2743 memset(classbits, 0, 32 * sizeof(uschar));
2744
2745 #ifdef SUPPORT_UTF8
2746 class_utf8 = FALSE; /* No chars >= 256 */
2747 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2748 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
2749 #endif
2750
2751 /* Process characters until ] is reached. By writing this as a "do" it
2752 means that an initial ] is taken as a data character. At the start of the
2753 loop, c contains the first byte of the character. */
2754
2755 if (c != 0) do
2756 {
2757 const uschar *oldptr;
2758
2759 #ifdef SUPPORT_UTF8
2760 if (utf8 && c > 127)
2761 { /* Braces are required because the */
2762 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2763 }
2764
2765 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2766 data and reset the pointer. This is so that very large classes that
2767 contain a zillion UTF-8 characters no longer overwrite the work space
2768 (which is on the stack). */
2769
2770 if (lengthptr != NULL)
2771 {
2772 *lengthptr += class_utf8data - class_utf8data_base;
2773 class_utf8data = class_utf8data_base;
2774 }
2775
2776 #endif
2777
2778 /* Inside \Q...\E everything is literal except \E */
2779
2780 if (inescq)
2781 {
2782 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2783 {
2784 inescq = FALSE; /* Reset literal state */
2785 ptr++; /* Skip the 'E' */
2786 continue; /* Carry on with next */
2787 }
2788 goto CHECK_RANGE; /* Could be range if \E follows */
2789 }
2790
2791 /* Handle POSIX class names. Perl allows a negation extension of the
2792 form [:^name:]. A square bracket that doesn't match the syntax is
2793 treated as a literal. We also recognize the POSIX constructions
2794 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2795 5.6 and 5.8 do. */
2796
2797 if (c == '[' &&
2798 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2799 check_posix_syntax(ptr, &tempptr))
2800 {
2801 BOOL local_negate = FALSE;
2802 int posix_class, taboffset, tabopt;
2803 register const uschar *cbits = cd->cbits;
2804 uschar pbits[32];
2805
2806 if (ptr[1] != ':')
2807 {
2808 *errorcodeptr = ERR31;
2809 goto FAILED;
2810 }
2811
2812 ptr += 2;
2813 if (*ptr == '^')
2814 {
2815 local_negate = TRUE;
2816 should_flip_negation = TRUE; /* Note negative special */
2817 ptr++;
2818 }
2819
2820 posix_class = check_posix_name(ptr, tempptr - ptr);
2821 if (posix_class < 0)
2822 {
2823 *errorcodeptr = ERR30;
2824 goto FAILED;
2825 }
2826
2827 /* If matching is caseless, upper and lower are converted to
2828 alpha. This relies on the fact that the class table starts with
2829 alpha, lower, upper as the first 3 entries. */
2830
2831 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2832 posix_class = 0;
2833
2834 /* We build the bit map for the POSIX class in a chunk of local store
2835 because we may be adding and subtracting from it, and we don't want to
2836 subtract bits that may be in the main map already. At the end we or the
2837 result into the bit map that is being built. */
2838
2839 posix_class *= 3;
2840
2841 /* Copy in the first table (always present) */
2842
2843 memcpy(pbits, cbits + posix_class_maps[posix_class],
2844 32 * sizeof(uschar));
2845
2846 /* If there is a second table, add or remove it as required. */
2847
2848 taboffset = posix_class_maps[posix_class + 1];
2849 tabopt = posix_class_maps[posix_class + 2];
2850
2851 if (taboffset >= 0)
2852 {
2853 if (tabopt >= 0)
2854 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2855 else
2856 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2857 }
2858
2859 /* Now see if we need to remove any special characters. An option
2860 value of 1 removes vertical space and 2 removes underscore. */
2861
2862 if (tabopt < 0) tabopt = -tabopt;
2863 if (tabopt == 1) pbits[1] &= ~0x3c;
2864 else if (tabopt == 2) pbits[11] &= 0x7f;
2865
2866 /* Add the POSIX table or its complement into the main table that is
2867 being built and we are done. */
2868
2869 if (local_negate)
2870 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2871 else
2872 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2873
2874 ptr = tempptr + 1;
2875 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2876 continue; /* End of POSIX syntax handling */
2877 }
2878
2879 /* Backslash may introduce a single character, or it may introduce one
2880 of the specials, which just set a flag. The sequence \b is a special
2881 case. Inside a class (and only there) it is treated as backspace.
2882 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2883 to 'or' into the one we are building. We assume they have more than one
2884 character in them, so set class_charcount bigger than one. */
2885
2886 if (c == '\\')
2887 {
2888 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2889 if (*errorcodeptr != 0) goto FAILED;
2890
2891 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2892 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2893 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2894 else if (-c == ESC_Q) /* Handle start of quoted string */
2895 {
2896 if (ptr[1] == '\\' && ptr[2] == 'E')
2897 {
2898 ptr += 2; /* avoid empty string */
2899 }
2900 else inescq = TRUE;
2901 continue;
2902 }
2903 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2904
2905 if (c < 0)
2906 {
2907 register const uschar *cbits = cd->cbits;
2908 class_charcount += 2; /* Greater than 1 is what matters */
2909
2910 /* Save time by not doing this in the pre-compile phase. */
2911
2912 if (lengthptr == NULL) switch (-c)
2913 {
2914 case ESC_d:
2915 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2916 continue;
2917
2918 case ESC_D:
2919 should_flip_negation = TRUE;
2920 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2921 continue;
2922
2923 case ESC_w:
2924 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2925 continue;
2926
2927 case ESC_W:
2928 should_flip_negation = TRUE;
2929 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2930 continue;
2931
2932 case ESC_s:
2933 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2934 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2935 continue;
2936
2937 case ESC_S:
2938 should_flip_negation = TRUE;
2939 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2940 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2941 continue;
2942
2943 default: /* Not recognized; fall through */
2944 break; /* Need "default" setting to stop compiler warning. */
2945 }
2946
2947 /* In the pre-compile phase, just do the recognition. */
2948
2949 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2950 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2951
2952 /* We need to deal with \H, \h, \V, and \v in both phases because
2953 they use extra memory. */
2954
2955 if (-c == ESC_h)
2956 {
2957 SETBIT(classbits, 0x09); /* VT */
2958 SETBIT(classbits, 0x20); /* SPACE */
2959 SETBIT(classbits, 0xa0); /* NSBP */
2960 #ifdef SUPPORT_UTF8
2961 if (utf8)
2962 {
2963 class_utf8 = TRUE;
2964 *class_utf8data++ = XCL_SINGLE;
2965 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2966 *class_utf8data++ = XCL_SINGLE;
2967 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2968 *class_utf8data++ = XCL_RANGE;
2969 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2970 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2971 *class_utf8data++ = XCL_SINGLE;
2972 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2973 *class_utf8data++ = XCL_SINGLE;
2974 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2975 *class_utf8data++ = XCL_SINGLE;
2976 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2977 }
2978 #endif
2979 continue;
2980 }
2981
2982 if (-c == ESC_H)
2983 {
2984 for (c = 0; c < 32; c++)
2985 {
2986 int x = 0xff;
2987 switch (c)
2988 {
2989 case 0x09/8: x ^= 1 << (0x09%8); break;
2990 case 0x20/8: x ^= 1 << (0x20%8); break;
2991 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2992 default: break;
2993 }
2994 classbits[c] |= x;
2995 }
2996
2997 #ifdef SUPPORT_UTF8
2998 if (utf8)
2999 {
3000 class_utf8 = TRUE;
3001 *class_utf8data++ = XCL_RANGE;
3002 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3003 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3004 *class_utf8data++ = XCL_RANGE;
3005 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3006 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3007 *class_utf8data++ = XCL_RANGE;
3008 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3009 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3010 *class_utf8data++ = XCL_RANGE;
3011 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3012 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3013 *class_utf8data++ = XCL_RANGE;
3014 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3015 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3016 *class_utf8data++ = XCL_RANGE;
3017 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3018 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3019 *class_utf8data++ = XCL_RANGE;
3020 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3021 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3022 }
3023 #endif
3024 continue;
3025 }
3026
3027 if (-c == ESC_v)
3028 {
3029 SETBIT(classbits, 0x0a); /* LF */
3030 SETBIT(classbits, 0x0b); /* VT */
3031 SETBIT(classbits, 0x0c); /* FF */
3032 SETBIT(classbits, 0x0d); /* CR */
3033 SETBIT(classbits, 0x85); /* NEL */
3034 #ifdef SUPPORT_UTF8
3035 if (utf8)
3036 {
3037 class_utf8 = TRUE;
3038 *class_utf8data++ = XCL_RANGE;
3039 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3040 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3041 }
3042 #endif
3043 continue;
3044 }
3045
3046 if (-c == ESC_V)
3047 {
3048 for (c = 0; c < 32; c++)
3049 {
3050 int x = 0xff;
3051 switch (c)
3052 {
3053 case 0x0a/8: x ^= 1 << (0x0a%8);
3054 x ^= 1 << (0x0b%8);
3055 x ^= 1 << (0x0c%8);
3056 x ^= 1 << (0x0d%8);
3057 break;
3058 case 0x85/8: x ^= 1 << (0x85%8); break;
3059 default: break;
3060 }
3061 classbits[c] |= x;
3062 }
3063
3064 #ifdef SUPPORT_UTF8
3065 if (utf8)
3066 {
3067 class_utf8 = TRUE;
3068 *class_utf8data++ = XCL_RANGE;
3069 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3070 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3071 *class_utf8data++ = XCL_RANGE;
3072 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3073 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3074 }
3075 #endif
3076 continue;
3077 }
3078
3079 /* We need to deal with \P and \p in both phases. */
3080
3081 #ifdef SUPPORT_UCP
3082 if (-c == ESC_p || -c == ESC_P)
3083 {
3084 BOOL negated;
3085 int pdata;
3086 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3087 if (ptype < 0) goto FAILED;
3088 class_utf8 = TRUE;
3089 *class_utf8data++ = ((-c == ESC_p) != negated)?
3090 XCL_PROP : XCL_NOTPROP;
3091 *class_utf8data++ = ptype;
3092 *class_utf8data++ = pdata;
3093 class_charcount -= 2; /* Not a < 256 character */
3094 continue;
3095 }
3096 #endif
3097 /* Unrecognized escapes are faulted if PCRE is running in its
3098 strict mode. By default, for compatibility with Perl, they are
3099 treated as literals. */
3100
3101 if ((options & PCRE_EXTRA) != 0)
3102 {
3103 *errorcodeptr = ERR7;
3104 goto FAILED;
3105 }
3106
3107 class_charcount -= 2; /* Undo the default count from above */
3108 c = *ptr; /* Get the final character and fall through */
3109 }
3110
3111 /* Fall through if we have a single character (c >= 0). This may be
3112 greater than 256 in UTF-8 mode. */
3113
3114 } /* End of backslash handling */
3115
3116 /* A single character may be followed by '-' to form a range. However,
3117 Perl does not permit ']' to be the end of the range. A '-' character
3118 at the end is treated as a literal. Perl ignores orphaned \E sequences
3119 entirely. The code for handling \Q and \E is messy. */
3120
3121 CHECK_RANGE:
3122 while (ptr[1] == '\\' && ptr[2] == 'E')
3123 {
3124 inescq = FALSE;
3125 ptr += 2;
3126 }
3127
3128 oldptr = ptr;
3129
3130 /* Remember \r or \n */
3131
3132 if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3133
3134 /* Check for range */
3135
3136 if (!inescq && ptr[1] == '-')
3137 {
3138 int d;
3139 ptr += 2;
3140 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3141
3142 /* If we hit \Q (not followed by \E) at this point, go into escaped
3143 mode. */
3144
3145 while (*ptr == '\\' && ptr[1] == 'Q')
3146 {
3147 ptr += 2;
3148 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3149 inescq = TRUE;
3150 break;
3151 }
3152
3153 if (*ptr == 0 || (!inescq && *ptr == ']'))
3154 {
3155 ptr = oldptr;
3156 goto LONE_SINGLE_CHARACTER;
3157 }
3158
3159 #ifdef SUPPORT_UTF8
3160 if (utf8)
3161 { /* Braces are required because the */
3162 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3163 }
3164 else
3165 #endif
3166 d = *ptr; /* Not UTF-8 mode */
3167
3168 /* The second part of a range can be a single-character escape, but
3169 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3170 in such circumstances. */
3171
3172 if (!inescq && d == '\\')
3173 {
3174 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3175 if (*errorcodeptr != 0) goto FAILED;
3176
3177 /* \b is backspace; \X is literal X; \R is literal R; any other
3178 special means the '-' was literal */
3179
3180 if (d < 0)
3181 {
3182 if (d == -ESC_b) d = '\b';
3183 else if (d == -ESC_X) d = 'X';
3184 else if (d == -ESC_R) d = 'R'; else
3185 {
3186 ptr = oldptr;
3187 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3188 }
3189 }
3190 }
3191
3192 /* Check that the two values are in the correct order. Optimize
3193 one-character ranges */
3194
3195 if (d < c)
3196 {
3197 *errorcodeptr = ERR8;
3198 goto FAILED;
3199 }
3200
3201 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3202
3203 /* Remember \r or \n */
3204
3205 if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3206
3207 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3208 matching, we have to use an XCLASS with extra data items. Caseless
3209 matching for characters > 127 is available only if UCP support is
3210 available. */
3211
3212 #ifdef SUPPORT_UTF8
3213 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3214 {
3215 class_utf8 = TRUE;
3216
3217 /* With UCP support, we can find the other case equivalents of
3218 the relevant characters. There may be several ranges. Optimize how
3219 they fit with the basic range. */
3220
3221 #ifdef SUPPORT_UCP
3222 if ((options & PCRE_CASELESS) != 0)
3223 {
3224 unsigned int occ, ocd;
3225 unsigned int cc = c;
3226 unsigned int origd = d;
3227 while (get_othercase_range(&cc, origd, &occ, &ocd))
3228 {
3229 if (occ >= (unsigned int)c &&
3230 ocd <= (unsigned int)d)
3231 continue; /* Skip embedded ranges */
3232
3233 if (occ < (unsigned int)c &&
3234 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3235 { /* if there is overlap, */
3236 c = occ; /* noting that if occ < c */
3237 continue; /* we can't have ocd > d */
3238 } /* because a subrange is */
3239 if (ocd > (unsigned int)d &&
3240 occ <= (unsigned int)d + 1) /* always shorter than */
3241 { /* the basic range. */
3242 d = ocd;
3243 continue;
3244 }
3245
3246 if (occ == ocd)
3247 {
3248 *class_utf8data++ = XCL_SINGLE;
3249 }
3250 else
3251 {
3252 *class_utf8data++ = XCL_RANGE;
3253 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3254 }
3255 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3256 }
3257 }
3258 #endif /* SUPPORT_UCP */
3259
3260 /* Now record the original range, possibly modified for UCP caseless
3261 overlapping ranges. */
3262
3263 *class_utf8data++ = XCL_RANGE;
3264 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3265 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3266
3267 /* With UCP support, we are done. Without UCP support, there is no
3268 caseless matching for UTF-8 characters > 127; we can use the bit map
3269 for the smaller ones. */
3270
3271 #ifdef SUPPORT_UCP
3272 continue; /* With next character in the class */
3273 #else
3274 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3275
3276 /* Adjust upper limit and fall through to set up the map */
3277
3278 d = 127;
3279
3280 #endif /* SUPPORT_UCP */
3281 }
3282 #endif /* SUPPORT_UTF8 */
3283
3284 /* We use the bit map for all cases when not in UTF-8 mode; else
3285 ranges that lie entirely within 0-127 when there is UCP support; else
3286 for partial ranges without UCP support. */
3287
3288 class_charcount += d - c + 1;
3289 class_lastchar = d;
3290
3291 /* We can save a bit of time by skipping this in the pre-compile. */
3292
3293 if (lengthptr == NULL) for (; c <= d; c++)
3294 {
3295 classbits[c/8] |= (1 << (c&7));
3296 if ((options & PCRE_CASELESS) != 0)
3297 {
3298 int uc = cd->fcc[c]; /* flip case */
3299 classbits[uc/8] |= (1 << (uc&7));
3300 }
3301 }
3302
3303 continue; /* Go get the next char in the class */
3304 }
3305
3306 /* Handle a lone single character - we can get here for a normal
3307 non-escape char, or after \ that introduces a single character or for an
3308 apparent range that isn't. */
3309
3310 LONE_SINGLE_CHARACTER:
3311
3312 /* Handle a character that cannot go in the bit map */
3313
3314 #ifdef SUPPORT_UTF8
3315 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3316 {
3317 class_utf8 = TRUE;
3318 *class_utf8data++ = XCL_SINGLE;
3319 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3320
3321 #ifdef SUPPORT_UCP
3322 if ((options & PCRE_CASELESS) != 0)
3323 {
3324 unsigned int othercase;
3325 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3326 {
3327 *class_utf8data++ = XCL_SINGLE;
3328 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3329 }
3330 }
3331 #endif /* SUPPORT_UCP */
3332
3333 }
3334 else
3335 #endif /* SUPPORT_UTF8 */
3336
3337 /* Handle a single-byte character */
3338 {
3339 classbits[c/8] |= (1 << (c&7));
3340 if ((options & PCRE_CASELESS) != 0)
3341 {
3342 c = cd->fcc[c]; /* flip case */
3343 classbits[c/8] |= (1 << (c&7));
3344 }
3345 class_charcount++;
3346 class_lastchar = c;
3347 }
3348 }
3349
3350 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3351
3352 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3353
3354 if (c == 0) /* Missing terminating ']' */
3355 {
3356 *errorcodeptr = ERR6;
3357 goto FAILED;
3358 }
3359
3360
3361 /* This code has been disabled because it would mean that \s counts as
3362 an explicit \r or \n reference, and that's not really what is wanted. Now
3363 we set the flag only if there is a literal "\r" or "\n" in the class. */
3364
3365 #if 0
3366 /* Remember whether \r or \n are in this class */
3367
3368 if (negate_class)
3369 {
3370 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3371 }
3372 else
3373 {
3374 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3375 }
3376 #endif
3377
3378
3379 /* If class_charcount is 1, we saw precisely one character whose value is
3380 less than 256. As long as there were no characters >= 128 and there was no
3381 use of \p or \P, in other words, no use of any XCLASS features, we can
3382 optimize.
3383
3384 In UTF-8 mode, we can optimize the negative case only if there were no
3385 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3386 operate on single-bytes only. This is an historical hangover. Maybe one day
3387 we can tidy these opcodes to handle multi-byte characters.
3388
3389 The optimization throws away the bit map. We turn the item into a
3390 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3391 that OP_NOT does not support multibyte characters. In the positive case, it
3392 can cause firstbyte to be set. Otherwise, there can be no first char if
3393 this item is first, whatever repeat count may follow. In the case of
3394 reqbyte, save the previous value for reinstating. */
3395
3396 #ifdef SUPPORT_UTF8
3397 if (class_charcount == 1 && !class_utf8 &&
3398 (!utf8 || !negate_class || class_lastchar < 128))
3399 #else
3400 if (class_charcount == 1)
3401 #endif
3402 {
3403 zeroreqbyte = reqbyte;
3404
3405 /* The OP_NOT opcode works on one-byte characters only. */
3406
3407 if (negate_class)
3408 {
3409 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3410 zerofirstbyte = firstbyte;
3411 *code++ = OP_NOT;
3412 *code++ = class_lastchar;
3413 break;
3414 }
3415
3416 /* For a single, positive character, get the value into mcbuffer, and
3417 then we can handle this with the normal one-character code. */
3418
3419 #ifdef SUPPORT_UTF8
3420 if (utf8 && class_lastchar > 127)
3421 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3422 else
3423 #endif
3424 {
3425 mcbuffer[0] = class_lastchar;
3426 mclength = 1;
3427 }
3428 goto ONE_CHAR;
3429 } /* End of 1-char optimization */
3430
3431 /* The general case - not the one-char optimization. If this is the first
3432 thing in the branch, there can be no first char setting, whatever the
3433 repeat count. Any reqbyte setting must remain unchanged after any kind of
3434 repeat. */
3435
3436 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3437 zerofirstbyte = firstbyte;
3438 zeroreqbyte = reqbyte;
3439
3440 /* If there are characters with values > 255, we have to compile an
3441 extended class, with its own opcode, unless there was a negated special
3442 such as \S in the class, because in that case all characters > 255 are in
3443 the class, so any that were explicitly given as well can be ignored. If
3444 (when there are explicit characters > 255 that must be listed) there are no
3445 characters < 256, we can omit the bitmap in the actual compiled code. */
3446
3447 #ifdef SUPPORT_UTF8
3448 if (class_utf8 && !should_flip_negation)
3449 {
3450 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3451 *code++ = OP_XCLASS;
3452 code += LINK_SIZE;
3453 *code = negate_class? XCL_NOT : 0;
3454
3455 /* If the map is required, move up the extra data to make room for it;
3456 otherwise just move the code pointer to the end of the extra data. */
3457
3458 if (class_charcount > 0)
3459 {
3460 *code++ |= XCL_MAP;
3461 memmove(code + 32, code, class_utf8data - code);
3462 memcpy(code, classbits, 32);
3463 code = class_utf8data + 32;
3464 }
3465 else code = class_utf8data;
3466
3467 /* Now fill in the complete length of the item */
3468
3469 PUT(previous, 1, code - previous);
3470 break; /* End of class handling */
3471 }
3472 #endif
3473
3474 /* If there are no characters > 255, set the opcode to OP_CLASS or
3475 OP_NCLASS, depending on whether the whole class was negated and whether
3476 there were negative specials such as \S in the class. Then copy the 32-byte
3477 map into the code vector, negating it if necessary. */
3478
3479 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3480 if (negate_class)
3481 {
3482 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3483 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3484 }
3485 else
3486 {
3487 memcpy(code, classbits, 32);
3488 }
3489 code += 32;
3490 break;
3491
3492
3493 /* ===================================================================*/
3494 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3495 has been tested above. */
3496
3497 case '{':
3498 if (!is_quantifier) goto NORMAL_CHAR;
3499 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3500 if (*errorcodeptr != 0) goto FAILED;
3501 goto REPEAT;
3502
3503 case '*':
3504 repeat_min = 0;
3505 repeat_max = -1;
3506 goto REPEAT;
3507
3508 case '+':
3509 repeat_min = 1;
3510 repeat_max = -1;
3511 goto REPEAT;
3512
3513 case '?':
3514 repeat_min = 0;
3515 repeat_max = 1;
3516
3517 REPEAT:
3518 if (previous == NULL)
3519 {
3520 *errorcodeptr = ERR9;
3521 goto FAILED;
3522 }
3523
3524 if (repeat_min == 0)
3525 {
3526 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3527 reqbyte = zeroreqbyte; /* Ditto */
3528 }
3529
3530 /* Remember whether this is a variable length repeat */
3531
3532 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3533
3534 op_type = 0; /* Default single-char op codes */
3535 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3536
3537 /* Save start of previous item, in case we have to move it up to make space
3538 for an inserted OP_ONCE for the additional '+' extension. */
3539
3540 tempcode = previous;
3541
3542 /* If the next character is '+', we have a possessive quantifier. This
3543 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3544 If the next character is '?' this is a minimizing repeat, by default,
3545 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3546 repeat type to the non-default. */
3547
3548 if (ptr[1] == '+')
3549 {
3550 repeat_type = 0; /* Force greedy */
3551 possessive_quantifier = TRUE;
3552 ptr++;
3553 }
3554 else if (ptr[1] == '?')
3555 {
3556 repeat_type = greedy_non_default;
3557 ptr++;
3558 }
3559 else repeat_type = greedy_default;
3560
3561 /* If previous was a character match, abolish the item and generate a
3562 repeat item instead. If a char item has a minimum of more than one, ensure
3563 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3564 the first thing in a branch because the x will have gone into firstbyte
3565 instead. */
3566
3567 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3568 {
3569 /* Deal with UTF-8 characters that take up more than one byte. It's
3570 easier to write this out separately than try to macrify it. Use c to
3571 hold the length of the character in bytes, plus 0x80 to flag that it's a
3572 length rather than a small character. */
3573
3574 #ifdef SUPPORT_UTF8
3575 if (utf8 && (code[-1] & 0x80) != 0)
3576 {
3577 uschar *lastchar = code - 1;
3578 while((*lastchar & 0xc0) == 0x80) lastchar--;
3579 c = code - lastchar; /* Length of UTF-8 character */
3580 memcpy(utf8_char, lastchar, c); /* Save the char */
3581 c |= 0x80; /* Flag c as a length */
3582 }
3583 else
3584 #endif
3585
3586 /* Handle the case of a single byte - either with no UTF8 support, or
3587 with UTF-8 disabled, or for a UTF-8 character < 128. */
3588
3589 {
3590 c = code[-1];
3591 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3592 }
3593
3594 /* If the repetition is unlimited, it pays to see if the next thing on
3595 the line is something that cannot possibly match this character. If so,
3596 automatically possessifying this item gains some performance in the case
3597 where the match fails. */
3598
3599 if (!possessive_quantifier &&
3600 repeat_max < 0 &&
3601 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3602 options, cd))
3603 {
3604 repeat_type = 0; /* Force greedy */
3605 possessive_quantifier = TRUE;
3606 }
3607
3608 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3609 }
3610
3611 /* If previous was a single negated character ([^a] or similar), we use
3612 one of the special opcodes, replacing it. The code is shared with single-
3613 character repeats by setting op_type to add a suitable offset into
3614 repeat_type. We can also test for auto-possessification. OP_NOT is
3615 currently used only for single-byte chars. */
3616
3617 else if (*previous == OP_NOT)
3618 {
3619 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3620 c = previous[1];
3621 if (!possessive_quantifier &&
3622 repeat_max < 0 &&
3623 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3624 {
3625 repeat_type = 0; /* Force greedy */
3626 possessive_quantifier = TRUE;
3627 }
3628 goto OUTPUT_SINGLE_REPEAT;
3629 }
3630
3631 /* If previous was a character type match (\d or similar), abolish it and
3632 create a suitable repeat item. The code is shared with single-character
3633 repeats by setting op_type to add a suitable offset into repeat_type. Note
3634 that the Unicode property types will be present only when SUPPORT_UCP is
3635 defined, but we don't wrap the little bits of code here because it just
3636 makes it horribly messy. */
3637
3638 else if (*previous < OP_EODN)
3639 {
3640 uschar *oldcode;
3641 int prop_type, prop_value;
3642 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3643 c = *previous;
3644
3645 if (!possessive_quantifier &&
3646 repeat_max < 0 &&
3647 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3648 {
3649 repeat_type = 0; /* Force greedy */
3650 possessive_quantifier = TRUE;
3651 }
3652
3653 OUTPUT_SINGLE_REPEAT:
3654 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3655 {
3656 prop_type = previous[1];
3657 prop_value = previous[2];
3658 }
3659 else prop_type = prop_value = -1;
3660
3661 oldcode = code;
3662 code = previous; /* Usually overwrite previous item */
3663
3664 /* If the maximum is zero then the minimum must also be zero; Perl allows
3665 this case, so we do too - by simply omitting the item altogether. */
3666
3667 if (repeat_max == 0) goto END_REPEAT;
3668
3669 /* All real repeats make it impossible to handle partial matching (maybe
3670 one day we will be able to remove this restriction). */
3671
3672 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3673
3674 /* Combine the op_type with the repeat_type */
3675
3676 repeat_type += op_type;
3677
3678 /* A minimum of zero is handled either as the special case * or ?, or as
3679 an UPTO, with the maximum given. */
3680
3681 if (repeat_min == 0)
3682 {
3683 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3684 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3685 else
3686 {
3687 *code++ = OP_UPTO + repeat_type;
3688 PUT2INC(code, 0, repeat_max);
3689 }
3690 }
3691
3692 /* A repeat minimum of 1 is optimized into some special cases. If the
3693 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3694 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3695 one less than the maximum. */
3696
3697 else if (repeat_min == 1)
3698 {
3699 if (repeat_max == -1)
3700 *code++ = OP_PLUS + repeat_type;
3701 else
3702 {
3703 code = oldcode; /* leave previous item in place */
3704 if (repeat_max == 1) goto END_REPEAT;
3705 *code++ = OP_UPTO + repeat_type;
3706 PUT2INC(code, 0, repeat_max - 1);
3707 }
3708 }
3709
3710 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3711 handled as an EXACT followed by an UPTO. */
3712
3713 else
3714 {
3715 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3716 PUT2INC(code, 0, repeat_min);
3717
3718 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3719 we have to insert the character for the previous code. For a repeated
3720 Unicode property match, there are two extra bytes that define the
3721 required property. In UTF-8 mode, long characters have their length in
3722 c, with the 0x80 bit as a flag. */
3723
3724 if (repeat_max < 0)
3725 {
3726 #ifdef SUPPORT_UTF8
3727 if (utf8 && c >= 128)
3728 {
3729 memcpy(code, utf8_char, c & 7);
3730 code += c & 7;
3731 }
3732 else
3733 #endif
3734 {
3735 *code++ = c;
3736 if (prop_type >= 0)
3737 {
3738 *code++ = prop_type;
3739 *code++ = prop_value;
3740 }
3741 }
3742 *code++ = OP_STAR + repeat_type;
3743 }
3744
3745 /* Else insert an UPTO if the max is greater than the min, again
3746 preceded by the character, for the previously inserted code. If the
3747 UPTO is just for 1 instance, we can use QUERY instead. */
3748
3749 else if (repeat_max != repeat_min)
3750 {
3751 #ifdef SUPPORT_UTF8
3752 if (utf8 && c >= 128)
3753 {
3754 memcpy(code, utf8_char, c & 7);
3755 code += c & 7;
3756 }
3757 else
3758 #endif
3759 *code++ = c;
3760 if (prop_type >= 0)
3761 {
3762 *code++ = prop_type;
3763 *code++ = prop_value;
3764 }
3765 repeat_max -= repeat_min;
3766
3767 if (repeat_max == 1)
3768 {
3769 *code++ = OP_QUERY + repeat_type;
3770 }
3771 else
3772 {
3773 *code++ = OP_UPTO + repeat_type;
3774 PUT2INC(code, 0, repeat_max);
3775 }
3776 }
3777 }
3778
3779 /* The character or character type itself comes last in all cases. */
3780
3781 #ifdef SUPPORT_UTF8
3782 if (utf8 && c >= 128)
3783 {
3784 memcpy(code, utf8_char, c & 7);
3785 code += c & 7;
3786 }
3787 else
3788 #endif
3789 *code++ = c;
3790
3791 /* For a repeated Unicode property match, there are two extra bytes that
3792 define the required property. */
3793
3794 #ifdef SUPPORT_UCP
3795 if (prop_type >= 0)
3796 {
3797 *code++ = prop_type;
3798 *code++ = prop_value;
3799 }
3800 #endif
3801 }
3802
3803 /* If previous was a character class or a back reference, we put the repeat
3804 stuff after it, but just skip the item if the repeat was {0,0}. */
3805
3806 else if (*previous == OP_CLASS ||
3807 *previous == OP_NCLASS ||
3808 #ifdef SUPPORT_UTF8
3809 *previous == OP_XCLASS ||
3810 #endif
3811 *previous == OP_REF)
3812 {
3813 if (repeat_max == 0)
3814 {
3815 code = previous;
3816 goto END_REPEAT;
3817 }
3818
3819 /* All real repeats make it impossible to handle partial matching (maybe
3820 one day we will be able to remove this restriction). */
3821
3822 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3823
3824 if (repeat_min == 0 && repeat_max == -1)
3825 *code++ = OP_CRSTAR + repeat_type;
3826 else if (repeat_min == 1 && repeat_max == -1)
3827 *code++ = OP_CRPLUS + repeat_type;
3828 else if (repeat_min == 0 && repeat_max == 1)
3829 *code++ = OP_CRQUERY + repeat_type;
3830 else
3831 {
3832 *code++ = OP_CRRANGE + repeat_type;
3833 PUT2INC(code, 0, repeat_min);
3834 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3835 PUT2INC(code, 0, repeat_max);
3836 }
3837 }
3838
3839 /* If previous was a bracket group, we may have to replicate it in certain
3840 cases. */
3841
3842 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3843 *previous == OP_ONCE || *previous == OP_COND)
3844 {
3845 register int i;
3846 int ketoffset = 0;
3847 int len = code - previous;
3848 uschar *bralink = NULL;
3849
3850 /* Repeating a DEFINE group is pointless */
3851
3852 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3853 {
3854 *errorcodeptr = ERR55;
3855 goto FAILED;
3856 }
3857
3858 /* If the maximum repeat count is unlimited, find the end of the bracket
3859 by scanning through from the start, and compute the offset back to it
3860 from the current code pointer. There may be an OP_OPT setting following
3861 the final KET, so we can't find the end just by going back from the code
3862 pointer. */
3863
3864 if (repeat_max == -1)
3865 {
3866 register uschar *ket = previous;
3867 do ket += GET(ket, 1); while (*ket != OP_KET);
3868 ketoffset = code - ket;
3869 }
3870
3871 /* The case of a zero minimum is special because of the need to stick
3872 OP_BRAZERO in front of it, and because the group appears once in the
3873 data, whereas in other cases it appears the minimum number of times. For
3874 this reason, it is simplest to treat this case separately, as otherwise
3875 the code gets far too messy. There are several special subcases when the
3876 minimum is zero. */
3877
3878 if (repeat_min == 0)
3879 {
3880 /* If the maximum is also zero, we used to just omit the group from the
3881 output altogether, like this:
3882
3883 ** if (repeat_max == 0)
3884 ** {
3885 ** code = previous;
3886 ** goto END_REPEAT;
3887 ** }
3888
3889 However, that fails when a group is referenced as a subroutine from
3890 elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
3891 so that it is skipped on execution. As we don't have a list of which
3892 groups are referenced, we cannot do this selectively.
3893
3894 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
3895 and do no more at this point. However, we do need to adjust any
3896 OP_RECURSE calls inside the group that refer to the group itself or any
3897 internal or forward referenced group, because the offset is from the
3898 start of the whole regex. Temporarily terminate the pattern while doing
3899 this. */
3900
3901 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
3902 {
3903 *code = OP_END;
3904 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3905 memmove(previous+1, previous, len);
3906 code++;
3907 if (repeat_max == 0)
3908 {
3909 *previous++ = OP_SKIPZERO;
3910 goto END_REPEAT;
3911 }
3912 *previous++ = OP_BRAZERO + repeat_type;
3913 }
3914
3915 /* If the maximum is greater than 1 and limited, we have to replicate
3916 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3917 The first one has to be handled carefully because it's the original
3918 copy, which has to be moved up. The remainder can be handled by code
3919 that is common with the non-zero minimum case below. We have to
3920 adjust the value of repeat_max, since one less copy is required. Once
3921 again, we may have to adjust any OP_RECURSE calls inside the group. */
3922
3923 else
3924 {
3925 int offset;
3926 *code = OP_END;
3927 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3928 memmove(previous + 2 + LINK_SIZE, previous, len);
3929 code += 2 + LINK_SIZE;
3930 *previous++ = OP_BRAZERO + repeat_type;
3931 *previous++ = OP_BRA;
3932
3933 /* We chain together the bracket offset fields that have to be
3934 filled in later when the ends of the brackets are reached. */
3935
3936 offset = (bralink == NULL)? 0 : previous - bralink;
3937 bralink = previous;
3938 PUTINC(previous, 0, offset);
3939 }
3940
3941 repeat_max--;
3942 }
3943
3944 /* If the minimum is greater than zero, replicate the group as many
3945 times as necessary, and adjust the maximum to the number of subsequent
3946 copies that we need. If we set a first char from the group, and didn't
3947 set a required char, copy the latter from the former. If there are any
3948 forward reference subroutine calls in the group, there will be entries on
3949 the workspace list; replicate these with an appropriate increment. */
3950
3951 else
3952 {
3953 if (repeat_min > 1)
3954 {
3955 /* In the pre-compile phase, we don't actually do the replication. We
3956 just adjust the length as if we had. Do some paranoid checks for
3957 potential integer overflow. */
3958
3959 if (lengthptr != NULL)
3960 {
3961 int delta = (repeat_min - 1)*length_prevgroup;
3962 if ((double)(repeat_min - 1)*(double)length_prevgroup >
3963 (double)INT_MAX ||
3964 OFLOW_MAX - *lengthptr < delta)
3965 {
3966 *errorcodeptr = ERR20;
3967 goto FAILED;
3968 }
3969 *lengthptr += delta;
3970 }
3971
3972 /* This is compiling for real */
3973
3974 else
3975 {
3976 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3977 for (i = 1; i < repeat_min; i++)
3978 {
3979 uschar *hc;
3980 uschar *this_hwm = cd->hwm;
3981 memcpy(code, previous, len);
3982 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3983 {
3984 PUT(cd->hwm, 0, GET(hc, 0) + len);
3985 cd->hwm += LINK_SIZE;
3986 }
3987 save_hwm = this_hwm;
3988 code += len;
3989 }
3990 }
3991 }
3992
3993 if (repeat_max > 0) repeat_max -= repeat_min;
3994 }
3995
3996 /* This code is common to both the zero and non-zero minimum cases. If
3997 the maximum is limited, it replicates the group in a nested fashion,
3998 remembering the bracket starts on a stack. In the case of a zero minimum,
3999 the first one was set up above. In all cases the repeat_max now specifies
4000 the number of additional copies needed. Again, we must remember to
4001 replicate entries on the forward reference list. */
4002
4003 if (repeat_max >= 0)
4004 {
4005 /* In the pre-compile phase, we don't actually do the replication. We
4006 just adjust the length as if we had. For each repetition we must add 1
4007 to the length for BRAZERO and for all but the last repetition we must
4008 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
4009 paranoid checks to avoid integer overflow. */
4010
4011 if (lengthptr != NULL && repeat_max > 0)
4012 {
4013 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4014 2 - 2*LINK_SIZE; /* Last one doesn't nest */
4015 if ((double)repeat_max *
4016 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4017 > (double)INT_MAX ||
4018 OFLOW_MAX - *lengthptr < delta)
4019 {
4020 *errorcodeptr = ERR20;
4021 goto FAILED;
4022 }
4023 *lengthptr += delta;
4024 }
4025
4026 /* This is compiling for real */
4027
4028 else for (i = repeat_max - 1; i >= 0; i--)
4029 {
4030 uschar *hc;
4031 uschar *this_hwm = cd->hwm;
4032
4033 *code++ = OP_BRAZERO + repeat_type;
4034
4035 /* All but the final copy start a new nesting, maintaining the
4036 chain of brackets outstanding. */
4037
4038 if (i != 0)
4039 {
4040 int offset;
4041 *code++ = OP_BRA;
4042 offset = (bralink == NULL)? 0 : code - bralink;
4043 bralink = code;
4044 PUTINC(code, 0, offset);
4045 }
4046
4047 memcpy(code, previous, len);
4048 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4049 {
4050 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4051 cd->hwm += LINK_SIZE;
4052 }
4053 save_hwm = this_hwm;
4054 code += len;
4055 }
4056
4057 /* Now chain through the pending brackets, and fill in their length
4058 fields (which are holding the chain links pro tem). */
4059
4060 while (bralink != NULL)
4061 {
4062 int oldlinkoffset;
4063 int offset = code - bralink + 1;
4064 uschar *bra = code - offset;
4065 oldlinkoffset = GET(bra, 1);
4066 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4067 *code++ = OP_KET;
4068 PUTINC(code, 0, offset);
4069 PUT(bra, 1, offset);
4070 }
4071 }
4072
4073 /* If the maximum is unlimited, set a repeater in the final copy. We
4074 can't just offset backwards from the current code point, because we
4075 don't know if there's been an options resetting after the ket. The
4076 correct offset was computed above.
4077
4078 Then, when we are doing the actual compile phase, check to see whether
4079 this group is a non-atomic one that could match an empty string. If so,
4080 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4081 that runtime checking can be done. [This check is also applied to
4082 atomic groups at runtime, but in a different way.] */
4083
4084 else
4085 {
4086 uschar *ketcode = code - ketoffset;
4087 uschar *bracode = ketcode - GET(ketcode, 1);
4088 *ketcode = OP_KETRMAX + repeat_type;
4089 if (lengthptr == NULL && *bracode != OP_ONCE)
4090 {
4091 uschar *scode = bracode;
4092 do
4093 {
4094 if (could_be_empty_branch(scode, ketcode, utf8))
4095 {
4096 *bracode += OP_SBRA - OP_BRA;
4097 break;
4098 }
4099 scode += GET(scode, 1);
4100 }
4101 while (*scode == OP_ALT);
4102 }
4103 }
4104 }
4105
4106 /* Else there's some kind of shambles */
4107
4108 else
4109 {
4110 *errorcodeptr = ERR11;
4111 goto FAILED;
4112 }
4113
4114 /* If the character following a repeat is '+', or if certain optimization
4115 tests above succeeded, possessive_quantifier is TRUE. For some of the
4116 simpler opcodes, there is a special alternative opcode for this. For
4117 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4118 The '+' notation is just syntactic sugar, taken from Sun's Java package,
4119 but the special opcodes can optimize it a bit. The repeated item starts at
4120 tempcode, not at previous, which might be the first part of a string whose
4121 (former) last char we repeated.
4122
4123 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4124 an 'upto' may follow. We skip over an 'exact' item, and then test the
4125 length of what remains before proceeding. */
4126
4127 if (possessive_quantifier)
4128 {
4129 int len;
4130 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4131 *tempcode == OP_NOTEXACT)
4132 tempcode += _pcre_OP_lengths[*tempcode] +
4133 ((*tempcode == OP_TYPEEXACT &&
4134 (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4135 len = code - tempcode;
4136 if (len > 0) switch (*tempcode)
4137 {
4138 case OP_STAR: *tempcode = OP_POSSTAR; break;
4139 case OP_PLUS: *tempcode = OP_POSPLUS; break;
4140 case OP_QUERY: *tempcode = OP_POSQUERY; break;
4141 case OP_UPTO: *tempcode = OP_POSUPTO; break;
4142
4143 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4144 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4145 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4146 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4147
4148 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4149 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4150 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4151 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4152
4153 default:
4154 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4155 code += 1 + LINK_SIZE;
4156 len += 1 + LINK_SIZE;
4157 tempcode[0] = OP_ONCE;
4158 *code++ = OP_KET;
4159 PUTINC(code, 0, len);
4160 PUT(tempcode, 1, len);
4161 break;
4162 }
4163 }
4164
4165 /* In all cases we no longer have a previous item. We also set the
4166 "follows varying string" flag for subsequently encountered reqbytes if
4167 it isn't already set and we have just passed a varying length item. */
4168
4169 END_REPEAT:
4170 previous = NULL;
4171 cd->req_varyopt |= reqvary;
4172 break;
4173
4174
4175 /* ===================================================================*/
4176 /* Start of nested parenthesized sub-expression, or comment or lookahead or
4177 lookbehind or option setting or condition or all the other extended
4178 parenthesis forms. */
4179
4180 case '(':
4181 newoptions = options;
4182 skipbytes = 0;
4183 bravalue = OP_CBRA;
4184 save_hwm = cd->hwm;
4185 reset_bracount = FALSE;
4186
4187 /* First deal with various "verbs" that can be introduced by '*'. */
4188
4189 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4190 {
4191 int i, namelen;
4192 const char *vn = verbnames;
4193 const uschar *name = ++ptr;
4194 previous = NULL;
4195 while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
4196 if (*ptr == ':')
4197 {
4198 *errorcodeptr = ERR59; /* Not supported */
4199 goto FAILED;
4200 }
4201 if (*ptr != ')')
4202 {
4203 *errorcodeptr = ERR60;
4204 goto FAILED;
4205 }
4206 namelen = ptr - name;
4207 for (i = 0; i < verbcount; i++)
4208 {
4209 if (namelen == verbs[i].len &&
4210 strncmp((char *)name, vn, namelen) == 0)
4211 {
4212 *code = verbs[i].op;
4213 if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4214 break;
4215 }
4216 vn += verbs[i].len + 1;
4217 }
4218 if (i < verbcount) continue;
4219 *errorcodeptr = ERR60;
4220 goto FAILED;
4221 }
4222
4223 /* Deal with the extended parentheses; all are introduced by '?', and the
4224 appearance of any of them means that this is not a capturing group. */
4225
4226 else if (*ptr == '?')
4227 {
4228 int i, set, unset, namelen;
4229 int *optset;
4230 const uschar *name;
4231 uschar *slot;
4232
4233 switch (*(++ptr))
4234 {
4235 case '#': /* Comment; skip to ket */
4236 ptr++;
4237 while (*ptr != 0 && *ptr != ')') ptr++;
4238 if (*ptr == 0)
4239 {
4240 *errorcodeptr = ERR18;
4241 goto FAILED;
4242 }
4243 continue;
4244
4245
4246 /* ------------------------------------------------------------ */
4247 case '|': /* Reset capture count for each branch */
4248 reset_bracount = TRUE;
4249 /* Fall through */
4250
4251 /* ------------------------------------------------------------ */
4252 case ':': /* Non-capturing bracket */
4253 bravalue = OP_BRA;
4254 ptr++;
4255 break;
4256
4257
4258 /* ------------------------------------------------------------ */
4259 case '(':
4260 bravalue = OP_COND; /* Conditional group */
4261
4262 /* A condition can be an assertion, a number (referring to a numbered
4263 group), a name (referring to a named group), or 'R', referring to
4264 recursion. R<digits> and R&name are also permitted for recursion tests.
4265
4266 There are several syntaxes for testing a named group: (?(name)) is used
4267 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4268
4269 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4270 be the recursive thing or the name 'R' (and similarly for 'R' followed
4271 by digits), and (b) a number could be a name that consists of digits.
4272 In both cases, we look for a name first; if not found, we try the other
4273 cases. */
4274
4275 /* For conditions that are assertions, check the syntax, and then exit
4276 the switch. This will take control down to where bracketed groups,
4277 including assertions, are processed. */
4278
4279 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4280 break;
4281
4282 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4283 below), and all need to skip 3 bytes at the start of the group. */
4284
4285 code[1+LINK_SIZE] = OP_CREF;
4286 skipbytes = 3;
4287 refsign = -1;
4288
4289 /* Check for a test for recursion in a named group. */
4290
4291 if (ptr[1] == 'R' && ptr[2] == '&')
4292 {
4293 terminator = -1;
4294 ptr += 2;
4295 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4296 }
4297
4298 /* Check for a test for a named group's having been set, using the Perl
4299 syntax (?(<name>) or (?('name') */
4300
4301 else if (ptr[1] == '<')
4302 {
4303 terminator = '>';
4304 ptr++;
4305 }
4306 else if (ptr[1] == '\'')
4307 {
4308 terminator = '\'';
4309 ptr++;
4310 }
4311 else
4312 {
4313 terminator = 0;
4314 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4315 }
4316
4317 /* We now expect to read a name; anything else is an error */
4318
4319 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4320 {
4321 ptr += 1; /* To get the right offset */
4322 *errorcodeptr = ERR28;
4323 goto FAILED;
4324 }
4325
4326 /* Read the name, but also get it as a number if it's all digits */
4327
4328 recno = 0;
4329 name = ++ptr;
4330 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4331 {
4332 if (recno >= 0)
4333 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4334 recno * 10 + *ptr - '0' : -1;
4335 ptr++;
4336 }
4337 namelen = ptr - name;
4338
4339 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4340 {
4341 ptr--; /* Error offset */
4342 *errorcodeptr = ERR26;
4343 goto FAILED;
4344 }
4345
4346 /* Do no further checking in the pre-compile phase. */
4347
4348 if (lengthptr != NULL) break;
4349
4350 /* In the real compile we do the work of looking for the actual
4351 reference. If the string started with "+" or "-" we require the rest to
4352 be digits, in which case recno will be set. */
4353
4354 if (refsign > 0)
4355 {
4356 if (recno <= 0)
4357 {
4358 *errorcodeptr = ERR58;
4359 goto FAILED;
4360 }
4361 recno = (refsign == '-')?
4362 cd->bracount - recno + 1 : recno +cd->bracount;
4363 if (recno <= 0 || recno > cd->final_bracount)
4364 {
4365 *errorcodeptr = ERR15;
4366 goto FAILED;
4367 }
4368 PUT2(code, 2+LINK_SIZE, recno);
4369 break;
4370 }
4371
4372 /* Otherwise (did not start with "+" or "-"), start by looking for the
4373 name. */
4374
4375 slot = cd->name_table;
4376 for (i = 0; i < cd->names_found; i++)
4377 {
4378 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4379 slot += cd->name_entry_size;
4380 }
4381
4382 /* Found a previous named subpattern */
4383
4384 if (i < cd->names_found)
4385 {
4386 recno = GET2(slot, 0);
4387 PUT2(code, 2+LINK_SIZE, recno);
4388 }
4389
4390 /* Search the pattern for a forward reference */
4391
4392 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4393 (options & PCRE_EXTENDED) != 0)) > 0)
4394 {
4395 PUT2(code, 2+LINK_SIZE, i);
4396 }
4397
4398 /* If terminator == 0 it means that the name followed directly after
4399 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4400 some further alternatives to try. For the cases where terminator != 0
4401 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4402 now checked all the possibilities, so give an error. */
4403
4404 else if (terminator != 0)
4405 {
4406 *errorcodeptr = ERR15;
4407 goto FAILED;
4408 }
4409
4410 /* Check for (?(R) for recursion. Allow digits after R to specify a
4411 specific group number. */
4412
4413 else if (*name == 'R')
4414 {
4415 recno = 0;
4416 for (i = 1; i < namelen; i++)
4417 {
4418 if ((digitab[name[i]] & ctype_digit) == 0)
4419 {
4420 *errorcodeptr = ERR15;
4421 goto FAILED;
4422 }
4423 recno = recno * 10 + name[i] - '0';
4424 }
4425 if (recno == 0) recno = RREF_ANY;
4426 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4427 PUT2(code, 2+LINK_SIZE, recno);
4428 }
4429
4430 /* Similarly, check for the (?(DEFINE) "condition", which is always
4431 false. */
4432
4433 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4434 {
4435 code[1+LINK_SIZE] = OP_DEF;
4436 skipbytes = 1;
4437 }
4438
4439 /* Check for the "name" actually being a subpattern number. We are
4440 in the second pass here, so final_bracount is set. */
4441
4442 else if (recno > 0 && recno <= cd->final_bracount)
4443 {
4444 PUT2(code, 2+LINK_SIZE, recno);
4445 }
4446
4447 /* Either an unidentified subpattern, or a reference to (?(0) */
4448
4449 else
4450 {
4451 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4452 goto FAILED;
4453 }
4454 break;
4455
4456
4457 /* ------------------------------------------------------------ */
4458 case '=': /* Positive lookahead */
4459 bravalue = OP_ASSERT;
4460 ptr++;
4461 break;
4462
4463
4464 /* ------------------------------------------------------------ */
4465 case '!': /* Negative lookahead */
4466 ptr++;
4467 if (*ptr == ')') /* Optimize (?!) */
4468 {
4469 *code++ = OP_FAIL;
4470 previous = NULL;
4471 continue;
4472 }
4473 bravalue = OP_ASSERT_NOT;
4474 break;
4475
4476
4477 /* ------------------------------------------------------------ */
4478 case '<': /* Lookbehind or named define */
4479 switch (ptr[1])
4480 {
4481 case '=': /* Positive lookbehind */
4482 bravalue = OP_ASSERTBACK;
4483 ptr += 2;
4484 break;
4485
4486 case '!': /* Negative lookbehind */
4487 bravalue = OP_ASSERTBACK_NOT;
4488 ptr += 2;
4489 break;
4490
4491 default: /* Could be name define, else bad */
4492 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4493 ptr++; /* Correct offset for error */
4494 *errorcodeptr = ERR24;
4495 goto FAILED;
4496 }
4497 break;
4498
4499
4500 /* ------------------------------------------------------------ */
4501 case '>': /* One-time brackets */
4502 bravalue = OP_ONCE;
4503 ptr++;
4504 break;
4505
4506
4507 /* ------------------------------------------------------------ */
4508 case 'C': /* Callout - may be followed by digits; */
4509 previous_callout = code; /* Save for later completion */
4510 after_manual_callout = 1; /* Skip one item before completing */
4511 *code++ = OP_CALLOUT;
4512 {
4513 int n = 0;
4514 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4515 n = n * 10 + *ptr - '0';
4516 if (*ptr != ')')
4517 {
4518 *errorcodeptr = ERR39;
4519 goto FAILED;
4520 }
4521 if (n > 255)
4522 {
4523 *errorcodeptr = ERR38;
4524 goto FAILED;
4525 }
4526 *code++ = n;
4527 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4528 PUT(code, LINK_SIZE, 0); /* Default length */
4529 code += 2 * LINK_SIZE;
4530 }
4531 previous = NULL;
4532 continue;
4533
4534
4535 /* ------------------------------------------------------------ */
4536 case 'P': /* Python-style named subpattern handling */
4537 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4538 {
4539 is_recurse = *ptr == '>';
4540 terminator = ')';
4541 goto NAMED_REF_OR_RECURSE;
4542 }
4543 else if (*ptr != '<') /* Test for Python-style definition */
4544 {
4545 *errorcodeptr = ERR41;
4546 goto FAILED;
4547 }
4548 /* Fall through to handle (?P< as (?< is handled */
4549
4550
4551 /* ------------------------------------------------------------ */
4552 DEFINE_NAME: /* Come here from (?< handling */
4553 case '\'':
4554 {
4555 terminator = (*ptr == '<')? '>' : '\'';
4556 name = ++ptr;
4557
4558 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4559 namelen = ptr - name;
4560
4561 /* In the pre-compile phase, just do a syntax check. */
4562
4563 if (lengthptr != NULL)
4564 {
4565 if (*ptr != terminator)
4566 {
4567 *errorcodeptr = ERR42;
4568 goto FAILED;
4569 }
4570 if (cd->names_found >= MAX_NAME_COUNT)
4571 {
4572 *errorcodeptr = ERR49;
4573 goto FAILED;
4574 }
4575 if (namelen + 3 > cd->name_entry_size)
4576 {
4577 cd->name_entry_size = namelen + 3;
4578 if (namelen > MAX_NAME_SIZE)
4579 {
4580 *errorcodeptr = ERR48;
4581 goto FAILED;
4582 }
4583 }
4584 }
4585
4586 /* In the real compile, create the entry in the table */
4587
4588 else
4589 {
4590 slot = cd->name_table;
4591 for (i = 0; i < cd->names_found; i++)
4592 {
4593 int crc = memcmp(name, slot+2, namelen);
4594 if (crc == 0)
4595 {
4596 if (slot[2+namelen] == 0)
4597 {
4598 if ((options & PCRE_DUPNAMES) == 0)
4599 {
4600 *errorcodeptr = ERR43;
4601 goto FAILED;
4602 }
4603 }
4604 else crc = -1; /* Current name is substring */
4605 }
4606 if (crc < 0)
4607 {
4608 memmove(slot + cd->name_entry_size, slot,
4609 (cd->names_found - i) * cd->name_entry_size);
4610 break;
4611 }
4612 slot += cd->name_entry_size;
4613 }
4614
4615 PUT2(slot, 0, cd->bracount + 1);
4616 memcpy(slot + 2, name, namelen);
4617 slot[2+namelen] = 0;
4618 }
4619 }
4620
4621 /* In both cases, count the number of names we've encountered. */
4622
4623 ptr++; /* Move past > or ' */
4624 cd->names_found++;
4625 goto NUMBERED_GROUP;
4626
4627
4628 /* ------------------------------------------------------------ */
4629 case '&': /* Perl recursion/subroutine syntax */
4630 terminator = ')';
4631 is_recurse = TRUE;
4632 /* Fall through */
4633
4634 /* We come here from the Python syntax above that handles both
4635 references (?P=name) and recursion (?P>name), as well as falling
4636 through from the Perl recursion syntax (?&name). We also come here from
4637 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4638 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
4639
4640 NAMED_REF_OR_RECURSE:
4641 name = ++ptr;
4642 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4643 namelen = ptr - name;
4644
4645 /* In the pre-compile phase, do a syntax check and set a dummy
4646 reference number. */
4647
4648 if (lengthptr != NULL)
4649 {
4650 if (namelen == 0)
4651 {
4652 *errorcodeptr = ERR62;
4653 goto FAILED;
4654 }
4655 if (*ptr != terminator)
4656 {
4657 *errorcodeptr = ERR42;
4658 goto FAILED;
4659 }
4660 if (namelen > MAX_NAME_SIZE)
4661 {
4662 *errorcodeptr = ERR48;
4663 goto FAILED;
4664 }
4665 recno = 0;
4666 }
4667
4668 /* In the real compile, seek the name in the table. We check the name
4669 first, and then check that we have reached the end of the name in the
4670 table. That way, if the name is longer than any in the table,
4671 the comparison will fail without reading beyond the table entry. */
4672
4673 else
4674 {
4675 slot = cd->name_table;
4676 for (i = 0; i < cd->names_found; i++)
4677 {
4678 if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
4679 slot[2+namelen] == 0)
4680 break;
4681 slot += cd->name_entry_size;
4682 }
4683
4684 if (i < cd->names_found) /* Back reference */
4685 {
4686 recno = GET2(slot, 0);
4687 }
4688 else if ((recno = /* Forward back reference */
4689 find_parens(ptr, cd->bracount, name, namelen,
4690 (options & PCRE_EXTENDED) != 0)) <= 0)
4691 {
4692 *errorcodeptr = ERR15;
4693 goto FAILED;
4694 }
4695 }
4696
4697 /* In both phases, we can now go to the code that handles numerical
4698 recursion or backreferences. */
4699
4700 if (is_recurse) goto HANDLE_RECURSION;
4701 else goto HANDLE_REFERENCE;
4702
4703
4704 /* ------------------------------------------------------------ */
4705 case 'R': /* Recursion */
4706 ptr++; /* Same as (?0) */
4707 /* Fall through */
4708
4709
4710 /* ------------------------------------------------------------ */
4711 case '-': case '+':
4712 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4713 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4714 {
4715 const uschar *called;
4716 terminator = ')';
4717
4718 /* Come here from the \g<...> and \g'...' code (Oniguruma
4719 compatibility). However, the syntax has been checked to ensure that
4720 the ... are a (signed) number, so that neither ERR63 nor ERR29 will
4721 be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
4722 ever be taken. */
4723
4724 HANDLE_NUMERICAL_RECURSION:
4725
4726 if ((refsign = *ptr) == '+')
4727 {
4728 ptr++;
4729 if ((digitab[*ptr] & ctype_digit) == 0)
4730 {
4731 *errorcodeptr = ERR63;
4732 goto FAILED;
4733 }
4734 }
4735 else if (refsign == '-')
4736 {
4737 if ((digitab[ptr[1]] & ctype_digit) == 0)
4738 goto OTHER_CHAR_AFTER_QUERY;
4739 ptr++;
4740 }
4741
4742 recno = 0;
4743 while((digitab[*ptr] & ctype_digit) != 0)
4744 recno = recno * 10 + *ptr++ - '0';
4745
4746 if (*ptr != terminator)
4747 {
4748 *errorcodeptr = ERR29;
4749 goto FAILED;
4750 }
4751
4752 if (refsign == '-')
4753 {
4754 if (recno == 0)
4755 {
4756 *errorcodeptr = ERR58;
4757 goto FAILED;
4758 }
4759 recno = cd->bracount - recno + 1;
4760 if (recno <= 0)
4761 {
4762 *errorcodeptr = ERR15;
4763 goto FAILED;
4764 }
4765 }
4766 else if (refsign == '+')
4767 {
4768 if (recno == 0)
4769 {
4770 *errorcodeptr = ERR58;
4771 goto FAILED;
4772 }
4773 recno += cd->bracount;
4774 }
4775
4776 /* Come here from code above that handles a named recursion */
4777
4778 HANDLE_RECURSION:
4779
4780 previous = code;
4781 called = cd->start_code;
4782
4783 /* When we are actually compiling, find the bracket that is being
4784 referenced. Temporarily end the regex in case it doesn't exist before
4785 this point. If we end up with a forward reference, first check that
4786 the bracket does occur later so we can give the error (and position)
4787 now. Then remember this forward reference in the workspace so it can
4788 be filled in at the end. */
4789
4790 if (lengthptr == NULL)
4791 {
4792 *code = OP_END;
4793 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4794
4795 /* Forward reference */
4796
4797 if (called == NULL)
4798 {
4799 if (find_parens(ptr, cd->bracount, NULL, recno,
4800 (options & PCRE_EXTENDED) != 0) < 0)
4801 {
4802 *errorcodeptr = ERR15;
4803 goto FAILED;
4804 }
4805 called = cd->start_code + recno;
4806 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4807 }
4808
4809 /* If not a forward reference, and the subpattern is still open,
4810 this is a recursive call. We check to see if this is a left
4811 recursion that could loop for ever, and diagnose that case. */
4812
4813 else if (GET(called, 1) == 0 &&
4814 could_be_empty(called, code, bcptr, utf8))
4815 {
4816 *errorcodeptr = ERR40;
4817 goto FAILED;
4818 }
4819 }
4820
4821 /* Insert the recursion/subroutine item, automatically wrapped inside
4822 "once" brackets. Set up a "previous group" length so that a
4823 subsequent quantifier will work. */
4824
4825 *code = OP_ONCE;
4826 PUT(code, 1, 2 + 2*LINK_SIZE);
4827 code += 1 + LINK_SIZE;
4828
4829 *code = OP_RECURSE;
4830 PUT(code, 1, called - cd->start_code);
4831 code += 1 + LINK_SIZE;
4832
4833 *code = OP_KET;
4834 PUT(code, 1, 2 + 2*LINK_SIZE);
4835 code += 1 + LINK_SIZE;
4836
4837 length_prevgroup = 3 + 3*LINK_SIZE;
4838 }
4839
4840 /* Can't determine a first byte now */
4841
4842 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4843 continue;
4844
4845
4846 /* ------------------------------------------------------------ */
4847 default: /* Other characters: check option setting */
4848 OTHER_CHAR_AFTER_QUERY:
4849 set = unset = 0;
4850 optset = &set;
4851
4852 while (*ptr != ')' && *ptr != ':')
4853 {
4854 switch (*ptr++)
4855 {
4856 case '-': optset = &unset; break;
4857
4858 case 'J': /* Record that it changed in the external options */
4859 *optset |= PCRE_DUPNAMES;
4860 cd->external_flags |= PCRE_JCHANGED;
4861 break;
4862
4863 case 'i': *optset |= PCRE_CASELESS; break;
4864 case 'm': *optset |= PCRE_MULTILINE; break;
4865 case 's': *optset |= PCRE_DOTALL; break;
4866 case 'x': *optset |= PCRE_EXTENDED; break;
4867 case 'U': *optset |= PCRE_UNGREEDY; break;
4868 case 'X': *optset |= PCRE_EXTRA; break;
4869
4870 default: *errorcodeptr = ERR12;
4871 ptr--; /* Correct the offset */
4872 goto FAILED;
4873 }
4874 }
4875
4876 /* Set up the changed option bits, but don't change anything yet. */
4877
4878 newoptions = (options | set) & (~unset);
4879
4880 /* If the options ended with ')' this is not the start of a nested
4881 group with option changes, so the options change at this level. If this
4882 item is right at the start of the pattern, the options can be
4883 abstracted and made external in the pre-compile phase, and ignored in
4884 the compile phase. This can be helpful when matching -- for instance in
4885 caseless checking of required bytes.
4886
4887 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4888 definitely *not* at the start of the pattern because something has been
4889 compiled. In the pre-compile phase, however, the code pointer can have
4890 that value after the start, because it gets reset as code is discarded
4891 during the pre-compile. However, this can happen only at top level - if
4892 we are within parentheses, the starting BRA will still be present. At
4893 any parenthesis level, the length value can be used to test if anything
4894 has been compiled at that level. Thus, a test for both these conditions
4895 is necessary to ensure we correctly detect the start of the pattern in
4896 both phases.
4897
4898 If we are not at the pattern start, compile code to change the ims
4899 options if this setting actually changes any of them. We also pass the
4900 new setting back so that it can be put at the start of any following
4901 branches, and when this group ends (if we are in a group), a resetting
4902 item can be compiled. */
4903
4904 if (*ptr == ')')
4905 {
4906 if (code == cd->start_code + 1 + LINK_SIZE &&
4907 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4908 {
4909 cd->external_options = newoptions;
4910 options = newoptions;
4911 }
4912 else
4913 {
4914 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4915 {
4916 *code++ = OP_OPT;
4917 *code++ = newoptions & PCRE_IMS;
4918 }
4919
4920 /* Change options at this level, and pass them back for use
4921 in subsequent branches. Reset the greedy defaults and the case
4922 value for firstbyte and reqbyte. */
4923
4924 *optionsptr = options = newoptions;
4925 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4926 greedy_non_default = greedy_default ^ 1;
4927 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4928 }
4929
4930 previous = NULL; /* This item can't be repeated */
4931 continue; /* It is complete */
4932 }
4933
4934 /* If the options ended with ':' we are heading into a nested group
4935 with possible change of options. Such groups are non-capturing and are
4936 not assertions of any kind. All we need to do is skip over the ':';
4937 the newoptions value is handled below. */
4938
4939 bravalue = OP_BRA;
4940 ptr++;
4941 } /* End of switch for character following (? */
4942 } /* End of (? handling */
4943
4944 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4945 all unadorned brackets become non-capturing and behave like (?:...)
4946 brackets. */
4947
4948 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4949 {
4950 bravalue = OP_BRA;
4951 }
4952
4953 /* Else we have a capturing group. */
4954
4955 else
4956 {
4957 NUMBERED_GROUP:
4958 cd->bracount += 1;
4959 PUT2(code, 1+LINK_SIZE, cd->bracount);
4960 skipbytes = 2;
4961 }
4962
4963 /* Process nested bracketed regex. Assertions may not be repeated, but
4964 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4965 non-register variable in order to be able to pass its address because some
4966 compilers complain otherwise. Pass in a new setting for the ims options if
4967 they have changed. */
4968
4969 previous = (bravalue >= OP_ONCE)? code : NULL;
4970 *code = bravalue;
4971 tempcode = code;
4972 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4973 length_prevgroup = 0; /* Initialize for pre-compile phase */
4974
4975 if (!compile_regex(
4976 newoptions, /* The complete new option state */
4977 options & PCRE_IMS, /* The previous ims option state */
4978 &tempcode, /* Where to put code (updated) */
4979 &ptr, /* Input pointer (updated) */
4980 errorcodeptr, /* Where to put an error message */
4981 (bravalue == OP_ASSERTBACK ||
4982 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4983 reset_bracount, /* True if (?| group */
4984 skipbytes, /* Skip over bracket number */
4985 &subfirstbyte, /* For possible first char */
4986 &subreqbyte, /* For possible last char */
4987 bcptr, /* Current branch chain */
4988 cd, /* Tables block */
4989 (lengthptr == NULL)? NULL : /* Actual compile phase */
4990 &length_prevgroup /* Pre-compile phase */
4991 ))
4992 goto FAILED;
4993
4994 /* At the end of compiling, code is still pointing to the start of the
4995 group, while tempcode has been updated to point past the end of the group
4996 and any option resetting that may follow it. The pattern pointer (ptr)
4997 is on the bracket. */
4998
4999 /* If this is a conditional bracket, check that there are no more than
5000 two branches in the group, or just one if it's a DEFINE group. We do this
5001 in the real compile phase, not in the pre-pass, where the whole group may
5002 not be available. */
5003
5004 if (bravalue == OP_COND && lengthptr == NULL)
5005 {
5006 uschar *tc = code;
5007 int condcount = 0;
5008
5009 do {
5010 condcount++;
5011 tc += GET(tc,1);
5012 }
5013 while (*tc != OP_KET);
5014
5015 /* A DEFINE group is never obeyed inline (the "condition" is always
5016 false). It must have only one branch. */
5017
5018 if (code[LINK_SIZE+1] == OP_DEF)
5019 {
5020 if (condcount > 1)
5021 {
5022 *errorcodeptr = ERR54;
5023 goto FAILED;
5024 }
5025 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
5026 }
5027
5028 /* A "normal" conditional group. If there is just one branch, we must not
5029 make use of its firstbyte or reqbyte, because this is equivalent to an
5030 empty second branch. */
5031
5032 else
5033 {
5034 if (condcount > 2)
5035 {
5036 *errorcodeptr = ERR27;
5037 goto FAILED;
5038 }
5039 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
5040 }
5041 }
5042
5043 /* Error if hit end of pattern */
5044
5045 if (*ptr != ')')
5046 {
5047 *errorcodeptr = ERR14;
5048 goto FAILED;
5049 }
5050
5051 /* In the pre-compile phase, update the length by the length of the group,
5052 less the brackets at either end. Then reduce the compiled code to just a
5053 set of non-capturing brackets so that it doesn't use much memory if it is
5054 duplicated by a quantifier.*/
5055
5056 if (lengthptr != NULL)
5057 {
5058 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
5059 {
5060 *errorcodeptr = ERR20;
5061 goto FAILED;
5062 }
5063 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5064 *code++ = OP_BRA;
5065 PUTINC(code, 0, 1 + LINK_SIZE);
5066 *code++ = OP_KET;
5067 PUTINC(code, 0, 1 + LINK_SIZE);
5068 break; /* No need to waste time with special character handling */
5069 }
5070
5071 /* Otherwise update the main code pointer to the end of the group. */
5072
5073 code = tempcode;
5074
5075 /* For a DEFINE group, required and first character settings are not
5076 relevant. */
5077
5078 if (bravalue == OP_DEF) break;
5079
5080 /* Handle updating of the required and first characters for other types of
5081 group. Update for normal brackets of all kinds, and conditions with two
5082 branches (see code above). If the bracket is followed by a quantifier with
5083 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5084 zerofirstbyte outside the main loop so that they can be accessed for the
5085 back off. */
5086
5087 zeroreqbyte = reqbyte;
5088 zerofirstbyte = firstbyte;
5089 groupsetfirstbyte = FALSE;
5090
5091 if (bravalue >= OP_ONCE)
5092 {
5093 /* If we have not yet set a firstbyte in this branch, take it from the
5094 subpattern, remembering that it was set here so that a repeat of more
5095 than one can replicate it as reqbyte if necessary. If the subpattern has
5096 no firstbyte, set "none" for the whole branch. In both cases, a zero
5097 repeat forces firstbyte to "none". */
5098
5099 if (firstbyte == REQ_UNSET)
5100 {
5101 if (subfirstbyte >= 0)
5102 {
5103 firstbyte = subfirstbyte;
5104 groupsetfirstbyte = TRUE;
5105 }
5106 else firstbyte = REQ_NONE;
5107 zerofirstbyte = REQ_NONE;
5108 }
5109
5110 /* If firstbyte was previously set, convert the subpattern's firstbyte
5111 into reqbyte if there wasn't one, using the vary flag that was in
5112 existence beforehand. */
5113
5114 else if (subfirstbyte >= 0 && subreqbyte < 0)
5115 subreqbyte = subfirstbyte | tempreqvary;
5116
5117 /* If the subpattern set a required byte (or set a first byte that isn't
5118 really the first byte - see above), set it. */
5119
5120 if (subreqbyte >= 0) reqbyte = subreqbyte;
5121 }
5122
5123 /* For a forward assertion, we take the reqbyte, if set. This can be
5124 helpful if the pattern that follows the assertion doesn't set a different
5125 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5126 for an assertion, however because it leads to incorrect effect for patterns
5127 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5128 of a firstbyte. This is overcome by a scan at the end if there's no
5129 firstbyte, looking for an asserted first char. */
5130
5131 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5132 break; /* End of processing '(' */
5133
5134
5135 /* ===================================================================*/
5136 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5137 are arranged to be the negation of the corresponding OP_values. For the
5138 back references, the values are ESC_REF plus the reference number. Only
5139 back references and those types that consume a character may be repeated.
5140 We can test for values between ESC_b and ESC_Z for the latter; this may
5141 have to change if any new ones are ever created. */
5142
5143 case '\\':
5144 tempptr = ptr;
5145 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5146 if (*errorcodeptr != 0) goto FAILED;
5147
5148 if (c < 0)
5149 {
5150 if (-c == ESC_Q) /* Handle start of quoted string */
5151 {
5152 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
5153 else inescq = TRUE;
5154 continue;
5155 }
5156
5157 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
5158
5159 /* For metasequences that actually match a character, we disable the
5160 setting of a first character if it hasn't already been set. */
5161
5162 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5163 firstbyte = REQ_NONE;
5164
5165 /* Set values to reset to if this is followed by a zero repeat. */
5166
5167 zerofirstbyte = firstbyte;
5168 zeroreqbyte = reqbyte;
5169
5170 /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5171 is a subroutine call by number (Oniguruma syntax). In fact, the value
5172 -ESC_g is returned only for these cases. So we don't need to check for <
5173 or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5174 -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5175 that is a synonym for a named back reference). */
5176
5177 if (-c == ESC_g)
5178 {
5179 const uschar *p;
5180 save_hwm = cd->hwm; /* Normally this is set when '(' is read */
5181 terminator = (*(++ptr) == '<')? '>' : '\'';
5182
5183 /* These two statements stop the compiler from warning about possibly
5184 unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5185 fact, because we actually check for a number below, the paths that
5186 would actually be in error are never taken. */
5187
5188 skipbytes = 0;
5189 reset_bracount = FALSE;
5190
5191 /* Test for a name */
5192
5193 if (ptr[1] != '+' && ptr[1] != '-')
5194 {
5195 BOOL isnumber = TRUE;
5196 for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5197 {
5198 if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5199 if ((cd->ctypes[*p] & ctype_word) == 0) break;
5200 }
5201 if (*p != terminator)
5202 {
5203 *errorcodeptr = ERR57;
5204 break;
5205 }
5206 if (isnumber)
5207 {
5208 ptr++;
5209 goto HANDLE_NUMERICAL_RECURSION;
5210 }
5211 is_recurse = TRUE;
5212 goto NAMED_REF_OR_RECURSE;
5213 }
5214
5215 /* Test a signed number in angle brackets or quotes. */
5216
5217 p = ptr + 2;
5218 while ((digitab[*p] & ctype_digit) != 0) p++;
5219 if (*p != terminator)
5220 {
5221 *errorcodeptr = ERR57;
5222 break;
5223 }
5224 ptr++;
5225 goto HANDLE_NUMERICAL_RECURSION;
5226 }
5227
5228 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5229 We also support \k{name} (.NET syntax) */
5230
5231 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
5232 {
5233 is_recurse = FALSE;
5234 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
5235 goto NAMED_REF_OR_RECURSE;
5236 }
5237
5238 /* Back references are handled specially; must disable firstbyte if
5239 not set to cope with cases like (?=(\w+))\1: which would otherwise set
5240 ':' later. */
5241
5242 if (-c >= ESC_REF)
5243 {
5244 recno = -c - ESC_REF;
5245
5246 HANDLE_REFERENCE: /* Come here from named backref handling */
5247 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5248 previous = code;
5249 *code++ = OP_REF;
5250 PUT2INC(code, 0, recno);
5251 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5252 if (recno > cd->top_backref) cd->top_backref = recno;
5253 }
5254
5255 /* So are Unicode property matches, if supported. */
5256
5257 #ifdef SUPPORT_UCP
5258 else if (-c == ESC_P || -c == ESC_p)
5259 {
5260 BOOL negated;
5261 int pdata;
5262 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5263 if (ptype < 0) goto FAILED;
5264 previous = code;
5265 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5266 *code++ = ptype;
5267 *code++ = pdata;
5268 }
5269 #else
5270
5271 /* If Unicode properties are not supported, \X, \P, and \p are not
5272 allowed. */
5273
5274 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5275 {
5276 *errorcodeptr = ERR45;
5277 goto FAILED;
5278 }
5279 #endif
5280
5281 /* For the rest (including \X when Unicode properties are supported), we
5282 can obtain the OP value by negating the escape value. */
5283
5284 else
5285 {
5286 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5287 *code++ = -c;
5288 }
5289 continue;
5290 }
5291
5292 /* We have a data character whose value is in c. In UTF-8 mode it may have
5293 a value > 127. We set its representation in the length/buffer, and then
5294 handle it as a data character. */
5295
5296 #ifdef SUPPORT_UTF8
5297 if (utf8 && c > 127)
5298 mclength = _pcre_ord2utf8(c, mcbuffer);
5299 else
5300 #endif
5301
5302 {
5303 mcbuffer[0] = c;
5304 mclength = 1;
5305 }
5306 goto ONE_CHAR;
5307
5308
5309 /* ===================================================================*/
5310 /* Handle a literal character. It is guaranteed not to be whitespace or #
5311 when the extended flag is set. If we are in UTF-8 mode, it may be a
5312 multi-byte literal character. */
5313
5314 default:
5315 NORMAL_CHAR:
5316 mclength = 1;
5317 mcbuffer[0] = c;
5318
5319 #ifdef SUPPORT_UTF8
5320 if (utf8 && c >= 0xc0)
5321 {
5322 while ((ptr[1] & 0xc0) == 0x80)
5323 mcbuffer[mclength++] = *(++ptr);
5324 }
5325 #endif
5326
5327 /* At this point we have the character's bytes in mcbuffer, and the length
5328 in mclength. When not in UTF-8 mode, the length is always 1. */
5329
5330 ONE_CHAR:
5331 previous = code;
5332 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5333 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5334
5335 /* Remember if \r or \n were seen */
5336
5337 if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')
5338 cd->external_flags |= PCRE_HASCRORLF;
5339
5340 /* Set the first and required bytes appropriately. If no previous first
5341 byte, set it from this character, but revert to none on a zero repeat.
5342 Otherwise, leave the firstbyte value alone, and don't change it on a zero
5343 repeat. */
5344
5345 if (firstbyte == REQ_UNSET)
5346 {
5347 zerofirstbyte = REQ_NONE;
5348 zeroreqbyte = reqbyte;
5349
5350 /* If the character is more than one byte long, we can set firstbyte
5351 only if it is not to be matched caselessly. */
5352
5353 if (mclength == 1 || req_caseopt == 0)
5354 {
5355 firstbyte = mcbuffer[0] | req_caseopt;
5356 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5357 }
5358 else firstbyte = reqbyte = REQ_NONE;
5359 }
5360
5361 /* firstbyte was previously set; we can set reqbyte only if the length is
5362 1 or the matching is caseful. */
5363
5364 else
5365 {
5366 zerofirstbyte = firstbyte;
5367 zeroreqbyte = reqbyte;
5368 if (mclength == 1 || req_caseopt == 0)
5369 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5370 }
5371
5372 break; /* End of literal character handling */
5373 }
5374 } /* end of big loop */
5375
5376
5377 /* Control never reaches here by falling through, only by a goto for all the
5378 error states. Pass back the position in the pattern so that it can be displayed
5379 to the user for diagnosing the error. */
5380
5381 FAILED:
5382 *ptrptr = ptr;
5383 return FALSE;
5384 }
5385
5386
5387
5388
5389 /*************************************************
5390 * Compile sequence of alternatives *
5391 *************************************************/
5392
5393 /* On entry, ptr is pointing past the bracket character, but on return it
5394 points to the closing bracket, or vertical bar, or end of string. The code
5395 variable is pointing at the byte into which the BRA operator has been stored.
5396 If the ims options are changed at the start (for a (?ims: group) or during any
5397 branch, we need to insert an OP_OPT item at the start of every following branch
5398 to ensure they get set correctly at run time, and also pass the new options
5399 into every subsequent branch compile.
5400
5401 This function is used during the pre-compile phase when we are trying to find
5402 out the amount of memory needed, as well as during the real compile phase. The
5403 value of lengthptr distinguishes the two phases.
5404
5405 Arguments:
5406 options option bits, including any changes for this subpattern
5407 oldims previous settings of ims option bits
5408 codeptr -> the address of the current code pointer
5409 ptrptr -> the address of the current pattern pointer
5410 errorcodeptr -> pointer to error code variable
5411 lookbehind TRUE if this is a lookbehind assertion
5412 reset_bracount TRUE to reset the count for each branch
5413 skipbytes skip this many bytes at start (for brackets and OP_COND)
5414 firstbyteptr place to put the first required character, or a negative number
5415 reqbyteptr place to put the last required character, or a negative number
5416 bcptr pointer to the chain of currently open branches
5417 cd points to the data block with tables pointers etc.
5418 lengthptr NULL during the real compile phase
5419 points to length accumulator during pre-compile phase
5420
5421 Returns: TRUE on success
5422 */
5423
5424 static BOOL
5425 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5426 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5427 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5428 int *lengthptr)
5429 {
5430 const uschar *ptr = *ptrptr;
5431 uschar *code = *codeptr;
5432 uschar *last_branch = code;
5433 uschar *start_bracket = code;
5434 uschar *reverse_count = NULL;
5435 int firstbyte, reqbyte;
5436 int branchfirstbyte, branchreqbyte;
5437 int length;
5438 int orig_bracount;
5439 int max_bracount;
5440 branch_chain bc;
5441
5442 bc.outer = bcptr;
5443 bc.current = code;
5444
5445 firstbyte = reqbyte = REQ_UNSET;
5446
5447 /* Accumulate the length for use in the pre-compile phase. Start with the
5448 length of the BRA and KET and any extra bytes that are required at the
5449 beginning. We accumulate in a local variable to save frequent testing of
5450 lenthptr for NULL. We cannot do this by looking at the value of code at the
5451 start and end of each alternative, because compiled items are discarded during
5452 the pre-compile phase so that the work space is not exceeded. */
5453
5454 length = 2 + 2*LINK_SIZE + skipbytes;
5455
5456 /* WARNING: If the above line is changed for any reason, you must also change
5457 the code that abstracts option settings at the start of the pattern and makes
5458 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5459 pre-compile phase to find out whether anything has yet been compiled or not. */
5460
5461 /* Offset is set zero to mark that this bracket is still open */
5462
5463 PUT(code, 1, 0);
5464 code += 1 + LINK_SIZE + skipbytes;
5465
5466 /* Loop for each alternative branch */
5467
5468 orig_bracount = max_bracount = cd->bracount;
5469 for (;;)
5470 {
5471 /* For a (?| group, reset the capturing bracket count so that each branch
5472 uses the same numbers. */
5473
5474 if (reset_bracount) cd->bracount = orig_bracount;
5475
5476 /* Handle a change of ims options at the start of the branch */
5477
5478 if ((options & PCRE_IMS) != oldims)
5479 {
5480 *code++ = OP_OPT;
5481 *code++ = options & PCRE_IMS;
5482 length += 2;
5483 }
5484
5485 /* Set up dummy OP_REVERSE if lookbehind assertion */
5486
5487 if (lookbehind)
5488 {
5489 *code++ = OP_REVERSE;
5490 reverse_count = code;
5491 PUTINC(code, 0, 0);
5492 length += 1 + LINK_SIZE;
5493 }
5494
5495 /* Now compile the branch; in the pre-compile phase its length gets added
5496 into the length. */
5497
5498 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5499 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5500 {
5501 *ptrptr = ptr;
5502 return FALSE;
5503 }
5504
5505 /* Keep the highest bracket count in case (?| was used and some branch
5506 has fewer than the rest. */
5507
5508 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5509
5510 /* In the real compile phase, there is some post-processing to be done. */
5511
5512 if (lengthptr == NULL)
5513 {
5514 /* If this is the first branch, the firstbyte and reqbyte values for the
5515 branch become the values for the regex. */
5516
5517 if (*last_branch != OP_ALT)
5518 {
5519 firstbyte = branchfirstbyte;
5520 reqbyte = branchreqbyte;
5521 }
5522
5523 /* If this is not the first branch, the first char and reqbyte have to
5524 match the values from all the previous branches, except that if the
5525 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5526 and we set REQ_VARY for the regex. */
5527
5528 else
5529 {
5530 /* If we previously had a firstbyte, but it doesn't match the new branch,
5531 we have to abandon the firstbyte for the regex, but if there was
5532 previously no reqbyte, it takes on the value of the old firstbyte. */
5533
5534 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5535 {
5536 if (reqbyte < 0) reqbyte = firstbyte;
5537 firstbyte = REQ_NONE;
5538 }
5539
5540 /* If we (now or from before) have no firstbyte, a firstbyte from the
5541 branch becomes a reqbyte if there isn't a branch reqbyte. */
5542
5543 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5544 branchreqbyte = branchfirstbyte;
5545
5546 /* Now ensure that the reqbytes match */
5547
5548 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5549 reqbyte = REQ_NONE;
5550 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5551 }
5552
5553 /* If lookbehind, check that this branch matches a fixed-length string, and
5554 put the length into the OP_REVERSE item. Temporarily mark the end of the
5555 branch with OP_END. */
5556
5557 if (lookbehind)
5558 {
5559 int fixed_length;
5560 *code = OP_END;
5561 fixed_length = find_fixedlength(last_branch, options);
5562 DPRINTF(("fixed length = %d\n", fixed_length));
5563 if (fixed_length < 0)
5564 {
5565 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5566 *ptrptr = ptr;
5567 return FALSE;
5568 }
5569 PUT(reverse_count, 0, fixed_length);
5570 }
5571 }
5572
5573 /* Reached end of expression, either ')' or end of pattern. In the real
5574 compile phase, go back through the alternative branches and reverse the chain
5575 of offsets, with the field in the BRA item now becoming an offset to the
5576 first alternative. If there are no alternatives, it points to the end of the
5577 group. The length in the terminating ket is always the length of the whole
5578 bracketed item. If any of the ims options were changed inside the group,
5579 compile a resetting op-code following, except at the very end of the pattern.
5580 Return leaving the pointer at the terminating char. */
5581
5582 if (*ptr != '|')
5583 {
5584 if (lengthptr == NULL)
5585 {
5586 int branch_length = code - last_branch;
5587 do
5588 {
5589 int prev_length = GET(last_branch, 1);
5590 PUT(last_branch, 1, branch_length);
5591 branch_length = prev_length;
5592 last_branch -= branch_length;
5593 }
5594 while (branch_length > 0);
5595 }
5596
5597 /* Fill in the ket */
5598
5599 *code = OP_KET;
5600 PUT(code, 1, code - start_bracket);
5601 code += 1 + LINK_SIZE;
5602
5603 /* Resetting option if needed */
5604
5605 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5606 {
5607 *code++ = OP_OPT;
5608 *code++ = oldims;
5609 length += 2;
5610 }
5611
5612 /* Retain the highest bracket number, in case resetting was used. */
5613
5614 cd->bracount = max_bracount;
5615
5616 /* Set values to pass back */
5617
5618 *codeptr = code;
5619 *ptrptr = ptr;
5620 *firstbyteptr = firstbyte;
5621 *reqbyteptr = reqbyte;
5622 if (lengthptr != NULL)
5623 {
5624 if (OFLOW_MAX - *lengthptr < length)
5625 {
5626 *errorcodeptr = ERR20;
5627 return FALSE;
5628 }
5629 *lengthptr += length;
5630 }
5631 return TRUE;
5632 }
5633
5634 /* Another branch follows. In the pre-compile phase, we can move the code
5635 pointer back to where it was for the start of the first branch. (That is,
5636 pretend that each branch is the only one.)
5637
5638 In the real compile phase, insert an ALT node. Its length field points back
5639 to the previous branch while the bracket remains open. At the end the chain
5640 is reversed. It's done like this so that the start of the bracket has a
5641 zero offset until it is closed, making it possible to detect recursion. */
5642
5643 if (lengthptr != NULL)
5644 {
5645 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5646 length += 1 + LINK_SIZE;
5647 }
5648 else
5649 {
5650 *code = OP_ALT;
5651 PUT(code, 1, code - last_branch);
5652 bc.current = last_branch = code;
5653 code += 1 + LINK_SIZE;
5654 }
5655
5656 ptr++;
5657 }
5658 /* Control never reaches here */
5659 }
5660
5661
5662
5663
5664 /*************************************************
5665 * Check for anchored expression *
5666 *************************************************/
5667
5668 /* Try to find out if this is an anchored regular expression. Consider each
5669 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5670 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5671 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5672 counts, since OP_CIRC can match in the middle.
5673
5674 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5675 This is the code for \G, which means "match at start of match position, taking
5676 into account the match offset".
5677
5678 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5679 because that will try the rest of the pattern at all possible matching points,
5680 so there is no point trying again.... er ....
5681
5682 .... except when the .* appears inside capturing parentheses, and there is a
5683 subsequent back reference to those parentheses. We haven't enough information
5684 to catch that case precisely.
5685
5686 At first, the best we could do was to detect when .* was in capturing brackets
5687 and the highest back reference was greater than or equal to that level.
5688 However, by keeping a bitmap of the first 31 back references, we can catch some
5689 of the more common cases more precisely.
5690
5691 Arguments:
5692 code points to start of expression (the bracket)
5693 options points to the options setting
5694 bracket_map a bitmap of which brackets we are inside while testing; this
5695 handles up to substring 31; after that we just have to take
5696 the less precise approach
5697 backref_map the back reference bitmap
5698
5699 Returns: TRUE or FALSE
5700 */
5701
5702 static BOOL
5703 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5704 unsigned int backref_map)
5705 {
5706 do {
5707 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5708 options, PCRE_MULTILINE, FALSE);
5709 register int op = *scode;
5710
5711 /* Non-capturing brackets */
5712
5713 if (op == OP_BRA)
5714 {
5715 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5716 }
5717
5718 /* Capturing brackets */
5719
5720 else if (op == OP_CBRA)
5721 {
5722 int n = GET2(scode, 1+LINK_SIZE);
5723 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5724 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5725 }
5726
5727 /* Other brackets */
5728
5729 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5730 {
5731 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5732 }
5733
5734 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5735 are or may be referenced. */
5736
5737 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5738 op == OP_TYPEPOSSTAR) &&
5739 (*options & PCRE_DOTALL) != 0)
5740 {
5741 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5742 }
5743
5744 /* Check for explicit anchoring */
5745
5746 else if (op != OP_SOD && op != OP_SOM &&
5747 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5748 return FALSE;
5749 code += GET(code, 1);
5750 }
5751 while (*code == OP_ALT); /* Loop for each alternative */
5752 return TRUE;
5753 }
5754
5755
5756
5757 /*************************************************
5758 * Check for starting with ^ or .* *
5759 *************************************************/
5760
5761 /* This is called to find out if every branch starts with ^ or .* so that
5762 "first char" processing can be done to speed things up in multiline
5763 matching and for non-DOTALL patterns that start with .* (which must start at
5764 the beginning or after \n). As in the case of is_anchored() (see above), we
5765 have to take account of back references to capturing brackets that contain .*
5766 because in that case we can't make the assumption.
5767
5768 Arguments:
5769 code points to start of expression (the bracket)
5770 bracket_map a bitmap of which brackets we are inside while testing; this
5771 handles up to substring 31; after that we just have to take
5772 the less precise approach
5773 backref_map the back reference bitmap
5774
5775 Returns: TRUE or FALSE
5776 */
5777
5778 static BOOL
5779 is_startline(const uschar *code, unsigned int bracket_map,
5780 unsigned int backref_map)
5781 {
5782 do {
5783 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5784 NULL, 0, FALSE);
5785 register int op = *scode;
5786
5787 /* Non-capturing brackets */
5788
5789 if (op == OP_BRA)
5790 {
5791 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5792 }
5793
5794 /* Capturing brackets */
5795
5796 else if (op == OP_CBRA)
5797 {
5798 int n = GET2(scode, 1+LINK_SIZE);
5799 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5800 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5801 }
5802
5803 /* Other brackets */
5804
5805 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5806 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5807
5808 /* .* means "start at start or after \n" if it isn't in brackets that
5809 may be referenced. */
5810
5811 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5812 {
5813 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5814 }
5815
5816 /* Check for explicit circumflex */
5817
5818 else if (op != OP_CIRC) return FALSE;
5819
5820 /* Move on to the next alternative */
5821
5822 code += GET(code, 1);
5823 }
5824 while (*code == OP_ALT); /* Loop for each alternative */
5825 return TRUE;
5826 }
5827
5828
5829
5830 /*************************************************
5831 * Check for asserted fixed first char *
5832 *************************************************/
5833
5834 /* During compilation, the "first char" settings from forward assertions are
5835 discarded, because they can cause conflicts with actual literals that follow.
5836 However, if we end up without a first char setting for an unanchored pattern,
5837 it is worth scanning the regex to see if there is an initial asserted first
5838 char. If all branches start with the same asserted char, or with a bracket all
5839 of whose alternatives start with the same asserted char (recurse ad lib), then
5840 we return that char, otherwise -1.
5841
5842 Arguments:
5843 code points to start of expression (the bracket)
5844 options pointer to the options (used to check casing changes)
5845 inassert TRUE if in an assertion
5846
5847 Returns: -1 or the fixed first char
5848 */
5849
5850 static int
5851 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5852 {
5853 register int c = -1;
5854 do {
5855 int d;
5856 const uschar *scode =
5857 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5858 register int op = *scode;
5859
5860 switch(op)
5861 {
5862 default:
5863 return -1;
5864
5865 case OP_BRA:
5866 case OP_CBRA:
5867 case OP_ASSERT:
5868 case OP_ONCE:
5869 case OP_COND:
5870 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5871 return -1;
5872 if (c < 0) c = d; else if (c != d) return -1;
5873 break;
5874
5875 case OP_EXACT: /* Fall through */
5876 scode += 2;
5877
5878 case OP_CHAR:
5879 case OP_CHARNC:
5880 case OP_PLUS:
5881 case OP_MINPLUS:
5882 case OP_POSPLUS:
5883 if (!inassert) return -1;
5884 if (c < 0)
5885 {
5886 c = scode[1];
5887 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5888 }
5889 else if (c != scode[1]) return -1;
5890 break;
5891 }
5892
5893 code += GET(code, 1);
5894 }
5895 while (*code == OP_ALT);
5896 return c;
5897 }
5898
5899
5900
5901 /*************************************************
5902 * Compile a Regular Expression *
5903 *************************************************/
5904
5905 /* This function takes a string and returns a pointer to a block of store
5906 holding a compiled version of the expression. The original API for this
5907 function had no error code return variable; it is retained for backwards
5908 compatibility. The new function is given a new name.
5909
5910 Arguments:
5911 pattern the regular expression
5912 options various option bits
5913 errorcodeptr pointer to error code variable (pcre_compile2() only)
5914 can be NULL if you don't want a code value
5915 errorptr pointer to pointer to error text
5916 erroroffset ptr offset in pattern where error was detected
5917 tables pointer to character tables or NULL
5918
5919 Returns: pointer to compiled data block, or NULL on error,
5920 with errorptr and erroroffset set
5921 */
5922
5923 PCRE_EXP_DEFN pcre *
5924 pcre_compile(const char *pattern, int options, const char **errorptr,
5925 int *erroroffset, const unsigned char *tables)
5926 {
5927 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5928 }
5929
5930
5931 PCRE_EXP_DEFN pcre *
5932 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5933 const char **errorptr, int *erroroffset, const unsigned char *tables)
5934 {
5935 real_pcre *re;
5936 int length = 1; /* For final END opcode */
5937 int firstbyte, reqbyte, newline;
5938 int errorcode = 0;
5939 int skipatstart = 0;
5940 #ifdef SUPPORT_UTF8
5941 BOOL utf8;
5942 #endif
5943 size_t size;
5944 uschar *code;
5945 const uschar *codestart;
5946 const uschar *ptr;
5947 compile_data compile_block;
5948 compile_data *cd = &compile_block;
5949
5950 /* This space is used for "compiling" into during the first phase, when we are
5951 computing the amount of memory that is needed. Compiled items are thrown away
5952 as soon as possible, so that a fairly large buffer should be sufficient for
5953 this purpose. The same space is used in the second phase for remembering where
5954 to fill in forward references to subpatterns. */
5955
5956 uschar cworkspace[COMPILE_WORK_SIZE];
5957
5958 /* Set this early so that early errors get offset 0. */
5959
5960 ptr = (const uschar *)pattern;
5961
5962 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5963 can do is just return NULL, but we can set a code value if there is a code
5964 pointer. */
5965
5966 if (errorptr == NULL)
5967 {
5968 if (errorcodeptr != NULL) *errorcodeptr = 99;
5969 return NULL;
5970 }
5971
5972 *errorptr = NULL;
5973 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5974
5975 /* However, we can give a message for this error */
5976
5977 if (erroroffset == NULL)
5978 {
5979 errorcode = ERR16;
5980 goto PCRE_EARLY_ERROR_RETURN2;
5981 }
5982
5983 *erroroffset = 0;
5984
5985 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5986
5987 #ifdef SUPPORT_UTF8
5988 utf8 = (options & PCRE_UTF8) != 0;
5989 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5990 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5991 {
5992 errorcode = ERR44;
5993 goto PCRE_EARLY_ERROR_RETURN2;
5994 }
5995 #else
5996 if ((options & PCRE_UTF8) != 0)
5997 {
5998 errorcode = ERR32;
5999 goto PCRE_EARLY_ERROR_RETURN;
6000 }
6001 #endif
6002
6003 if ((options & ~PUBLIC_OPTIONS) != 0)
6004 {
6005 errorcode = ERR17;
6006 goto PCRE_EARLY_ERROR_RETURN;
6007 }
6008
6009 /* Set up pointers to the individual character tables */
6010
6011 if (tables == NULL) tables = _pcre_default_tables;
6012 cd->lcc = tables + lcc_offset;
6013 cd->fcc = tables + fcc_offset;
6014 cd->cbits = tables + cbits_offset;
6015 cd->ctypes = tables + ctypes_offset;
6016
6017 /* Check for global one-time settings at the start of the pattern, and remember
6018 the offset for later. */
6019
6020 while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')
6021 {
6022 int newnl = 0;
6023 int newbsr = 0;
6024
6025 if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0)
6026 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
6027 else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3) == 0)
6028 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
6029 else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5) == 0)
6030 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
6031 else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0)
6032 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
6033 else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8) == 0)
6034 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
6035
6036 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0)
6037 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
6038 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0)
6039 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
6040
6041 if (newnl != 0)
6042 options = (options & ~PCRE_NEWLINE_BITS) | newnl;
6043 else if (newbsr != 0)
6044 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
6045 else break;
6046 }
6047
6048 /* Check validity of \R options. */
6049
6050 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6051 {
6052 case 0:
6053 case PCRE_BSR_ANYCRLF:
6054 case PCRE_BSR_UNICODE:
6055 break;
6056 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6057 }
6058
6059 /* Handle different types of newline. The three bits give seven cases. The
6060 current code allows for fixed one- or two-byte sequences, plus "any" and
6061 "anycrlf". */
6062
6063 switch (options & PCRE_NEWLINE_BITS)
6064 {
6065 case 0: newline = NEWLINE; break; /* Build-time default */
6066 case PCRE_NEWLINE_CR: newline = '\r'; break;
6067 case PCRE_NEWLINE_LF: newline = '\n'; break;
6068 case PCRE_NEWLINE_CR+
6069 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
6070 case PCRE_NEWLINE_ANY: newline = -1; break;
6071 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6072 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6073 }
6074
6075 if (newline == -2)
6076 {
6077 cd->nltype = NLTYPE_ANYCRLF;
6078 }
6079 else if (newline < 0)
6080 {
6081 cd->nltype = NLTYPE_ANY;
6082 }
6083 else
6084 {
6085 cd->nltype = NLTYPE_FIXED;
6086 if (newline > 255)
6087 {
6088 cd->nllen = 2;
6089 cd->nl[0] = (newline >> 8) & 255;
6090 cd->nl[1] = newline & 255;
6091 }
6092 else
6093 {
6094 cd->nllen = 1;
6095 cd->nl[0] = newline;
6096 }
6097 }
6098
6099 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
6100 references to help in deciding whether (.*) can be treated as anchored or not.
6101 */
6102
6103 cd->top_backref = 0;
6104 cd->backref_map = 0;
6105
6106 /* Reflect pattern for debugging output */
6107
6108 DPRINTF(("------------------------------------------------------------------\n"));
6109 DPRINTF(("%s\n", pattern));
6110
6111 /* Pretend to compile the pattern while actually just accumulating the length
6112 of memory required. This behaviour is triggered by passing a non-NULL final
6113 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
6114 to compile parts of the pattern into; the compiled code is discarded when it is
6115 no longer needed, so hopefully this workspace will never overflow, though there
6116 is a test for its doing so. */
6117
6118 cd->bracount = cd->final_bracount = 0;
6119 cd->names_found = 0;
6120 cd->name_entry_size = 0;
6121 cd->name_table = NULL;
6122 cd->start_workspace = cworkspace;
6123 cd->start_code = cworkspace;
6124 cd->hwm = cworkspace;
6125 cd->start_pattern = (const uschar *)pattern;
6126 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
6127 cd->req_varyopt = 0;
6128 cd->external_options = options;
6129 cd->external_flags = 0;
6130
6131 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
6132 don't need to look at the result of the function here. The initial options have
6133 been put into the cd block so that they can be changed if an option setting is
6134 found within the regex right at the beginning. Bringing initial option settings
6135 outside can help speed up starting point checks. */
6136
6137 ptr += skipatstart;
6138 code = cworkspace;
6139 *code = OP_BRA;
6140 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
6141 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
6142 &length);
6143 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
6144
6145 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
6146 cd->hwm - cworkspace));
6147
6148 if (length > MAX_PATTERN_SIZE)
6149 {
6150 errorcode = ERR20;
6151 goto PCRE_EARLY_ERROR_RETURN;
6152 }
6153
6154 /* Compute the size of data block needed and get it, either from malloc or
6155 externally provided function. Integer overflow should no longer be possible
6156 because nowadays we limit the maximum value of cd->names_found and
6157 cd->name_entry_size. */
6158
6159 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
6160 re = (real_pcre *)(pcre_malloc)(size);
6161
6162 if (re == NULL)
6163 {
6164 errorcode = ERR21;
6165 goto PCRE_EARLY_ERROR_RETURN;
6166 }
6167
6168 /* Put in the magic number, and save the sizes, initial options, internal
6169 flags, and character table pointer. NULL is used for the default character
6170 tables. The nullpad field is at the end; it's there to help in the case when a
6171 regex compiled on a system with 4-byte pointers is run on another with 8-byte
6172 pointers. */
6173
6174 re->magic_number = MAGIC_NUMBER;
6175 re->size = size;
6176 re->options = cd->external_options;
6177 re->flags = cd->external_flags;
6178 re->dummy1 = 0;
6179 re->first_byte = 0;
6180 re->req_byte = 0;
6181 re->name_table_offset = sizeof(real_pcre);
6182 re->name_entry_size = cd->name_entry_size;
6183 re->name_count = cd->names_found;
6184 re->ref_count = 0;
6185 re->tables = (tables == _pcre_default_tables)? NULL : tables;
6186 re->nullpad = NULL;
6187
6188 /* The starting points of the name/number translation table and of the code are
6189 passed around in the compile data block. The start/end pattern and initial
6190 options are already set from the pre-compile phase, as is the name_entry_size
6191 field. Reset the bracket count and the names_found field. Also reset the hwm
6192 field; this time it's used for remembering forward references to subpatterns.
6193 */
6194
6195 cd->final_bracount = cd->bracount; /* Save for checking forward references */
6196 cd->bracount = 0;
6197 cd->names_found = 0;
6198 cd->name_table = (uschar *)re + re->name_table_offset;
6199 codestart = cd->name_table + re->name_entry_size * re->name_count;
6200 cd->start_code = codestart;
6201 cd->hwm = cworkspace;
6202 cd->req_varyopt = 0;
6203 cd->had_accept = FALSE;
6204
6205 /* Set up a starting, non-extracting bracket, then compile the expression. On
6206 error, errorcode will be set non-zero, so we don't need to look at the result
6207 of the function here. */
6208
6209 ptr = (const uschar *)pattern + skipatstart;
6210 code = (uschar *)codestart;
6211 *code = OP_BRA;
6212 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6213 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6214 re->top_bracket = cd->bracount;
6215 re->top_backref = cd->top_backref;
6216 re->flags = cd->external_flags;
6217
6218 if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
6219
6220 /* If not reached end of pattern on success, there's an excess bracket. */
6221
6222 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6223
6224 /* Fill in the terminating state and check for disastrous overflow, but
6225 if debugging, leave the test till after things are printed out. */
6226
6227 *code++ = OP_END;
6228
6229 #ifndef DEBUG
6230 if (code - codestart > length) errorcode = ERR23;
6231 #endif
6232
6233 /* Fill in any forward references that are required. */
6234
6235 while (errorcode == 0 && cd->hwm > cworkspace)
6236 {
6237 int offset, recno;
6238 const uschar *groupptr;
6239 cd->hwm -= LINK_SIZE;
6240 offset = GET(cd->hwm, 0);
6241 recno = GET(codestart, offset);
6242 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
6243 if (groupptr == NULL) errorcode = ERR53;
6244 else PUT(((uschar *)codestart), offset, groupptr - codestart);
6245 }
6246
6247 /* Give an error if there's a back reference to a non-existent capturing
6248 subpattern. */
6249
6250 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6251
6252 /* Failed to compile, or error while post-processing */
6253
6254 if (errorcode != 0)
6255 {
6256 (pcre_free)(re);
6257 PCRE_EARLY_ERROR_RETURN:
6258 *erroroffset = ptr - (const uschar *)pattern;
6259 PCRE_EARLY_ERROR_RETURN2:
6260 *errorptr = find_error_text(errorcode);
6261 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6262 return NULL;
6263 }
6264
6265 /* If the anchored option was not passed, set the flag if we can determine that
6266 the pattern is anchored by virtue of ^ characters or \A or anything else (such
6267 as starting with .* when DOTALL is set).
6268
6269 Otherwise, if we know what the first byte has to be, save it, because that
6270 speeds up unanchored matches no end. If not, see if we can set the
6271 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6272 start with ^, and also when all branches start with .* for non-DOTALL matches.
6273 */
6274
6275 if ((re->options & PCRE_ANCHORED) == 0)
6276 {
6277 int temp_options = re->options; /* May get changed during these scans */
6278 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
6279 re->options |= PCRE_ANCHORED;
6280 else
6281 {
6282 if (firstbyte < 0)
6283 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
6284 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
6285 {
6286 int ch = firstbyte & 255;
6287 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6288 cd->fcc[ch] == ch)? ch : firstbyte;
6289 re->flags |= PCRE_FIRSTSET;
6290 }
6291 else if (is_startline(codestart, 0, cd->backref_map))
6292 re->flags |= PCRE_STARTLINE;
6293 }
6294 }
6295
6296 /* For an anchored pattern, we use the "required byte" only if it follows a
6297 variable length item in the regex. Remove the caseless flag for non-caseable
6298 bytes. */
6299
6300 if (reqbyte >= 0 &&
6301 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6302 {
6303 int ch = reqbyte & 255;
6304 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6305 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6306 re->flags |= PCRE_REQCHSET;
6307 }
6308
6309 /* Print out the compiled data if debugging is enabled. This is never the
6310 case when building a production library. */
6311
6312 #ifdef DEBUG
6313
6314 printf("Length = %d top_bracket = %d top_backref = %d\n",
6315 length, re->top_bracket, re->top_backref);
6316
6317 printf("Options=%08x\n", re->options);
6318
6319 if ((re->flags & PCRE_FIRSTSET) != 0)
6320 {
6321 int ch = re->first_byte & 255;
6322 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6323 "" : " (caseless)";
6324 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
6325 else printf("First char = \\x%02x%s\n", ch, caseless);
6326 }
6327
6328 if ((re->flags & PCRE_REQCHSET) != 0)
6329 {
6330 int ch = re->req_byte & 255;
6331 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6332 "" : " (caseless)";
6333 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
6334 else printf("Req char = \\x%02x%s\n", ch, caseless);
6335 }
6336
6337 pcre_printint(re, stdout, TRUE);
6338
6339 /* This check is done here in the debugging case so that the code that
6340 was compiled can be seen. */
6341
6342 if (code - codestart > length)
6343 {
6344 (pcre_free)(re);
6345 *errorptr = find_error_text(ERR23);
6346 *erroroffset = ptr - (uschar *)pattern;
6347 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6348 return NULL;
6349 }
6350 #endif /* DEBUG */
6351
6352 return (pcre *)re;
6353 }
6354
6355 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5