/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 341 - (show annotations)
Sat Apr 19 16:41:04 2008 UTC (7 years, 3 months ago) by ph10
File MIME type: text/plain
File size: 202840 byte(s)
Error occurred while calculating annotation data.
Fix DFA (?!) bug; add support for JavaScript empty classes.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2008 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57 used by pcretest. DEBUG is not defined when building a production library. */
58
59 #ifdef DEBUG
60 #include "pcre_printint.src"
61 #endif
62
63
/* Macro for setting individual bits in class bitmaps: sets bit number b in
the byte-array bitmap a. Both arguments and the whole expansion are
parenthesized so that expression arguments such as SETBIT(map, c + 1) index
and shift correctly. Note that b is evaluated twice, so it must not have side
effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67
68 /* Maximum length value to check against when making sure that the integer that
69 holds the compiled pattern length does not overflow. We make it a bit less than
70 INT_MAX to allow for adding in group terminating bytes, so that we don't have
71 to check them every time. */
72
73 #define OFLOW_MAX (INT_MAX - 20)
74
75
76 /*************************************************
77 * Code parameters and static tables *
78 *************************************************/
79
80 /* This value specifies the size of stack workspace that is used during the
81 first pre-compile phase that determines how much memory is required. The regex
82 is partly compiled into this space, but the compiled parts are discarded as
83 soon as they can be, so that hopefully there will never be an overrun. The code
84 does, however, check for an overrun. The largest amount I've seen used is 218,
85 so this number is very generous.
86
87 The same workspace is used during the second, actual compile phase for
88 remembering forward references to groups so that they can be filled in at the
89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90 is 4 there is plenty of room. */
91
92 #define COMPILE_WORK_SIZE (4096)
93
94
95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96 are simple data values; negative values are for special things like \d and so
97 on. Zero means further processing is needed (for things like \x), or the escape
98 is invalid. */
99
100 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 static const short int escapes[] = {
102 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 0, 0, -ESC_z /* x - z */
112 };
113
114 #else /* This is the "abnormal" table for EBCDIC systems */
115 static const short int escapes[] = {
116 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139 };
140 #endif
141
142
143 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144 searched linearly. Put all the names into a single string, in order to reduce
145 the number of relocations when a shared library is dynamically linked. */
146
147 typedef struct verbitem {
148 int len;
149 int op;
150 } verbitem;
151
152 static const char verbnames[] =
153 "ACCEPT\0"
154 "COMMIT\0"
155 "F\0"
156 "FAIL\0"
157 "PRUNE\0"
158 "SKIP\0"
159 "THEN";
160
161 static const verbitem verbs[] = {
162 { 6, OP_ACCEPT },
163 { 6, OP_COMMIT },
164 { 1, OP_FAIL },
165 { 4, OP_FAIL },
166 { 5, OP_PRUNE },
167 { 4, OP_SKIP },
168 { 4, OP_THEN }
169 };
170
171 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
172
173
174 /* Tables of names of POSIX character classes and their lengths. The names are
175 now all in a single string, to reduce the number of relocations when a shared
176 library is dynamically loaded. The list of lengths is terminated by a zero
177 length entry. The first three must be alpha, lower, upper, as this is assumed
178 for handling case independence. */
179
180 static const char posix_names[] =
181 "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0"
182 "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0"
183 "word\0" "xdigit";
184
185 static const uschar posix_name_lengths[] = {
186 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
187
188 /* Table of class bit maps for each POSIX class. Each class is formed from a
189 base map, with an optional addition or removal of another map. Then, for some
190 classes, there is some additional tweaking: for [:blank:] the vertical space
191 characters are removed, and for [:alpha:] and [:alnum:] the underscore
192 character is removed. The triples in the table consist of the base map offset,
193 second map offset or -1 if no second map, and a non-negative value for map
194 addition or a negative value for map subtraction (if there are two maps). The
195 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
196 remove vertical space characters, 2 => remove underscore. */
197
198 static const int posix_class_maps[] = {
199 cbit_word, cbit_digit, -2, /* alpha */
200 cbit_lower, -1, 0, /* lower */
201 cbit_upper, -1, 0, /* upper */
202 cbit_word, -1, 2, /* alnum - word without underscore */
203 cbit_print, cbit_cntrl, 0, /* ascii */
204 cbit_space, -1, 1, /* blank - a GNU extension */
205 cbit_cntrl, -1, 0, /* cntrl */
206 cbit_digit, -1, 0, /* digit */
207 cbit_graph, -1, 0, /* graph */
208 cbit_print, -1, 0, /* print */
209 cbit_punct, -1, 0, /* punct */
210 cbit_space, -1, 0, /* space */
211 cbit_word, -1, 0, /* word - a Perl extension */
212 cbit_xdigit,-1, 0 /* xdigit */
213 };
214
215
216 #define STRING(a) # a
217 #define XSTRING(s) STRING(s)
218
219 /* The texts of compile-time error messages. These are "char *" because they
220 are passed to the outside world. Do not ever re-use any error number, because
221 they are documented. Always add a new error instead. Messages marked DEAD below
222 are no longer used. This used to be a table of strings, but in order to reduce
223 the number of relocations needed when a shared library is loaded dynamically,
224 it is now one long string. We cannot use a table of offsets, because the
225 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
226 simply count through to the one we want - this isn't a performance issue
227 because these strings are used only when there is a compilation error. */
228
229 static const char error_texts[] =
230 "no error\0"
231 "\\ at end of pattern\0"
232 "\\c at end of pattern\0"
233 "unrecognized character follows \\\0"
234 "numbers out of order in {} quantifier\0"
235 /* 5 */
236 "number too big in {} quantifier\0"
237 "missing terminating ] for character class\0"
238 "invalid escape sequence in character class\0"
239 "range out of order in character class\0"
240 "nothing to repeat\0"
241 /* 10 */
242 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
243 "internal error: unexpected repeat\0"
244 "unrecognized character after (? or (?-\0"
245 "POSIX named classes are supported only within a class\0"
246 "missing )\0"
247 /* 15 */
248 "reference to non-existent subpattern\0"
249 "erroffset passed as NULL\0"
250 "unknown option bit(s) set\0"
251 "missing ) after comment\0"
252 "parentheses nested too deeply\0" /** DEAD **/
253 /* 20 */
254 "regular expression is too large\0"
255 "failed to get memory\0"
256 "unmatched parentheses\0"
257 "internal error: code overflow\0"
258 "unrecognized character after (?<\0"
259 /* 25 */
260 "lookbehind assertion is not fixed length\0"
261 "malformed number or name after (?(\0"
262 "conditional group contains more than two branches\0"
263 "assertion expected after (?(\0"
264 "(?R or (?[+-]digits must be followed by )\0"
265 /* 30 */
266 "unknown POSIX class name\0"
267 "POSIX collating elements are not supported\0"
268 "this version of PCRE is not compiled with PCRE_UTF8 support\0"
269 "spare error\0" /** DEAD **/
270 "character value in \\x{...} sequence is too large\0"
271 /* 35 */
272 "invalid condition (?(0)\0"
273 "\\C not allowed in lookbehind assertion\0"
274 "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
275 "number after (?C is > 255\0"
276 "closing ) for (?C expected\0"
277 /* 40 */
278 "recursive call could loop indefinitely\0"
279 "unrecognized character after (?P\0"
280 "syntax error in subpattern name (missing terminator)\0"
281 "two named subpatterns have the same name\0"
282 "invalid UTF-8 string\0"
283 /* 45 */
284 "support for \\P, \\p, and \\X has not been compiled\0"
285 "malformed \\P or \\p sequence\0"
286 "unknown property name after \\P or \\p\0"
287 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
288 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
289 /* 50 */
290 "repeated subpattern is too long\0" /** DEAD **/
291 "octal value is greater than \\377 (not in UTF-8 mode)\0"
292 "internal error: overran compiling workspace\0"
293 "internal error: previously-checked referenced subpattern not found\0"
294 "DEFINE group contains more than one branch\0"
295 /* 55 */
296 "repeating a DEFINE group is not allowed\0"
297 "inconsistent NEWLINE options\0"
298 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
299 "a numbered reference must not be zero\0"
300 "(*VERB) with an argument is not supported\0"
301 /* 60 */
302 "(*VERB) not recognized\0"
303 "number is too big\0"
304 "subpattern name expected\0"
305 "digit expected after (?+\0"
306 "] is an invalid data character in JavaScript compatibility mode";
307
308
309 /* Table to identify digits and hex digits. This is used when compiling
310 patterns. Note that the tables in chartables are dependent on the locale, and
311 may mark arbitrary characters as digits - but the PCRE compiling code expects
312 to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
313 a private table here. It costs 256 bytes, but it is a lot faster than doing
314 character value tests (at least in some simple cases I timed), and in some
315 applications one wants PCRE to compile efficiently as well as match
316 efficiently.
317
318 For convenience, we use the same bit definitions as in chartables:
319
320 0x04 decimal digit
321 0x08 hexadecimal digit
322
323 Then we can use ctype_digit and ctype_xdigit in the code. */
324
325 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
326 static const unsigned char digitab[] =
327 {
328 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
329 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
330 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
331 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
332 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
333 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
334 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
335 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
336 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
337 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
338 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
339 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
340 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
341 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
342 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
343 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
344 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
345 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
346 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
347 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
348 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
349 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
350 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
351 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
352 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
353 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
354 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
355 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
356 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
357 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
358 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
359 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
360
361 #else /* This is the "abnormal" case, for EBCDIC systems */
362 static const unsigned char digitab[] =
363 {
364 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
365 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
366 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
367 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
368 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
369 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
370 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
371 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
372 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
373 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
374 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
375 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
376 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
377 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
378 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
379 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
380 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
381 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
382 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
383 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
384 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
385 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
386 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
387 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
388 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
389 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
390 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
391 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
392 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
393 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
394 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
395 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
396
397 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
398 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
399 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
400 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
401 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
402 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
403 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
404 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
405 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
406 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
407 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
408 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
409 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
410 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
411 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
412 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
413 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
414 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
415 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
416 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
417 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
418 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
419 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
420 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
421 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
422 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
423 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
424 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
425 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
426 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
427 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
428 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
429 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
430 #endif
431
432
433 /* Definition to allow mutual recursion */
434
435 static BOOL
436 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
437 int *, int *, branch_chain *, compile_data *, int *);
438
439
440
441 /*************************************************
442 * Find an error text *
443 *************************************************/
444
445 /* The error texts are now all in one long string, to save on relocations. As
446 some of the text is of unknown length, we can't use a table of offsets.
447 Instead, just count through the strings. This is not a performance issue
448 because it happens only when there has been a compilation error.
449
450 Argument: the error number
451 Returns: pointer to the error string
452 */
453
454 static const char *
455 find_error_text(int n)
456 {
457 const char *s = error_texts;
458 for (; n > 0; n--) while (*s++ != 0);
459 return s;
460 }
461
462
463 /*************************************************
464 * Handle escapes *
465 *************************************************/
466
467 /* This function is called when a \ has been encountered. It either returns a
468 positive value for a simple escape such as \n, or a negative value which
469 encodes one of the more complicated things such as \d. A backreference to group
470 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
471 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
472 ptr is pointing at the \. On exit, it is on the final character of the escape
473 sequence.
474
475 Arguments:
476 ptrptr points to the pattern position pointer
477 errorcodeptr points to the errorcode variable
478 bracount number of previous extracting brackets
479 options the options bits
480 isclass TRUE if inside a character class
481
482 Returns: zero or positive => a data character
483 negative => a special escape sequence
484 on error, errorcodeptr is set
485 */
486
487 static int
488 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
489 int options, BOOL isclass)
490 {
491 BOOL utf8 = (options & PCRE_UTF8) != 0;
492 const uschar *ptr = *ptrptr + 1;
493 int c, i;
494
495 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
496 ptr--; /* Set pointer back to the last byte */
497
498 /* If backslash is at the end of the pattern, it's an error. */
499
500 if (c == 0) *errorcodeptr = ERR1;
501
502 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
503 in a table. A non-zero result is something that can be returned immediately.
504 Otherwise further processing may be required. */
505
506 #ifndef EBCDIC /* ASCII coding */
507 else if (c < '0' || c > 'z') {} /* Not alphanumeric */
508 else if ((i = escapes[c - '0']) != 0) c = i;
509
510 #else /* EBCDIC coding */
511 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
512 else if ((i = escapes[c - 0x48]) != 0) c = i;
513 #endif
514
515 /* Escapes that need further processing, or are illegal. */
516
517 else
518 {
519 const uschar *oldptr;
520 BOOL braced, negated;
521
522 switch (c)
523 {
524 /* A number of Perl escapes are not handled by PCRE. We give an explicit
525 error. */
526
527 case 'l':
528 case 'L':
529 case 'N':
530 case 'u':
531 case 'U':
532 *errorcodeptr = ERR37;
533 break;
534
535 /* \g must be followed by one of a number of specific things:
536
537 (1) A number, either plain or braced. If positive, it is an absolute
538 backreference. If negative, it is a relative backreference. This is a Perl
539 5.10 feature.
540
541 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
542 is part of Perl's movement towards a unified syntax for back references. As
543 this is synonymous with \k{name}, we fudge it up by pretending it really
544 was \k.
545
546 (3) For Oniguruma compatibility we also support \g followed by a name or a
547 number either in angle brackets or in single quotes. However, these are
548 (possibly recursive) subroutine calls, _not_ backreferences. Just return
549 the -ESC_g code (cf \k). */
550
551 case 'g':
552 if (ptr[1] == '<' || ptr[1] == '\'')
553 {
554 c = -ESC_g;
555 break;
556 }
557
558 /* Handle the Perl-compatible cases */
559
560 if (ptr[1] == '{')
561 {
562 const uschar *p;
563 for (p = ptr+2; *p != 0 && *p != '}'; p++)
564 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
565 if (*p != 0 && *p != '}')
566 {
567 c = -ESC_k;
568 break;
569 }
570 braced = TRUE;
571 ptr++;
572 }
573 else braced = FALSE;
574
575 if (ptr[1] == '-')
576 {
577 negated = TRUE;
578 ptr++;
579 }
580 else negated = FALSE;
581
582 c = 0;
583 while ((digitab[ptr[1]] & ctype_digit) != 0)
584 c = c * 10 + *(++ptr) - '0';
585
586 if (c < 0) /* Integer overflow */
587 {
588 *errorcodeptr = ERR61;
589 break;
590 }
591
592 if (braced && *(++ptr) != '}')
593 {
594 *errorcodeptr = ERR57;
595 break;
596 }
597
598 if (c == 0)
599 {
600 *errorcodeptr = ERR58;
601 break;
602 }
603
604 if (negated)
605 {
606 if (c > bracount)
607 {
608 *errorcodeptr = ERR15;
609 break;
610 }
611 c = bracount - (c - 1);
612 }
613
614 c = -(ESC_REF + c);
615 break;
616
617 /* The handling of escape sequences consisting of a string of digits
618 starting with one that is not zero is not straightforward. By experiment,
619 the way Perl works seems to be as follows:
620
621 Outside a character class, the digits are read as a decimal number. If the
622 number is less than 10, or if there are that many previous extracting
623 left brackets, then it is a back reference. Otherwise, up to three octal
624 digits are read to form an escaped byte. Thus \123 is likely to be octal
625 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
626 value is greater than 377, the least significant 8 bits are taken. Inside a
627 character class, \ followed by a digit is always an octal number. */
628
629 case '1': case '2': case '3': case '4': case '5':
630 case '6': case '7': case '8': case '9':
631
632 if (!isclass)
633 {
634 oldptr = ptr;
635 c -= '0';
636 while ((digitab[ptr[1]] & ctype_digit) != 0)
637 c = c * 10 + *(++ptr) - '0';
638 if (c < 0) /* Integer overflow */
639 {
640 *errorcodeptr = ERR61;
641 break;
642 }
643 if (c < 10 || c <= bracount)
644 {
645 c = -(ESC_REF + c);
646 break;
647 }
648 ptr = oldptr; /* Put the pointer back and fall through */
649 }
650
651 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
652 generates a binary zero byte and treats the digit as a following literal.
653 Thus we have to pull back the pointer by one. */
654
655 if ((c = *ptr) >= '8')
656 {
657 ptr--;
658 c = 0;
659 break;
660 }
661
662 /* \0 always starts an octal number, but we may drop through to here with a
663 larger first octal digit. The original code used just to take the least
664 significant 8 bits of octal numbers (I think this is what early Perls used
665 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
666 than 3 octal digits. */
667
668 case '0':
669 c -= '0';
670 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
671 c = c * 8 + *(++ptr) - '0';
672 if (!utf8 && c > 255) *errorcodeptr = ERR51;
673 break;
674
675 /* \x is complicated. \x{ddd} is a character number which can be greater
676 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
677 treated as a data character. */
678
679 case 'x':
680 if (ptr[1] == '{')
681 {
682 const uschar *pt = ptr + 2;
683 int count = 0;
684
685 c = 0;
686 while ((digitab[*pt] & ctype_xdigit) != 0)
687 {
688 register int cc = *pt++;
689 if (c == 0 && cc == '0') continue; /* Leading zeroes */
690 count++;
691
692 #ifndef EBCDIC /* ASCII coding */
693 if (cc >= 'a') cc -= 32; /* Convert to upper case */
694 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
695 #else /* EBCDIC coding */
696 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
697 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
698 #endif
699 }
700
701 if (*pt == '}')
702 {
703 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
704 ptr = pt;
705 break;
706 }
707
708 /* If the sequence of hex digits does not end with '}', then we don't
709 recognize this construct; fall through to the normal \x handling. */
710 }
711
712 /* Read just a single-byte hex-defined char */
713
714 c = 0;
715 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
716 {
717 int cc; /* Some compilers don't like ++ */
718 cc = *(++ptr); /* in initializers */
719 #ifndef EBCDIC /* ASCII coding */
720 if (cc >= 'a') cc -= 32; /* Convert to upper case */
721 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
722 #else /* EBCDIC coding */
723 if (cc <= 'z') cc += 64; /* Convert to upper case */
724 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
725 #endif
726 }
727 break;
728
729 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
730 This coding is ASCII-specific, but then the whole concept of \cx is
731 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
732
733 case 'c':
734 c = *(++ptr);
735 if (c == 0)
736 {
737 *errorcodeptr = ERR2;
738 break;
739 }
740
741 #ifndef EBCDIC /* ASCII coding */
742 if (c >= 'a' && c <= 'z') c -= 32;
743 c ^= 0x40;
744 #else /* EBCDIC coding */
745 if (c >= 'a' && c <= 'z') c += 64;
746 c ^= 0xC0;
747 #endif
748 break;
749
750 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
751 other alphanumeric following \ is an error if PCRE_EXTRA was set;
752 otherwise, for Perl compatibility, it is a literal. This code looks a bit
753 odd, but there used to be some cases other than the default, and there may
754 be again in future, so I haven't "optimized" it. */
755
756 default:
757 if ((options & PCRE_EXTRA) != 0) switch(c)
758 {
759 default:
760 *errorcodeptr = ERR3;
761 break;
762 }
763 break;
764 }
765 }
766
767 *ptrptr = ptr;
768 return c;
769 }
770
771
772
773 #ifdef SUPPORT_UCP
774 /*************************************************
775 * Handle \P and \p *
776 *************************************************/
777
778 /* This function is called after \P or \p has been encountered, provided that
779 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
780 pointing at the P or p. On exit, it is pointing at the final character of the
781 escape sequence.
782
783 Argument:
784 ptrptr points to the pattern position pointer
785 negptr points to a boolean that is set TRUE for negation else FALSE
786 dptr points to an int that is set to the detailed property value
787 errorcodeptr points to the error code variable
788
789 Returns: type value from ucp_type_table, or -1 for an invalid type
790 */
791
792 static int
793 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
794 {
795 int c, i, bot, top;
796 const uschar *ptr = *ptrptr;
797 char name[32];
798
799 c = *(++ptr);
800 if (c == 0) goto ERROR_RETURN;
801
802 *negptr = FALSE;
803
804 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
805 negation. */
806
807 if (c == '{')
808 {
809 if (ptr[1] == '^')
810 {
811 *negptr = TRUE;
812 ptr++;
813 }
814 for (i = 0; i < (int)sizeof(name) - 1; i++)
815 {
816 c = *(++ptr);
817 if (c == 0) goto ERROR_RETURN;
818 if (c == '}') break;
819 name[i] = c;
820 }
821 if (c !='}') goto ERROR_RETURN;
822 name[i] = 0;
823 }
824
825 /* Otherwise there is just one following character */
826
827 else
828 {
829 name[0] = c;
830 name[1] = 0;
831 }
832
833 *ptrptr = ptr;
834
835 /* Search for a recognized property name using binary chop */
836
837 bot = 0;
838 top = _pcre_utt_size;
839
840 while (bot < top)
841 {
842 i = (bot + top) >> 1;
843 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
844 if (c == 0)
845 {
846 *dptr = _pcre_utt[i].value;
847 return _pcre_utt[i].type;
848 }
849 if (c > 0) bot = i + 1; else top = i;
850 }
851
852 *errorcodeptr = ERR47;
853 *ptrptr = ptr;
854 return -1;
855
856 ERROR_RETURN:
857 *errorcodeptr = ERR46;
858 *ptrptr = ptr;
859 return -1;
860 }
861 #endif
862
863
864
865
866 /*************************************************
867 * Check for counted repeat *
868 *************************************************/
869
870 /* This function is called when a '{' is encountered in a place where it might
871 start a quantifier. It looks ahead to see if it really is a quantifier or not.
872 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
873 where the ddds are digits.
874
875 Arguments:
876 p pointer to the first char after '{'
877
878 Returns: TRUE or FALSE
879 */
880
881 static BOOL
882 is_counted_repeat(const uschar *p)
883 {
884 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
885 while ((digitab[*p] & ctype_digit) != 0) p++;
886 if (*p == '}') return TRUE;
887
888 if (*p++ != ',') return FALSE;
889 if (*p == '}') return TRUE;
890
891 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
892 while ((digitab[*p] & ctype_digit) != 0) p++;
893
894 return (*p == '}');
895 }
896
897
898
899 /*************************************************
900 * Read repeat counts *
901 *************************************************/
902
903 /* Read an item of the form {n,m} and return the values. This is called only
904 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
905 so the syntax is guaranteed to be correct, but we need to check the values.
906
907 Arguments:
908 p pointer to first char after '{'
909 minp pointer to int for min
910 maxp pointer to int for max
911 returned as -1 if no max
912 errorcodeptr points to error code variable
913
914 Returns: pointer to '}' on success;
915 current ptr on error, with errorcodeptr set non-zero
916 */
917
918 static const uschar *
919 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
920 {
921 int min = 0;
922 int max = -1;
923
924 /* Read the minimum value and do a paranoid check: a negative value indicates
925 an integer overflow. */
926
927 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
928 if (min < 0 || min > 65535)
929 {
930 *errorcodeptr = ERR5;
931 return p;
932 }
933
934 /* Read the maximum value if there is one, and again do a paranoid on its size.
935 Also, max must not be less than min. */
936
937 if (*p == '}') max = min; else
938 {
939 if (*(++p) != '}')
940 {
941 max = 0;
942 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
943 if (max < 0 || max > 65535)
944 {
945 *errorcodeptr = ERR5;
946 return p;
947 }
948 if (max < min)
949 {
950 *errorcodeptr = ERR4;
951 return p;
952 }
953 }
954 }
955
956 /* Fill in the required variables, and pass back the pointer to the terminating
957 '}'. */
958
959 *minp = min;
960 *maxp = max;
961 return p;
962 }
963
964
965
966 /*************************************************
967 * Find forward referenced subpattern *
968 *************************************************/
969
970 /* This function scans along a pattern's text looking for capturing
971 subpatterns, and counting them. If it finds a named pattern that matches the
972 name it is given, it returns its number. Alternatively, if the name is NULL, it
973 returns when it reaches a given numbered subpattern. This is used for forward
974 references to subpatterns. We know that if (?P< is encountered, the name will
975 be terminated by '>' because that is checked in the first pass.
976
977 Arguments:
978 ptr current position in the pattern
979 cd compile background data
980 name name to seek, or NULL if seeking a numbered subpattern
981 lorn name length, or subpattern number if name is NULL
982 xmode TRUE if we are in /x mode
983
984 Returns: the number of the named subpattern, or -1 if not found
985 */
986
987 static int
988 find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn,
989 BOOL xmode)
990 {
991 const uschar *thisname;
992 int count = cd->bracount;
993
994 for (; *ptr != 0; ptr++)
995 {
996 int term;
997
998 /* Skip over backslashed characters and also entire \Q...\E */
999
1000 if (*ptr == '\\')
1001 {
1002 if (*(++ptr) == 0) return -1;
1003 if (*ptr == 'Q') for (;;)
1004 {
1005 while (*(++ptr) != 0 && *ptr != '\\');
1006 if (*ptr == 0) return -1;
1007 if (*(++ptr) == 'E') break;
1008 }
1009 continue;
1010 }
1011
1012 /* Skip over character classes; this logic must be similar to the way they
1013 are handled for real. If the first character is '^', skip it. Also, if the
1014 first few characters (either before or after ^) are \Q\E or \E we skip them
1015 too. This makes for compatibility with Perl. */
1016
1017 if (*ptr == '[')
1018 {
1019 BOOL negate_class = FALSE;
1020 for (;;)
1021 {
1022 int c = *(++ptr);
1023 if (c == '\\')
1024 {
1025 if (ptr[1] == 'E') ptr++;
1026 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
1027 else break;
1028 }
1029 else if (!negate_class && c == '^')
1030 negate_class = TRUE;
1031 else break;
1032 }
1033
1034 /* If the next character is ']', it is a data character that must be
1035 skipped, except in JavaScript compatibility mode. */
1036
1037 if (ptr[1] == ']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1038 ptr++;
1039
1040 while (*(++ptr) != ']')
1041 {
1042 if (*ptr == 0) return -1;
1043 if (*ptr == '\\')
1044 {
1045 if (*(++ptr) == 0) return -1;
1046 if (*ptr == 'Q') for (;;)
1047 {
1048 while (*(++ptr) != 0 && *ptr != '\\');
1049 if (*ptr == 0) return -1;
1050 if (*(++ptr) == 'E') break;
1051 }
1052 continue;
1053 }
1054 }
1055 continue;
1056 }
1057
1058 /* Skip comments in /x mode */
1059
1060 if (xmode && *ptr == '#')
1061 {
1062 while (*(++ptr) != 0 && *ptr != '\n');
1063 if (*ptr == 0) return -1;
1064 continue;
1065 }
1066
1067 /* An opening parens must now be a real metacharacter */
1068
1069 if (*ptr != '(') continue;
1070 if (ptr[1] != '?' && ptr[1] != '*')
1071 {
1072 count++;
1073 if (name == NULL && count == lorn) return count;
1074 continue;
1075 }
1076
1077 ptr += 2;
1078 if (*ptr == 'P') ptr++; /* Allow optional P */
1079
1080 /* We have to disambiguate (?<! and (?<= from (?<name> */
1081
1082 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1083 *ptr != '\'')
1084 continue;
1085
1086 count++;
1087
1088 if (name == NULL && count == lorn) return count;
1089 term = *ptr++;
1090 if (term == '<') term = '>';
1091 thisname = ptr;
1092 while (*ptr != term) ptr++;
1093 if (name != NULL && lorn == ptr - thisname &&
1094 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1095 return count;
1096 }
1097
1098 return -1;
1099 }
1100
1101
1102
1103 /*************************************************
1104 * Find first significant op code *
1105 *************************************************/
1106
1107 /* This is called by several functions that scan a compiled expression looking
1108 for a fixed first character, or an anchoring op code etc. It skips over things
1109 that do not influence this. For some calls, a change of option is important.
1110 For some calls, it makes sense to skip negative forward and all backward
1111 assertions, and also the \b assertion; for others it does not.
1112
1113 Arguments:
1114 code pointer to the start of the group
1115 options pointer to external options
1116 optbit the option bit whose changing is significant, or
1117 zero if none are
1118 skipassert TRUE if certain assertions are to be skipped
1119
1120 Returns: pointer to the first significant opcode
1121 */
1122
1123 static const uschar*
1124 first_significant_code(const uschar *code, int *options, int optbit,
1125 BOOL skipassert)
1126 {
1127 for (;;)
1128 {
1129 switch ((int)*code)
1130 {
1131 case OP_OPT:
1132 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1133 *options = (int)code[1];
1134 code += 2;
1135 break;
1136
1137 case OP_ASSERT_NOT:
1138 case OP_ASSERTBACK:
1139 case OP_ASSERTBACK_NOT:
1140 if (!skipassert) return code;
1141 do code += GET(code, 1); while (*code == OP_ALT);
1142 code += _pcre_OP_lengths[*code];
1143 break;
1144
1145 case OP_WORD_BOUNDARY:
1146 case OP_NOT_WORD_BOUNDARY:
1147 if (!skipassert) return code;
1148 /* Fall through */
1149
1150 case OP_CALLOUT:
1151 case OP_CREF:
1152 case OP_RREF:
1153 case OP_DEF:
1154 code += _pcre_OP_lengths[*code];
1155 break;
1156
1157 default:
1158 return code;
1159 }
1160 }
1161 /* Control never reaches here */
1162 }
1163
1164
1165
1166
1167 /*************************************************
1168 * Find the fixed length of a pattern *
1169 *************************************************/
1170
1171 /* Scan a pattern and compute the fixed length of subject that will match it,
1172 if the length is fixed. This is needed for dealing with backward assertions.
1173 In UTF8 mode, the result is in characters rather than bytes.
1174
1175 Arguments:
1176 code points to the start of the pattern (the bracket)
1177 options the compiling options
1178
1179 Returns: the fixed length, or -1 if there is no fixed length,
1180 or -2 if \C was encountered
1181 */
1182
1183 static int
1184 find_fixedlength(uschar *code, int options)
1185 {
1186 int length = -1;
1187
1188 register int branchlength = 0;
1189 register uschar *cc = code + 1 + LINK_SIZE;
1190
1191 /* Scan along the opcodes for this branch. If we get to the end of the
1192 branch, check the length against that of the other branches. */
1193
1194 for (;;)
1195 {
1196 int d;
1197 register int op = *cc;
1198 switch (op)
1199 {
1200 case OP_CBRA:
1201 case OP_BRA:
1202 case OP_ONCE:
1203 case OP_COND:
1204 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1205 if (d < 0) return d;
1206 branchlength += d;
1207 do cc += GET(cc, 1); while (*cc == OP_ALT);
1208 cc += 1 + LINK_SIZE;
1209 break;
1210
1211 /* Reached end of a branch; if it's a ket it is the end of a nested
1212 call. If it's ALT it is an alternation in a nested call. If it is
1213 END it's the end of the outer call. All can be handled by the same code. */
1214
1215 case OP_ALT:
1216 case OP_KET:
1217 case OP_KETRMAX:
1218 case OP_KETRMIN:
1219 case OP_END:
1220 if (length < 0) length = branchlength;
1221 else if (length != branchlength) return -1;
1222 if (*cc != OP_ALT) return length;
1223 cc += 1 + LINK_SIZE;
1224 branchlength = 0;
1225 break;
1226
1227 /* Skip over assertive subpatterns */
1228
1229 case OP_ASSERT:
1230 case OP_ASSERT_NOT:
1231 case OP_ASSERTBACK:
1232 case OP_ASSERTBACK_NOT:
1233 do cc += GET(cc, 1); while (*cc == OP_ALT);
1234 /* Fall through */
1235
1236 /* Skip over things that don't match chars */
1237
1238 case OP_REVERSE:
1239 case OP_CREF:
1240 case OP_RREF:
1241 case OP_DEF:
1242 case OP_OPT:
1243 case OP_CALLOUT:
1244 case OP_SOD:
1245 case OP_SOM:
1246 case OP_EOD:
1247 case OP_EODN:
1248 case OP_CIRC:
1249 case OP_DOLL:
1250 case OP_NOT_WORD_BOUNDARY:
1251 case OP_WORD_BOUNDARY:
1252 cc += _pcre_OP_lengths[*cc];
1253 break;
1254
1255 /* Handle literal characters */
1256
1257 case OP_CHAR:
1258 case OP_CHARNC:
1259 case OP_NOT:
1260 branchlength++;
1261 cc += 2;
1262 #ifdef SUPPORT_UTF8
1263 if ((options & PCRE_UTF8) != 0)
1264 {
1265 while ((*cc & 0xc0) == 0x80) cc++;
1266 }
1267 #endif
1268 break;
1269
1270 /* Handle exact repetitions. The count is already in characters, but we
1271 need to skip over a multibyte character in UTF8 mode. */
1272
1273 case OP_EXACT:
1274 branchlength += GET2(cc,1);
1275 cc += 4;
1276 #ifdef SUPPORT_UTF8
1277 if ((options & PCRE_UTF8) != 0)
1278 {
1279 while((*cc & 0x80) == 0x80) cc++;
1280 }
1281 #endif
1282 break;
1283
1284 case OP_TYPEEXACT:
1285 branchlength += GET2(cc,1);
1286 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1287 cc += 4;
1288 break;
1289
1290 /* Handle single-char matchers */
1291
1292 case OP_PROP:
1293 case OP_NOTPROP:
1294 cc += 2;
1295 /* Fall through */
1296
1297 case OP_NOT_DIGIT:
1298 case OP_DIGIT:
1299 case OP_NOT_WHITESPACE:
1300 case OP_WHITESPACE:
1301 case OP_NOT_WORDCHAR:
1302 case OP_WORDCHAR:
1303 case OP_ANY:
1304 branchlength++;
1305 cc++;
1306 break;
1307
1308 /* The single-byte matcher isn't allowed */
1309
1310 case OP_ANYBYTE:
1311 return -2;
1312
1313 /* Check a class for variable quantification */
1314
1315 #ifdef SUPPORT_UTF8
1316 case OP_XCLASS:
1317 cc += GET(cc, 1) - 33;
1318 /* Fall through */
1319 #endif
1320
1321 case OP_CLASS:
1322 case OP_NCLASS:
1323 cc += 33;
1324
1325 switch (*cc)
1326 {
1327 case OP_CRSTAR:
1328 case OP_CRMINSTAR:
1329 case OP_CRQUERY:
1330 case OP_CRMINQUERY:
1331 return -1;
1332
1333 case OP_CRRANGE:
1334 case OP_CRMINRANGE:
1335 if (GET2(cc,1) != GET2(cc,3)) return -1;
1336 branchlength += GET2(cc,1);
1337 cc += 5;
1338 break;
1339
1340 default:
1341 branchlength++;
1342 }
1343 break;
1344
1345 /* Anything else is variable length */
1346
1347 default:
1348 return -1;
1349 }
1350 }
1351 /* Control never gets here */
1352 }
1353
1354
1355
1356
1357 /*************************************************
1358 * Scan compiled regex for numbered bracket *
1359 *************************************************/
1360
1361 /* This little function scans through a compiled pattern until it finds a
1362 capturing bracket with the given number.
1363
1364 Arguments:
1365 code points to start of expression
1366 utf8 TRUE in UTF-8 mode
1367 number the required bracket number
1368
1369 Returns: pointer to the opcode for the bracket, or NULL if not found
1370 */
1371
1372 static const uschar *
1373 find_bracket(const uschar *code, BOOL utf8, int number)
1374 {
1375 for (;;)
1376 {
1377 register int c = *code;
1378 if (c == OP_END) return NULL;
1379
1380 /* XCLASS is used for classes that cannot be represented just by a bit
1381 map. This includes negated single high-valued characters. The length in
1382 the table is zero; the actual length is stored in the compiled code. */
1383
1384 if (c == OP_XCLASS) code += GET(code, 1);
1385
1386 /* Handle capturing bracket */
1387
1388 else if (c == OP_CBRA)
1389 {
1390 int n = GET2(code, 1+LINK_SIZE);
1391 if (n == number) return (uschar *)code;
1392 code += _pcre_OP_lengths[c];
1393 }
1394
1395 /* Otherwise, we can get the item's length from the table, except that for
1396 repeated character types, we have to test for \p and \P, which have an extra
1397 two bytes of parameters. */
1398
1399 else
1400 {
1401 switch(c)
1402 {
1403 case OP_TYPESTAR:
1404 case OP_TYPEMINSTAR:
1405 case OP_TYPEPLUS:
1406 case OP_TYPEMINPLUS:
1407 case OP_TYPEQUERY:
1408 case OP_TYPEMINQUERY:
1409 case OP_TYPEPOSSTAR:
1410 case OP_TYPEPOSPLUS:
1411 case OP_TYPEPOSQUERY:
1412 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1413 break;
1414
1415 case OP_TYPEUPTO:
1416 case OP_TYPEMINUPTO:
1417 case OP_TYPEEXACT:
1418 case OP_TYPEPOSUPTO:
1419 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1420 break;
1421 }
1422
1423 /* Add in the fixed length from the table */
1424
1425 code += _pcre_OP_lengths[c];
1426
1427 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1428 a multi-byte character. The length in the table is a minimum, so we have to
1429 arrange to skip the extra bytes. */
1430
1431 #ifdef SUPPORT_UTF8
1432 if (utf8) switch(c)
1433 {
1434 case OP_CHAR:
1435 case OP_CHARNC:
1436 case OP_EXACT:
1437 case OP_UPTO:
1438 case OP_MINUPTO:
1439 case OP_POSUPTO:
1440 case OP_STAR:
1441 case OP_MINSTAR:
1442 case OP_POSSTAR:
1443 case OP_PLUS:
1444 case OP_MINPLUS:
1445 case OP_POSPLUS:
1446 case OP_QUERY:
1447 case OP_MINQUERY:
1448 case OP_POSQUERY:
1449 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1450 break;
1451 }
1452 #endif
1453 }
1454 }
1455 }
1456
1457
1458
1459 /*************************************************
1460 * Scan compiled regex for recursion reference *
1461 *************************************************/
1462
1463 /* This little function scans through a compiled pattern until it finds an
1464 instance of OP_RECURSE.
1465
1466 Arguments:
1467 code points to start of expression
1468 utf8 TRUE in UTF-8 mode
1469
1470 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1471 */
1472
1473 static const uschar *
1474 find_recurse(const uschar *code, BOOL utf8)
1475 {
1476 for (;;)
1477 {
1478 register int c = *code;
1479 if (c == OP_END) return NULL;
1480 if (c == OP_RECURSE) return code;
1481
1482 /* XCLASS is used for classes that cannot be represented just by a bit
1483 map. This includes negated single high-valued characters. The length in
1484 the table is zero; the actual length is stored in the compiled code. */
1485
1486 if (c == OP_XCLASS) code += GET(code, 1);
1487
1488 /* Otherwise, we can get the item's length from the table, except that for
1489 repeated character types, we have to test for \p and \P, which have an extra
1490 two bytes of parameters. */
1491
1492 else
1493 {
1494 switch(c)
1495 {
1496 case OP_TYPESTAR:
1497 case OP_TYPEMINSTAR:
1498 case OP_TYPEPLUS:
1499 case OP_TYPEMINPLUS:
1500 case OP_TYPEQUERY:
1501 case OP_TYPEMINQUERY:
1502 case OP_TYPEPOSSTAR:
1503 case OP_TYPEPOSPLUS:
1504 case OP_TYPEPOSQUERY:
1505 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1506 break;
1507
1508 case OP_TYPEPOSUPTO:
1509 case OP_TYPEUPTO:
1510 case OP_TYPEMINUPTO:
1511 case OP_TYPEEXACT:
1512 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1513 break;
1514 }
1515
1516 /* Add in the fixed length from the table */
1517
1518 code += _pcre_OP_lengths[c];
1519
1520 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1521 by a multi-byte character. The length in the table is a minimum, so we have
1522 to arrange to skip the extra bytes. */
1523
1524 #ifdef SUPPORT_UTF8
1525 if (utf8) switch(c)
1526 {
1527 case OP_CHAR:
1528 case OP_CHARNC:
1529 case OP_EXACT:
1530 case OP_UPTO:
1531 case OP_MINUPTO:
1532 case OP_POSUPTO:
1533 case OP_STAR:
1534 case OP_MINSTAR:
1535 case OP_POSSTAR:
1536 case OP_PLUS:
1537 case OP_MINPLUS:
1538 case OP_POSPLUS:
1539 case OP_QUERY:
1540 case OP_MINQUERY:
1541 case OP_POSQUERY:
1542 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1543 break;
1544 }
1545 #endif
1546 }
1547 }
1548 }
1549
1550
1551
1552 /*************************************************
1553 * Scan compiled branch for non-emptiness *
1554 *************************************************/
1555
1556 /* This function scans through a branch of a compiled pattern to see whether it
1557 can match the empty string or not. It is called from could_be_empty()
1558 below and from compile_branch() when checking for an unlimited repeat of a
1559 group that can match nothing. Note that first_significant_code() skips over
1560 backward and negative forward assertions when its final argument is TRUE. If we
1561 hit an unclosed bracket, we return "empty" - this means we've struck an inner
1562 bracket whose current branch will already have been scanned.
1563
1564 Arguments:
1565 code points to start of search
1566 endcode points to where to stop
1567 utf8 TRUE if in UTF8 mode
1568
1569 Returns: TRUE if what is matched could be empty
1570 */
1571
1572 static BOOL
1573 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1574 {
1575 register int c;
1576 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1577 code < endcode;
1578 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1579 {
1580 const uschar *ccode;
1581
1582 c = *code;
1583
1584 /* Skip over forward assertions; the other assertions are skipped by
1585 first_significant_code() with a TRUE final argument. */
1586
1587 if (c == OP_ASSERT)
1588 {
1589 do code += GET(code, 1); while (*code == OP_ALT);
1590 c = *code;
1591 continue;
1592 }
1593
1594 /* Groups with zero repeats can of course be empty; skip them. */
1595
1596 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1597 {
1598 code += _pcre_OP_lengths[c];
1599 do code += GET(code, 1); while (*code == OP_ALT);
1600 c = *code;
1601 continue;
1602 }
1603
1604 /* For other groups, scan the branches. */
1605
1606 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1607 {
1608 BOOL empty_branch;
1609 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1610
1611 /* Scan a closed bracket */
1612
1613 empty_branch = FALSE;
1614 do
1615 {
1616 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1617 empty_branch = TRUE;
1618 code += GET(code, 1);
1619 }
1620 while (*code == OP_ALT);
1621 if (!empty_branch) return FALSE; /* All branches are non-empty */
1622 c = *code;
1623 continue;
1624 }
1625
1626 /* Handle the other opcodes */
1627
1628 switch (c)
1629 {
1630 /* Check for quantifiers after a class. XCLASS is used for classes that
1631 cannot be represented just by a bit map. This includes negated single
1632 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1633 actual length is stored in the compiled code, so we must update "code"
1634 here. */
1635
1636 #ifdef SUPPORT_UTF8
1637 case OP_XCLASS:
1638 ccode = code += GET(code, 1);
1639 goto CHECK_CLASS_REPEAT;
1640 #endif
1641
1642 case OP_CLASS:
1643 case OP_NCLASS:
1644 ccode = code + 33;
1645
1646 #ifdef SUPPORT_UTF8
1647 CHECK_CLASS_REPEAT:
1648 #endif
1649
1650 switch (*ccode)
1651 {
1652 case OP_CRSTAR: /* These could be empty; continue */
1653 case OP_CRMINSTAR:
1654 case OP_CRQUERY:
1655 case OP_CRMINQUERY:
1656 break;
1657
1658 default: /* Non-repeat => class must match */
1659 case OP_CRPLUS: /* These repeats aren't empty */
1660 case OP_CRMINPLUS:
1661 return FALSE;
1662
1663 case OP_CRRANGE:
1664 case OP_CRMINRANGE:
1665 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1666 break;
1667 }
1668 break;
1669
1670 /* Opcodes that must match a character */
1671
1672 case OP_PROP:
1673 case OP_NOTPROP:
1674 case OP_EXTUNI:
1675 case OP_NOT_DIGIT:
1676 case OP_DIGIT:
1677 case OP_NOT_WHITESPACE:
1678 case OP_WHITESPACE:
1679 case OP_NOT_WORDCHAR:
1680 case OP_WORDCHAR:
1681 case OP_ANY:
1682 case OP_ANYBYTE:
1683 case OP_CHAR:
1684 case OP_CHARNC:
1685 case OP_NOT:
1686 case OP_PLUS:
1687 case OP_MINPLUS:
1688 case OP_POSPLUS:
1689 case OP_EXACT:
1690 case OP_NOTPLUS:
1691 case OP_NOTMINPLUS:
1692 case OP_NOTPOSPLUS:
1693 case OP_NOTEXACT:
1694 case OP_TYPEPLUS:
1695 case OP_TYPEMINPLUS:
1696 case OP_TYPEPOSPLUS:
1697 case OP_TYPEEXACT:
1698 return FALSE;
1699
1700 /* These are going to continue, as they may be empty, but we have to
1701 fudge the length for the \p and \P cases. */
1702
1703 case OP_TYPESTAR:
1704 case OP_TYPEMINSTAR:
1705 case OP_TYPEPOSSTAR:
1706 case OP_TYPEQUERY:
1707 case OP_TYPEMINQUERY:
1708 case OP_TYPEPOSQUERY:
1709 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1710 break;
1711
1712 /* Same for these */
1713
1714 case OP_TYPEUPTO:
1715 case OP_TYPEMINUPTO:
1716 case OP_TYPEPOSUPTO:
1717 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1718 break;
1719
1720 /* End of branch */
1721
1722 case OP_KET:
1723 case OP_KETRMAX:
1724 case OP_KETRMIN:
1725 case OP_ALT:
1726 return TRUE;
1727
1728 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1729 MINUPTO, and POSUPTO may be followed by a multibyte character */
1730
1731 #ifdef SUPPORT_UTF8
1732 case OP_STAR:
1733 case OP_MINSTAR:
1734 case OP_POSSTAR:
1735 case OP_QUERY:
1736 case OP_MINQUERY:
1737 case OP_POSQUERY:
1738 case OP_UPTO:
1739 case OP_MINUPTO:
1740 case OP_POSUPTO:
1741 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1742 break;
1743 #endif
1744 }
1745 }
1746
1747 return TRUE;
1748 }
1749
1750
1751
1752 /*************************************************
1753 * Scan compiled regex for non-emptiness *
1754 *************************************************/
1755
1756 /* This function is called to check for left recursive calls. We want to check
1757 the current branch of the current pattern to see if it could match the empty
1758 string. If it could, we must look outwards for branches at other levels,
1759 stopping when we pass beyond the bracket which is the subject of the recursion.
1760
1761 Arguments:
1762 code points to start of the recursion
1763 endcode points to where to stop (current RECURSE item)
1764 bcptr points to the chain of current (unclosed) branch starts
1765 utf8 TRUE if in UTF-8 mode
1766
1767 Returns: TRUE if what is matched could be empty
1768 */
1769
1770 static BOOL
1771 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1772 BOOL utf8)
1773 {
1774 while (bcptr != NULL && bcptr->current >= code)
1775 {
1776 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1777 bcptr = bcptr->outer;
1778 }
1779 return TRUE;
1780 }
1781
1782
1783
1784 /*************************************************
1785 * Check for POSIX class syntax *
1786 *************************************************/
1787
1788 /* This function is called when the sequence "[:" or "[." or "[=" is
1789 encountered in a character class. It checks whether this is followed by a
1790 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1791 reach an unescaped ']' without the special preceding character, return FALSE.
1792
1793 Originally, this function only recognized a sequence of letters between the
1794 terminators, but it seems that Perl recognizes any sequence of characters,
1795 though of course unknown POSIX names are subsequently rejected. Perl gives an
1796 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1797 didn't consider this to be a POSIX class. Likewise for [:1234:].
1798
1799 The problem in trying to be exactly like Perl is in the handling of escapes. We
1800 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1801 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1802 below handles the special case of \], but does not try to do any other escape
1803 processing. This makes it different from Perl for cases such as [:l\ower:]
1804 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1805 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
1806 I think.
1807
1808 Arguments:
1809 ptr pointer to the initial [
1810 endptr where to return the end pointer
1811
1812 Returns: TRUE or FALSE
1813 */
1814
1815 static BOOL
1816 check_posix_syntax(const uschar *ptr, const uschar **endptr)
1817 {
1818 int terminator; /* Don't combine these lines; the Solaris cc */
1819 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1820 for (++ptr; *ptr != 0; ptr++)
1821 {
1822 if (*ptr == '\\' && ptr[1] == ']') ptr++; else
1823 {
1824 if (*ptr == ']') return FALSE;
1825 if (*ptr == terminator && ptr[1] == ']')
1826 {
1827 *endptr = ptr;
1828 return TRUE;
1829 }
1830 }
1831 }
1832 return FALSE;
1833 }
1834
1835
1836
1837
1838 /*************************************************
1839 * Check POSIX class name *
1840 *************************************************/
1841
1842 /* This function is called to check the name given in a POSIX-style class entry
1843 such as [:alnum:].
1844
1845 Arguments:
1846 ptr points to the first letter
1847 len the length of the name
1848
1849 Returns: a value representing the name, or -1 if unknown
1850 */
1851
1852 static int
1853 check_posix_name(const uschar *ptr, int len)
1854 {
1855 const char *pn = posix_names;
1856 register int yield = 0;
1857 while (posix_name_lengths[yield] != 0)
1858 {
1859 if (len == posix_name_lengths[yield] &&
1860 strncmp((const char *)ptr, pn, len) == 0) return yield;
1861 pn += posix_name_lengths[yield] + 1;
1862 yield++;
1863 }
1864 return -1;
1865 }
1866
1867
1868 /*************************************************
1869 * Adjust OP_RECURSE items in repeated group *
1870 *************************************************/
1871
1872 /* OP_RECURSE items contain an offset from the start of the regex to the group
1873 that is referenced. This means that groups can be replicated for fixed
1874 repetition simply by copying (because the recursion is allowed to refer to
1875 earlier groups that are outside the current group). However, when a group is
1876 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
1877 inserted before it, after it has been compiled. This means that any OP_RECURSE
1878 items within it that refer to the group itself or any contained groups have to
1879 have their offsets adjusted. That one of the jobs of this function. Before it
1880 is called, the partially compiled regex must be temporarily terminated with
1881 OP_END.
1882
1883 This function has been extended with the possibility of forward references for
1884 recursions and subroutine calls. It must also check the list of such references
1885 for the group we are dealing with. If it finds that one of the recursions in
1886 the current group is on this list, it adjusts the offset in the list, not the
1887 value in the reference (which is a group number).
1888
1889 Arguments:
1890 group points to the start of the group
1891 adjust the amount by which the group is to be moved
1892 utf8 TRUE in UTF-8 mode
1893 cd contains pointers to tables etc.
1894 save_hwm the hwm forward reference pointer at the start of the group
1895
1896 Returns: nothing
1897 */
1898
1899 static void
1900 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1901 uschar *save_hwm)
1902 {
1903 uschar *ptr = group;
1904
1905 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1906 {
1907 int offset;
1908 uschar *hc;
1909
1910 /* See if this recursion is on the forward reference list. If so, adjust the
1911 reference. */
1912
1913 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1914 {
1915 offset = GET(hc, 0);
1916 if (cd->start_code + offset == ptr + 1)
1917 {
1918 PUT(hc, 0, offset + adjust);
1919 break;
1920 }
1921 }
1922
1923 /* Otherwise, adjust the recursion offset if it's after the start of this
1924 group. */
1925
1926 if (hc >= cd->hwm)
1927 {
1928 offset = GET(ptr, 1);
1929 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1930 }
1931
1932 ptr += 1 + LINK_SIZE;
1933 }
1934 }
1935
1936
1937
1938 /*************************************************
1939 * Insert an automatic callout point *
1940 *************************************************/
1941
1942 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1943 callout points before each pattern item.
1944
1945 Arguments:
1946 code current code pointer
1947 ptr current pattern pointer
1948 cd pointers to tables etc
1949
1950 Returns: new code pointer
1951 */
1952
1953 static uschar *
1954 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1955 {
1956 *code++ = OP_CALLOUT;
1957 *code++ = 255;
1958 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1959 PUT(code, LINK_SIZE, 0); /* Default length */
1960 return code + 2*LINK_SIZE;
1961 }
1962
1963
1964
1965 /*************************************************
1966 * Complete a callout item *
1967 *************************************************/
1968
1969 /* A callout item contains the length of the next item in the pattern, which
1970 we can't fill in till after we have reached the relevant point. This is used
1971 for both automatic and manual callouts.
1972
1973 Arguments:
1974 previous_callout points to previous callout item
1975 ptr current pattern pointer
1976 cd pointers to tables etc
1977
1978 Returns: nothing
1979 */
1980
1981 static void
1982 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1983 {
1984 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1985 PUT(previous_callout, 2 + LINK_SIZE, length);
1986 }
1987
1988
1989
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the start and end of a class range (UTF-8 mode with UCP support),
this function walks upwards through the characters looking for a run whose
"other case" values are consecutive. Each call hands back one such run and
advances the caller's start value past the characters it consumed, so that
repeated calls enumerate all the runs in the range.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
              (nothing is written through the pointers when FALSE)
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int ch = *cptr;
unsigned int first_other = NOTACHAR;
unsigned int expected;

/* Find the first character in the range that has an other case. */

while (ch <= d)
  {
  first_other = _pcre_ucp_othercase(ch);
  if (first_other != NOTACHAR) break;
  ch++;
  }

if (ch > d) return FALSE;

/* Extend the run for as long as the other-case values stay consecutive. */

*ocptr = first_other;
expected = first_other + 1;
ch++;

while (ch <= d && _pcre_ucp_othercase(ch) == expected)
  {
  expected++;
  ch++;
  }

*odptr = expected - 1;
*cptr = ch;

return TRUE;
}
#endif  /* SUPPORT_UCP */
2035
2036
2037
2038 /*************************************************
2039 * Check if auto-possessifying is possible *
2040 *************************************************/
2041
2042 /* This function is called for unlimited repeats of certain items, to see
2043 whether the next thing could possibly match the repeated item. If not, it makes
2044 sense to automatically possessify the repeated item.
2045
2046 Arguments:
2047 op_code the repeated op code
2048 this data for this item, depends on the opcode
2049 utf8 TRUE in UTF-8 mode
2050 utf8_char used for utf8 character bytes, NULL if not relevant
2051 ptr next character in pattern
2052 options options bits
2053 cd contains pointers to tables etc.
2054
2055 Returns: TRUE if possessifying is wanted
2056 */
2057
2058 static BOOL
2059 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2060 const uschar *ptr, int options, compile_data *cd)
2061 {
2062 int next;
2063
2064 /* Skip whitespace and comments in extended mode */
2065
2066 if ((options & PCRE_EXTENDED) != 0)
2067 {
2068 for (;;)
2069 {
2070 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2071 if (*ptr == '#')
2072 {
2073 while (*(++ptr) != 0)
2074 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2075 }
2076 else break;
2077 }
2078 }
2079
2080 /* If the next item is one that we can handle, get its value. A non-negative
2081 value is a character, a negative value is an escape value. */
2082
2083 if (*ptr == '\\')
2084 {
2085 int temperrorcode = 0;
2086 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2087 if (temperrorcode != 0) return FALSE;
2088 ptr++; /* Point after the escape sequence */
2089 }
2090
2091 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2092 {
2093 #ifdef SUPPORT_UTF8
2094 if (utf8) { GETCHARINC(next, ptr); } else
2095 #endif
2096 next = *ptr++;
2097 }
2098
2099 else return FALSE;
2100
2101 /* Skip whitespace and comments in extended mode */
2102
2103 if ((options & PCRE_EXTENDED) != 0)
2104 {
2105 for (;;)
2106 {
2107 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2108 if (*ptr == '#')
2109 {
2110 while (*(++ptr) != 0)
2111 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2112 }
2113 else break;
2114 }
2115 }
2116
2117 /* If the next thing is itself optional, we have to give up. */
2118
2119 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2120 return FALSE;
2121
2122 /* Now compare the next item with the previous opcode. If the previous is a
2123 positive single character match, "item" either contains the character or, if
2124 "item" is greater than 127 in utf8 mode, the character's bytes are in
2125 utf8_char. */
2126
2127
2128 /* Handle cases when the next item is a character. */
2129
2130 if (next >= 0) switch(op_code)
2131 {
2132 case OP_CHAR:
2133 #ifdef SUPPORT_UTF8
2134 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2135 #endif
2136 return item != next;
2137
2138 /* For CHARNC (caseless character) we must check the other case. If we have
2139 Unicode property support, we can use it to test the other case of
2140 high-valued characters. */
2141
2142 case OP_CHARNC:
2143 #ifdef SUPPORT_UTF8
2144 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2145 #endif
2146 if (item == next) return FALSE;
2147 #ifdef SUPPORT_UTF8
2148 if (utf8)
2149 {
2150 unsigned int othercase;
2151 if (next < 128) othercase = cd->fcc[next]; else
2152 #ifdef SUPPORT_UCP
2153 othercase = _pcre_ucp_othercase((unsigned int)next);
2154 #else
2155 othercase = NOTACHAR;
2156 #endif
2157 return (unsigned int)item != othercase;
2158 }
2159 else
2160 #endif /* SUPPORT_UTF8 */
2161 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2162
2163 /* For OP_NOT, "item" must be a single-byte character. */
2164
2165 case OP_NOT:
2166 if (item == next) return TRUE;
2167 if ((options & PCRE_CASELESS) == 0) return FALSE;
2168 #ifdef SUPPORT_UTF8
2169 if (utf8)
2170 {
2171 unsigned int othercase;
2172 if (next < 128) othercase = cd->fcc[next]; else
2173 #ifdef SUPPORT_UCP
2174 othercase = _pcre_ucp_othercase(next);
2175 #else
2176 othercase = NOTACHAR;
2177 #endif
2178 return (unsigned int)item == othercase;
2179 }
2180 else
2181 #endif /* SUPPORT_UTF8 */
2182 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2183
2184 case OP_DIGIT:
2185 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2186
2187 case OP_NOT_DIGIT:
2188 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2189
2190 case OP_WHITESPACE:
2191 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2192
2193 case OP_NOT_WHITESPACE:
2194 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2195
2196 case OP_WORDCHAR:
2197 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2198
2199 case OP_NOT_WORDCHAR:
2200 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2201
2202 case OP_HSPACE:
2203 case OP_NOT_HSPACE:
2204 switch(next)
2205 {
2206 case 0x09:
2207 case 0x20:
2208 case 0xa0:
2209 case 0x1680:
2210 case 0x180e:
2211 case 0x2000:
2212 case 0x2001:
2213 case 0x2002:
2214 case 0x2003:
2215 case 0x2004:
2216 case 0x2005:
2217 case 0x2006:
2218 case 0x2007:
2219 case 0x2008:
2220 case 0x2009:
2221 case 0x200A:
2222 case 0x202f:
2223 case 0x205f:
2224 case 0x3000:
2225 return op_code != OP_HSPACE;
2226 default:
2227 return op_code == OP_HSPACE;
2228 }
2229
2230 case OP_VSPACE:
2231 case OP_NOT_VSPACE:
2232 switch(next)
2233 {
2234 case 0x0a:
2235 case 0x0b:
2236 case 0x0c:
2237 case 0x0d:
2238 case 0x85:
2239 case 0x2028:
2240 case 0x2029:
2241 return op_code != OP_VSPACE;
2242 default:
2243 return op_code == OP_VSPACE;
2244 }
2245
2246 default:
2247 return FALSE;
2248 }
2249
2250
2251 /* Handle the case when the next item is \d, \s, etc. */
2252
2253 switch(op_code)
2254 {
2255 case OP_CHAR:
2256 case OP_CHARNC:
2257 #ifdef SUPPORT_UTF8
2258 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2259 #endif
2260 switch(-next)
2261 {
2262 case ESC_d:
2263 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2264
2265 case ESC_D:
2266 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2267
2268 case ESC_s:
2269 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2270
2271 case ESC_S:
2272 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2273
2274 case ESC_w:
2275 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2276
2277 case ESC_W:
2278 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2279
2280 case ESC_h:
2281 case ESC_H:
2282 switch(item)
2283 {
2284 case 0x09:
2285 case 0x20:
2286 case 0xa0:
2287 case 0x1680:
2288 case 0x180e:
2289 case 0x2000:
2290 case 0x2001:
2291 case 0x2002:
2292 case 0x2003:
2293 case 0x2004:
2294 case 0x2005:
2295 case 0x2006:
2296 case 0x2007:
2297 case 0x2008:
2298 case 0x2009:
2299 case 0x200A:
2300 case 0x202f:
2301 case 0x205f:
2302 case 0x3000:
2303 return -next != ESC_h;
2304 default:
2305 return -next == ESC_h;
2306 }
2307
2308 case ESC_v:
2309 case ESC_V:
2310 switch(item)
2311 {
2312 case 0x0a:
2313 case 0x0b:
2314 case 0x0c:
2315 case 0x0d:
2316 case 0x85:
2317 case 0x2028:
2318 case 0x2029:
2319 return -next != ESC_v;
2320 default:
2321 return -next == ESC_v;
2322 }
2323
2324 default:
2325 return FALSE;
2326 }
2327
2328 case OP_DIGIT:
2329 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2330 next == -ESC_h || next == -ESC_v;
2331
2332 case OP_NOT_DIGIT:
2333 return next == -ESC_d;
2334
2335 case OP_WHITESPACE:
2336 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2337
2338 case OP_NOT_WHITESPACE:
2339 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2340
2341 case OP_HSPACE:
2342 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2343
2344 case OP_NOT_HSPACE:
2345 return next == -ESC_h;
2346
2347 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2348 case OP_VSPACE:
2349 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2350
2351 case OP_NOT_VSPACE:
2352 return next == -ESC_v;
2353
2354 case OP_WORDCHAR:
2355 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2356
2357 case OP_NOT_WORDCHAR:
2358 return next == -ESC_w || next == -ESC_d;
2359
2360 default:
2361 return FALSE;
2362 }
2363
2364 /* Control does not reach here */
2365 }
2366
2367
2368
2369 /*************************************************
2370 * Compile one branch *
2371 *************************************************/
2372
2373 /* Scan the pattern, compiling it into a vector. If the options are
2374 changed during the branch, the pointer is used to change the external options
2375 bits. This function is used during the pre-compile phase when we are trying
2376 to find out the amount of memory needed, as well as during the real compile
2377 phase. The value of lengthptr distinguishes the two phases.
2378
2379 Arguments:
2380 optionsptr pointer to the option bits
2381 codeptr points to the pointer to the current code point
2382 ptrptr points to the current pattern pointer
2383 errorcodeptr points to error code variable
2384 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2385 reqbyteptr set to the last literal character required, else < 0
2386 bcptr points to current branch chain
2387 cd contains pointers to tables etc.
2388 lengthptr NULL during the real compile phase
2389 points to length accumulator during pre-compile phase
2390
2391 Returns: TRUE on success
2392 FALSE, with *errorcodeptr set non-zero on error
2393 */
2394
2395 static BOOL
2396 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2397 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2398 compile_data *cd, int *lengthptr)
2399 {
2400 int repeat_type, op_type;
2401 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2402 int bravalue = 0;
2403 int greedy_default, greedy_non_default;
2404 int firstbyte, reqbyte;
2405 int zeroreqbyte, zerofirstbyte;
2406 int req_caseopt, reqvary, tempreqvary;
2407 int options = *optionsptr;
2408 int after_manual_callout = 0;
2409 int length_prevgroup = 0;
2410 register int c;
2411 register uschar *code = *codeptr;
2412 uschar *last_code = code;
2413 uschar *orig_code = code;
2414 uschar *tempcode;
2415 BOOL inescq = FALSE;
2416 BOOL groupsetfirstbyte = FALSE;
2417 const uschar *ptr = *ptrptr;
2418 const uschar *tempptr;
2419 uschar *previous = NULL;
2420 uschar *previous_callout = NULL;
2421 uschar *save_hwm = NULL;
2422 uschar classbits[32];
2423
2424 #ifdef SUPPORT_UTF8
2425 BOOL class_utf8;
2426 BOOL utf8 = (options & PCRE_UTF8) != 0;
2427 uschar *class_utf8data;
2428 uschar *class_utf8data_base;
2429 uschar utf8_char[6];
2430 #else
2431 BOOL utf8 = FALSE;
2432 uschar *utf8_char = NULL;
2433 #endif
2434
2435 #ifdef DEBUG
2436 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2437 #endif
2438
2439 /* Set up the default and non-default settings for greediness */
2440
2441 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2442 greedy_non_default = greedy_default ^ 1;
2443
2444 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2445 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2446 matches a non-fixed char first char; reqbyte just remains unset if we never
2447 find one.
2448
2449 When we hit a repeat whose minimum is zero, we may have to adjust these values
2450 to take the zero repeat into account. This is implemented by setting them to
2451 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2452 item types that can be repeated set these backoff variables appropriately. */
2453
2454 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2455
2456 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2457 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2458 value > 255. It is added into the firstbyte or reqbyte variables to record the
2459 case status of the value. This is used only for ASCII characters. */
2460
2461 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2462
2463 /* Switch on next character until the end of the branch */
2464
2465 for (;; ptr++)
2466 {
2467 BOOL negate_class;
2468 BOOL should_flip_negation;
2469 BOOL possessive_quantifier;
2470 BOOL is_quantifier;
2471 BOOL is_recurse;
2472 BOOL reset_bracount;
2473 int class_charcount;
2474 int class_lastchar;
2475 int newoptions;
2476 int recno;
2477 int refsign;
2478 int skipbytes;
2479 int subreqbyte;
2480 int subfirstbyte;
2481 int terminator;
2482 int mclength;
2483 uschar mcbuffer[8];
2484
2485 /* Get next byte in the pattern */
2486
2487 c = *ptr;
2488
2489 /* If we are in the pre-compile phase, accumulate the length used for the
2490 previous cycle of this loop. */
2491
2492 if (lengthptr != NULL)
2493 {
2494 #ifdef DEBUG
2495 if (code > cd->hwm) cd->hwm = code; /* High water info */
2496 #endif
2497 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2498 {
2499 *errorcodeptr = ERR52;
2500 goto FAILED;
2501 }
2502
2503 /* There is at least one situation where code goes backwards: this is the
2504 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2505 the class is simply eliminated. However, it is created first, so we have to
2506 allow memory for it. Therefore, don't ever reduce the length at this point.
2507 */
2508
2509 if (code < last_code) code = last_code;
2510
2511 /* Paranoid check for integer overflow */
2512
2513 if (OFLOW_MAX - *lengthptr < code - last_code)
2514 {
2515 *errorcodeptr = ERR20;
2516 goto FAILED;
2517 }
2518
2519 *lengthptr += code - last_code;
2520 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2521
2522 /* If "previous" is set and it is not at the start of the work space, move
2523 it back to there, in order to avoid filling up the work space. Otherwise,
2524 if "previous" is NULL, reset the current code pointer to the start. */
2525
2526 if (previous != NULL)
2527 {
2528 if (previous > orig_code)
2529 {
2530 memmove(orig_code, previous, code - previous);
2531 code -= previous - orig_code;
2532 previous = orig_code;
2533 }
2534 }
2535 else code = orig_code;
2536
2537 /* Remember where this code item starts so we can pick up the length
2538 next time round. */
2539
2540 last_code = code;
2541 }
2542
2543 /* In the real compile phase, just check the workspace used by the forward
2544 reference list. */
2545
2546 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2547 {
2548 *errorcodeptr = ERR52;
2549 goto FAILED;
2550 }
2551
2552 /* If in \Q...\E, check for the end; if not, we have a literal */
2553
2554 if (inescq && c != 0)
2555 {
2556 if (c == '\\' && ptr[1] == 'E')
2557 {
2558 inescq = FALSE;
2559 ptr++;
2560 continue;
2561 }
2562 else
2563 {
2564 if (previous_callout != NULL)
2565 {
2566 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2567 complete_callout(previous_callout, ptr, cd);
2568 previous_callout = NULL;
2569 }
2570 if ((options & PCRE_AUTO_CALLOUT) != 0)
2571 {
2572 previous_callout = code;
2573 code = auto_callout(code, ptr, cd);
2574 }
2575 goto NORMAL_CHAR;
2576 }
2577 }
2578
2579 /* Fill in length of a previous callout, except when the next thing is
2580 a quantifier. */
2581
2582 is_quantifier = c == '*' || c == '+' || c == '?' ||
2583 (c == '{' && is_counted_repeat(ptr+1));
2584
2585 if (!is_quantifier && previous_callout != NULL &&
2586 after_manual_callout-- <= 0)
2587 {
2588 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2589 complete_callout(previous_callout, ptr, cd);
2590 previous_callout = NULL;
2591 }
2592
2593 /* In extended mode, skip white space and comments */
2594
2595 if ((options & PCRE_EXTENDED) != 0)
2596 {
2597 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2598 if (c == '#')
2599 {
2600 while (*(++ptr) != 0)
2601 {
2602 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2603 }
2604 if (*ptr != 0) continue;
2605
2606 /* Else fall through to handle end of string */
2607 c = 0;
2608 }
2609 }
2610
2611 /* No auto callout for quantifiers. */
2612
2613 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2614 {
2615 previous_callout = code;
2616 code = auto_callout(code, ptr, cd);
2617 }
2618
2619 switch(c)
2620 {
2621 /* ===================================================================*/
2622 case 0: /* The branch terminates at string end */
2623 case '|': /* or | or ) */
2624 case ')':
2625 *firstbyteptr = firstbyte;
2626 *reqbyteptr = reqbyte;
2627 *codeptr = code;
2628 *ptrptr = ptr;
2629 if (lengthptr != NULL)
2630 {
2631 if (OFLOW_MAX - *lengthptr < code - last_code)
2632 {
2633 *errorcodeptr = ERR20;
2634 goto FAILED;
2635 }
2636 *lengthptr += code - last_code; /* To include callout length */
2637 DPRINTF((">> end branch\n"));
2638 }
2639 return TRUE;
2640
2641
2642 /* ===================================================================*/
2643 /* Handle single-character metacharacters. In multiline mode, ^ disables
2644 the setting of any following char as a first character. */
2645
2646 case '^':
2647 if ((options & PCRE_MULTILINE) != 0)
2648 {
2649 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2650 }
2651 previous = NULL;
2652 *code++ = OP_CIRC;
2653 break;
2654
2655 case '$':
2656 previous = NULL;
2657 *code++ = OP_DOLL;
2658 break;
2659
2660 /* There can never be a first char if '.' is first, whatever happens about
2661 repeats. The value of reqbyte doesn't change either. */
2662
2663 case '.':
2664 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2665 zerofirstbyte = firstbyte;
2666 zeroreqbyte = reqbyte;
2667 previous = code;
2668 *code++ = OP_ANY;
2669 break;
2670
2671
2672 /* ===================================================================*/
2673 /* Character classes. If the included characters are all < 256, we build a
2674 32-byte bitmap of the permitted characters, except in the special case
2675 where there is only one such character. For negated classes, we build the
2676 map as usual, then invert it at the end. However, we use a different opcode
2677 so that data characters > 255 can be handled correctly.
2678
2679 If the class contains characters outside the 0-255 range, a different
2680 opcode is compiled. It may optionally have a bit map for characters < 256,
2681 but those above are explicitly listed afterwards. A flag byte tells
2682 whether the bitmap is present, and whether this is a negated class or not.
2683
2684 In JavaScript compatibility mode, an isolated ']' causes an error. In
2685 default (Perl) mode, it is treated as a data character. */
2686
2687 case ']':
2688 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2689 {
2690 *errorcodeptr = ERR64;
2691 goto FAILED;
2692 }
2693 goto NORMAL_CHAR;
2694
2695 case '[':
2696 previous = code;
2697
2698 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2699 they are encountered at the top level, so we'll do that too. */
2700
2701 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2702 check_posix_syntax(ptr, &tempptr))
2703 {
2704 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2705 goto FAILED;
2706 }
2707
2708 /* If the first character is '^', set the negation flag and skip it. Also,
2709 if the first few characters (either before or after ^) are \Q\E or \E we
2710 skip them too. This makes for compatibility with Perl. */
2711
2712 negate_class = FALSE;
2713 for (;;)
2714 {
2715 c = *(++ptr);
2716 if (c == '\\')
2717 {
2718 if (ptr[1] == 'E') ptr++;
2719 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2720 else break;
2721 }
2722 else if (!negate_class && c == '^')
2723 negate_class = TRUE;
2724 else break;
2725 }
2726
2727 /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2728 an initial ']' is taken as a data character -- the code below handles
2729 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2730 [^] must match any character, so generate OP_ALLANY. */
2731
2732 if (c ==']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2733 {
2734 *code++ = negate_class? OP_ALLANY : OP_FAIL;
2735 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2736 zerofirstbyte = firstbyte;
2737 break;
2738 }
2739
2740 /* If a class contains a negative special such as \S, we need to flip the
2741 negation flag at the end, so that support for characters > 255 works
2742 correctly (they are all included in the class). */
2743
2744 should_flip_negation = FALSE;
2745
2746 /* Keep a count of chars with values < 256 so that we can optimize the case
2747 of just a single character (as long as it's < 256). However, for higher
2748 valued UTF-8 characters, we don't yet do any optimization. */
2749
2750 class_charcount = 0;
2751 class_lastchar = -1;
2752
2753 /* Initialize the 32-char bit map to all zeros. We build the map in a
2754 temporary bit of memory, in case the class contains only 1 character (less
2755 than 256), because in that case the compiled code doesn't use the bit map.
2756 */
2757
2758 memset(classbits, 0, 32 * sizeof(uschar));
2759
2760 #ifdef SUPPORT_UTF8
2761 class_utf8 = FALSE; /* No chars >= 256 */
2762 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2763 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
2764 #endif
2765
2766 /* Process characters until ] is reached. By writing this as a "do" it
2767 means that an initial ] is taken as a data character. At the start of the
2768 loop, c contains the first byte of the character. */
2769
2770 if (c != 0) do
2771 {
2772 const uschar *oldptr;
2773
2774 #ifdef SUPPORT_UTF8
2775 if (utf8 && c > 127)
2776 { /* Braces are required because the */
2777 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2778 }
2779
2780 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2781 data and reset the pointer. This is so that very large classes that
2782 contain a zillion UTF-8 characters no longer overwrite the work space
2783 (which is on the stack). */
2784
2785 if (lengthptr != NULL)
2786 {
2787 *lengthptr += class_utf8data - class_utf8data_base;
2788 class_utf8data = class_utf8data_base;
2789 }
2790
2791 #endif
2792
2793 /* Inside \Q...\E everything is literal except \E */
2794
2795 if (inescq)
2796 {
2797 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2798 {
2799 inescq = FALSE; /* Reset literal state */
2800 ptr++; /* Skip the 'E' */
2801 continue; /* Carry on with next */
2802 }
2803 goto CHECK_RANGE; /* Could be range if \E follows */
2804 }
2805
2806 /* Handle POSIX class names. Perl allows a negation extension of the
2807 form [:^name:]. A square bracket that doesn't match the syntax is
2808 treated as a literal. We also recognize the POSIX constructions
2809 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2810 5.6 and 5.8 do. */
2811
2812 if (c == '[' &&
2813 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2814 check_posix_syntax(ptr, &tempptr))
2815 {
2816 BOOL local_negate = FALSE;
2817 int posix_class, taboffset, tabopt;
2818 register const uschar *cbits = cd->cbits;
2819 uschar pbits[32];
2820
2821 if (ptr[1] != ':')
2822 {
2823 *errorcodeptr = ERR31;
2824 goto FAILED;
2825 }
2826
2827 ptr += 2;
2828 if (*ptr == '^')
2829 {
2830 local_negate = TRUE;
2831 should_flip_negation = TRUE; /* Note negative special */
2832 ptr++;
2833 }
2834
2835 posix_class = check_posix_name(ptr, tempptr - ptr);
2836 if (posix_class < 0)
2837 {
2838 *errorcodeptr = ERR30;
2839 goto FAILED;
2840 }
2841
2842 /* If matching is caseless, upper and lower are converted to
2843 alpha. This relies on the fact that the class table starts with
2844 alpha, lower, upper as the first 3 entries. */
2845
2846 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2847 posix_class = 0;
2848
2849 /* We build the bit map for the POSIX class in a chunk of local store
2850 because we may be adding and subtracting from it, and we don't want to
2851 subtract bits that may be in the main map already. At the end we or the
2852 result into the bit map that is being built. */
2853
2854 posix_class *= 3;
2855
2856 /* Copy in the first table (always present) */
2857
2858 memcpy(pbits, cbits + posix_class_maps[posix_class],
2859 32 * sizeof(uschar));
2860
2861 /* If there is a second table, add or remove it as required. */
2862
2863 taboffset = posix_class_maps[posix_class + 1];
2864 tabopt = posix_class_maps[posix_class + 2];
2865
2866 if (taboffset >= 0)
2867 {
2868 if (tabopt >= 0)
2869 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2870 else
2871 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2872 }
2873
2874 /* Now see if we need to remove any special characters. An option
2875 value of 1 removes vertical space and 2 removes underscore. */
2876
2877 if (tabopt < 0) tabopt = -tabopt;
2878 if (tabopt == 1) pbits[1] &= ~0x3c;
2879 else if (tabopt == 2) pbits[11] &= 0x7f;
2880
2881 /* Add the POSIX table or its complement into the main table that is
2882 being built and we are done. */
2883
2884 if (local_negate)
2885 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2886 else
2887 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2888
2889 ptr = tempptr + 1;
2890 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2891 continue; /* End of POSIX syntax handling */
2892 }
2893
2894 /* Backslash may introduce a single character, or it may introduce one
2895 of the specials, which just set a flag. The sequence \b is a special
2896 case. Inside a class (and only there) it is treated as backspace.
2897 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2898 to 'or' into the one we are building. We assume they have more than one
2899 character in them, so set class_charcount bigger than one. */
2900
2901 if (c == '\\')
2902 {
2903 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2904 if (*errorcodeptr != 0) goto FAILED;
2905
2906 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2907 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2908 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2909 else if (-c == ESC_Q) /* Handle start of quoted string */
2910 {
2911 if (ptr[1] == '\\' && ptr[2] == 'E')
2912 {
2913 ptr += 2; /* avoid empty string */
2914 }
2915 else inescq = TRUE;
2916 continue;
2917 }
2918 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2919
2920 if (c < 0)
2921 {
2922 register const uschar *cbits = cd->cbits;
2923 class_charcount += 2; /* Greater than 1 is what matters */
2924
2925 /* Save time by not doing this in the pre-compile phase. */
2926
2927 if (lengthptr == NULL) switch (-c)
2928 {
2929 case ESC_d:
2930 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2931 continue;
2932
2933 case ESC_D:
2934 should_flip_negation = TRUE;
2935 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2936 continue;
2937
2938 case ESC_w:
2939 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2940 continue;
2941
2942 case ESC_W:
2943 should_flip_negation = TRUE;
2944 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2945 continue;
2946
2947 case ESC_s:
2948 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2949 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2950 continue;
2951
2952 case ESC_S:
2953 should_flip_negation = TRUE;
2954 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2955 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2956 continue;
2957
2958 default: /* Not recognized; fall through */
2959 break; /* Need "default" setting to stop compiler warning. */
2960 }
2961
2962 /* In the pre-compile phase, just do the recognition. */
2963
2964 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2965 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2966
2967 /* We need to deal with \H, \h, \V, and \v in both phases because
2968 they use extra memory. */
2969
2970 if (-c == ESC_h)
2971 {
2972 SETBIT(classbits, 0x09); /* VT */
2973 SETBIT(classbits, 0x20); /* SPACE */
2974 SETBIT(classbits, 0xa0); /* NSBP */
2975 #ifdef SUPPORT_UTF8
2976 if (utf8)
2977 {
2978 class_utf8 = TRUE;
2979 *class_utf8data++ = XCL_SINGLE;
2980 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2981 *class_utf8data++ = XCL_SINGLE;
2982 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2983 *class_utf8data++ = XCL_RANGE;
2984 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2985 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2986 *class_utf8data++ = XCL_SINGLE;
2987 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2988 *class_utf8data++ = XCL_SINGLE;
2989 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2990 *class_utf8data++ = XCL_SINGLE;
2991 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2992 }
2993 #endif
2994 continue;
2995 }
2996
2997 if (-c == ESC_H)
2998 {
2999 for (c = 0; c < 32; c++)
3000 {
3001 int x = 0xff;
3002 switch (c)
3003 {
3004 case 0x09/8: x ^= 1 << (0x09%8); break;
3005 case 0x20/8: x ^= 1 << (0x20%8); break;
3006 case 0xa0/8: x ^= 1 << (0xa0%8); break;
3007 default: break;
3008 }
3009 classbits[c] |= x;
3010 }
3011
3012 #ifdef SUPPORT_UTF8
3013 if (utf8)
3014 {
3015 class_utf8 = TRUE;
3016 *class_utf8data++ = XCL_RANGE;
3017 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3018 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3019 *class_utf8data++ = XCL_RANGE;
3020 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3021 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3022 *class_utf8data++ = XCL_RANGE;
3023 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3024 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3025 *class_utf8data++ = XCL_RANGE;
3026 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3027 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3028 *class_utf8data++ = XCL_RANGE;
3029 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3030 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3031 *class_utf8data++ = XCL_RANGE;
3032 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3033 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3034 *class_utf8data++ = XCL_RANGE;
3035 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3036 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3037 }
3038 #endif
3039 continue;
3040 }
3041
3042 if (-c == ESC_v)
3043 {
3044 SETBIT(classbits, 0x0a); /* LF */
3045 SETBIT(classbits, 0x0b); /* VT */
3046 SETBIT(classbits, 0x0c); /* FF */
3047 SETBIT(classbits, 0x0d); /* CR */
3048 SETBIT(classbits, 0x85); /* NEL */
3049 #ifdef SUPPORT_UTF8
3050 if (utf8)
3051 {
3052 class_utf8 = TRUE;
3053 *class_utf8data++ = XCL_RANGE;
3054 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3055 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3056 }
3057 #endif
3058 continue;
3059 }
3060
3061 if (-c == ESC_V)
3062 {
3063 for (c = 0; c < 32; c++)
3064 {
3065 int x = 0xff;
3066 switch (c)
3067 {
3068 case 0x0a/8: x ^= 1 << (0x0a%8);
3069 x ^= 1 << (0x0b%8);
3070 x ^= 1 << (0x0c%8);
3071 x ^= 1 << (0x0d%8);
3072 break;
3073 case 0x85/8: x ^= 1 << (0x85%8); break;
3074 default: break;
3075 }
3076 classbits[c] |= x;
3077 }
3078
3079 #ifdef SUPPORT_UTF8
3080 if (utf8)
3081 {
3082 class_utf8 = TRUE;
3083 *class_utf8data++ = XCL_RANGE;
3084 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3085 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3086 *class_utf8data++ = XCL_RANGE;
3087 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3088 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3089 }
3090 #endif
3091 continue;
3092 }
3093
3094 /* We need to deal with \P and \p in both phases. */
3095
3096 #ifdef SUPPORT_UCP
3097 if (-c == ESC_p || -c == ESC_P)
3098 {
3099 BOOL negated;
3100 int pdata;
3101 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3102 if (ptype < 0) goto FAILED;
3103 class_utf8 = TRUE;
3104 *class_utf8data++ = ((-c == ESC_p) != negated)?
3105 XCL_PROP : XCL_NOTPROP;
3106 *class_utf8data++ = ptype;
3107 *class_utf8data++ = pdata;
3108 class_charcount -= 2; /* Not a < 256 character */
3109 continue;
3110 }
3111 #endif
3112 /* Unrecognized escapes are faulted if PCRE is running in its
3113 strict mode. By default, for compatibility with Perl, they are
3114 treated as literals. */
3115
3116 if ((options & PCRE_EXTRA) != 0)
3117 {
3118 *errorcodeptr = ERR7;
3119 goto FAILED;
3120 }
3121
3122 class_charcount -= 2; /* Undo the default count from above */
3123 c = *ptr; /* Get the final character and fall through */
3124 }
3125
3126 /* Fall through if we have a single character (c >= 0). This may be
3127 greater than 256 in UTF-8 mode. */
3128
3129 } /* End of backslash handling */
3130
3131 /* A single character may be followed by '-' to form a range. However,
3132 Perl does not permit ']' to be the end of the range. A '-' character
3133 at the end is treated as a literal. Perl ignores orphaned \E sequences
3134 entirely. The code for handling \Q and \E is messy. */
3135
3136 CHECK_RANGE:
3137 while (ptr[1] == '\\' && ptr[2] == 'E')
3138 {
3139 inescq = FALSE;
3140 ptr += 2;
3141 }
3142
3143 oldptr = ptr;
3144
3145 /* Remember \r or \n */
3146
3147 if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3148
3149 /* Check for range */
3150
3151 if (!inescq && ptr[1] == '-')
3152 {
3153 int d;
3154 ptr += 2;
3155 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3156
3157 /* If we hit \Q (not followed by \E) at this point, go into escaped
3158 mode. */
3159
3160 while (*ptr == '\\' && ptr[1] == 'Q')
3161 {
3162 ptr += 2;
3163 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3164 inescq = TRUE;
3165 break;
3166 }
3167
3168 if (*ptr == 0 || (!inescq && *ptr == ']'))
3169 {
3170 ptr = oldptr;
3171 goto LONE_SINGLE_CHARACTER;
3172 }
3173
3174 #ifdef SUPPORT_UTF8
3175 if (utf8)
3176 { /* Braces are required because the */
3177 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3178 }
3179 else
3180 #endif
3181 d = *ptr; /* Not UTF-8 mode */
3182
3183 /* The second part of a range can be a single-character escape, but
3184 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3185 in such circumstances. */
3186
3187 if (!inescq && d == '\\')
3188 {
3189 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3190 if (*errorcodeptr != 0) goto FAILED;
3191
3192 /* \b is backspace; \X is literal X; \R is literal R; any other
3193 special means the '-' was literal */
3194
3195 if (d < 0)
3196 {
3197 if (d == -ESC_b) d = '\b';
3198 else if (d == -ESC_X) d = 'X';
3199 else if (d == -ESC_R) d = 'R'; else
3200 {
3201 ptr = oldptr;
3202 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3203 }
3204 }
3205 }
3206
3207 /* Check that the two values are in the correct order. Optimize
3208 one-character ranges */
3209
3210 if (d < c)
3211 {
3212 *errorcodeptr = ERR8;
3213 goto FAILED;
3214 }
3215
3216 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3217
3218 /* Remember \r or \n */
3219
3220 if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3221
3222 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3223 matching, we have to use an XCLASS with extra data items. Caseless
3224 matching for characters > 127 is available only if UCP support is
3225 available. */
3226
3227 #ifdef SUPPORT_UTF8
3228 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3229 {
3230 class_utf8 = TRUE;
3231
3232 /* With UCP support, we can find the other case equivalents of
3233 the relevant characters. There may be several ranges. Optimize how
3234 they fit with the basic range. */
3235
3236 #ifdef SUPPORT_UCP
3237 if ((options & PCRE_CASELESS) != 0)
3238 {
3239 unsigned int occ, ocd;
3240 unsigned int cc = c;
3241 unsigned int origd = d;
3242 while (get_othercase_range(&cc, origd, &occ, &ocd))
3243 {
3244 if (occ >= (unsigned int)c &&
3245 ocd <= (unsigned int)d)
3246 continue; /* Skip embedded ranges */
3247
3248 if (occ < (unsigned int)c &&
3249 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3250 { /* if there is overlap, */
3251 c = occ; /* noting that if occ < c */
3252 continue; /* we can't have ocd > d */
3253 } /* because a subrange is */
3254 if (ocd > (unsigned int)d &&
3255 occ <= (unsigned int)d + 1) /* always shorter than */
3256 { /* the basic range. */
3257 d = ocd;
3258 continue;
3259 }
3260
3261 if (occ == ocd)
3262 {
3263 *class_utf8data++ = XCL_SINGLE;
3264 }
3265 else
3266 {
3267 *class_utf8data++ = XCL_RANGE;
3268 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3269 }
3270 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3271 }
3272 }
3273 #endif /* SUPPORT_UCP */
3274
3275 /* Now record the original range, possibly modified for UCP caseless
3276 overlapping ranges. */
3277
3278 *class_utf8data++ = XCL_RANGE;
3279 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3280 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3281
3282 /* With UCP support, we are done. Without UCP support, there is no
3283 caseless matching for UTF-8 characters > 127; we can use the bit map
3284 for the smaller ones. */
3285
3286 #ifdef SUPPORT_UCP
3287 continue; /* With next character in the class */
3288 #else
3289 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3290
3291 /* Adjust upper limit and fall through to set up the map */
3292
3293 d = 127;
3294
3295 #endif /* SUPPORT_UCP */
3296 }
3297 #endif /* SUPPORT_UTF8 */
3298
3299 /* We use the bit map for all cases when not in UTF-8 mode; else
3300 ranges that lie entirely within 0-127 when there is UCP support; else
3301 for partial ranges without UCP support. */
3302
3303 class_charcount += d - c + 1;
3304 class_lastchar = d;
3305
3306 /* We can save a bit of time by skipping this in the pre-compile. */
3307
3308 if (lengthptr == NULL) for (; c <= d; c++)
3309 {
3310 classbits[c/8] |= (1 << (c&7));
3311 if ((options & PCRE_CASELESS) != 0)
3312 {
3313 int uc = cd->fcc[c]; /* flip case */
3314 classbits[uc/8] |= (1 << (uc&7));
3315 }
3316 }
3317
3318 continue; /* Go get the next char in the class */
3319 }
3320
3321 /* Handle a lone single character - we can get here for a normal
3322 non-escape char, or after \ that introduces a single character or for an
3323 apparent range that isn't. */
3324
3325 LONE_SINGLE_CHARACTER:
3326
3327 /* Handle a character that cannot go in the bit map */
3328
3329 #ifdef SUPPORT_UTF8
3330 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3331 {
3332 class_utf8 = TRUE;
3333 *class_utf8data++ = XCL_SINGLE;
3334 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3335
3336 #ifdef SUPPORT_UCP
3337 if ((options & PCRE_CASELESS) != 0)
3338 {
3339 unsigned int othercase;
3340 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3341 {
3342 *class_utf8data++ = XCL_SINGLE;
3343 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3344 }
3345 }
3346 #endif /* SUPPORT_UCP */
3347
3348 }
3349 else
3350 #endif /* SUPPORT_UTF8 */
3351
3352 /* Handle a single-byte character */
3353 {
3354 classbits[c/8] |= (1 << (c&7));
3355 if ((options & PCRE_CASELESS) != 0)
3356 {
3357 c = cd->fcc[c]; /* flip case */
3358 classbits[c/8] |= (1 << (c&7));
3359 }
3360 class_charcount++;
3361 class_lastchar = c;
3362 }
3363 }
3364
3365 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3366
3367 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3368
3369 if (c == 0) /* Missing terminating ']' */
3370 {
3371 *errorcodeptr = ERR6;
3372 goto FAILED;
3373 }
3374
3375
3376 /* This code has been disabled because it would mean that \s counts as
3377 an explicit \r or \n reference, and that's not really what is wanted. Now
3378 we set the flag only if there is a literal "\r" or "\n" in the class. */
3379
3380 #if 0
3381 /* Remember whether \r or \n are in this class */
3382
3383 if (negate_class)
3384 {
3385 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3386 }
3387 else
3388 {
3389 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3390 }
3391 #endif
3392
3393
3394 /* If class_charcount is 1, we saw precisely one character whose value is
3395 less than 256. As long as there were no characters >= 128 and there was no
3396 use of \p or \P, in other words, no use of any XCLASS features, we can
3397 optimize.
3398
3399 In UTF-8 mode, we can optimize the negative case only if there were no
3400 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3401 operate on single-bytes only. This is an historical hangover. Maybe one day
3402 we can tidy these opcodes to handle multi-byte characters.
3403
3404 The optimization throws away the bit map. We turn the item into a
3405 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3406 that OP_NOT does not support multibyte characters. In the positive case, it
3407 can cause firstbyte to be set. Otherwise, there can be no first char if
3408 this item is first, whatever repeat count may follow. In the case of
3409 reqbyte, save the previous value for reinstating. */
3410
3411 #ifdef SUPPORT_UTF8
3412 if (class_charcount == 1 && !class_utf8 &&
3413 (!utf8 || !negate_class || class_lastchar < 128))
3414 #else
3415 if (class_charcount == 1)
3416 #endif
3417 {
3418 zeroreqbyte = reqbyte;
3419
3420 /* The OP_NOT opcode works on one-byte characters only. */
3421
3422 if (negate_class)
3423 {
3424 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3425 zerofirstbyte = firstbyte;
3426 *code++ = OP_NOT;
3427 *code++ = class_lastchar;
3428 break;
3429 }
3430
3431 /* For a single, positive character, get the value into mcbuffer, and
3432 then we can handle this with the normal one-character code. */
3433
3434 #ifdef SUPPORT_UTF8
3435 if (utf8 && class_lastchar > 127)
3436 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3437 else
3438 #endif
3439 {
3440 mcbuffer[0] = class_lastchar;
3441 mclength = 1;
3442 }
3443 goto ONE_CHAR;
3444 } /* End of 1-char optimization */
3445
3446 /* The general case - not the one-char optimization. If this is the first
3447 thing in the branch, there can be no first char setting, whatever the
3448 repeat count. Any reqbyte setting must remain unchanged after any kind of
3449 repeat. */
3450
3451 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3452 zerofirstbyte = firstbyte;
3453 zeroreqbyte = reqbyte;
3454
3455 /* If there are characters with values > 255, we have to compile an
3456 extended class, with its own opcode, unless there was a negated special
3457 such as \S in the class, because in that case all characters > 255 are in
3458 the class, so any that were explicitly given as well can be ignored. If
3459 (when there are explicit characters > 255 that must be listed) there are no
3460 characters < 256, we can omit the bitmap in the actual compiled code. */
3461
3462 #ifdef SUPPORT_UTF8
3463 if (class_utf8 && !should_flip_negation)
3464 {
3465 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3466 *code++ = OP_XCLASS;
3467 code += LINK_SIZE;
3468 *code = negate_class? XCL_NOT : 0;
3469
3470 /* If the map is required, move up the extra data to make room for it;
3471 otherwise just move the code pointer to the end of the extra data. */
3472
3473 if (class_charcount > 0)
3474 {
3475 *code++ |= XCL_MAP;
3476 memmove(code + 32, code, class_utf8data - code);
3477 memcpy(code, classbits, 32);
3478 code = class_utf8data + 32;
3479 }
3480 else code = class_utf8data;
3481
3482 /* Now fill in the complete length of the item */
3483
3484 PUT(previous, 1, code - previous);
3485 break; /* End of class handling */
3486 }
3487 #endif
3488
3489 /* If there are no characters > 255, set the opcode to OP_CLASS or
3490 OP_NCLASS, depending on whether the whole class was negated and whether
3491 there were negative specials such as \S in the class. Then copy the 32-byte
3492 map into the code vector, negating it if necessary. */
3493
3494 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3495 if (negate_class)
3496 {
3497 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3498 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3499 }
3500 else
3501 {
3502 memcpy(code, classbits, 32);
3503 }
3504 code += 32;
3505 break;
3506
3507
3508 /* ===================================================================*/
3509 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3510 has been tested above. */
3511
3512 case '{':
3513 if (!is_quantifier) goto NORMAL_CHAR;
3514 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3515 if (*errorcodeptr != 0) goto FAILED;
3516 goto REPEAT;
3517
3518 case '*':
3519 repeat_min = 0;
3520 repeat_max = -1;
3521 goto REPEAT;
3522
3523 case '+':
3524 repeat_min = 1;
3525 repeat_max = -1;
3526 goto REPEAT;
3527
3528 case '?':
3529 repeat_min = 0;
3530 repeat_max = 1;
3531
3532 REPEAT:
3533 if (previous == NULL)
3534 {
3535 *errorcodeptr = ERR9;
3536 goto FAILED;
3537 }
3538
3539 if (repeat_min == 0)
3540 {
3541 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3542 reqbyte = zeroreqbyte; /* Ditto */
3543 }
3544
3545 /* Remember whether this is a variable length repeat */
3546
3547 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3548
3549 op_type = 0; /* Default single-char op codes */
3550 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3551
3552 /* Save start of previous item, in case we have to move it up to make space
3553 for an inserted OP_ONCE for the additional '+' extension. */
3554
3555 tempcode = previous;
3556
3557 /* If the next character is '+', we have a possessive quantifier. This
3558 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3559 If the next character is '?' this is a minimizing repeat, by default,
3560 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3561 repeat type to the non-default. */
3562
3563 if (ptr[1] == '+')
3564 {
3565 repeat_type = 0; /* Force greedy */
3566 possessive_quantifier = TRUE;
3567 ptr++;
3568 }
3569 else if (ptr[1] == '?')
3570 {
3571 repeat_type = greedy_non_default;
3572 ptr++;
3573 }
3574 else repeat_type = greedy_default;
3575
3576 /* If previous was a character match, abolish the item and generate a
3577 repeat item instead. If a char item has a minimum of more than one, ensure
3578 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3579 the first thing in a branch because the x will have gone into firstbyte
3580 instead. */
3581
3582 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3583 {
3584 /* Deal with UTF-8 characters that take up more than one byte. It's
3585 easier to write this out separately than try to macrify it. Use c to
3586 hold the length of the character in bytes, plus 0x80 to flag that it's a
3587 length rather than a small character. */
3588
3589 #ifdef SUPPORT_UTF8
3590 if (utf8 && (code[-1] & 0x80) != 0)
3591 {
3592 uschar *lastchar = code - 1;
3593 while((*lastchar & 0xc0) == 0x80) lastchar--;
3594 c = code - lastchar; /* Length of UTF-8 character */
3595 memcpy(utf8_char, lastchar, c); /* Save the char */
3596 c |= 0x80; /* Flag c as a length */
3597 }
3598 else
3599 #endif
3600
3601 /* Handle the case of a single byte - either with no UTF8 support, or
3602 with UTF-8 disabled, or for a UTF-8 character < 128. */
3603
3604 {
3605 c = code[-1];
3606 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3607 }
3608
3609 /* If the repetition is unlimited, it pays to see if the next thing on
3610 the line is something that cannot possibly match this character. If so,
3611 automatically possessifying this item gains some performance in the case
3612 where the match fails. */
3613
3614 if (!possessive_quantifier &&
3615 repeat_max < 0 &&
3616 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3617 options, cd))
3618 {
3619 repeat_type = 0; /* Force greedy */
3620 possessive_quantifier = TRUE;
3621 }
3622
3623 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3624 }
3625
3626 /* If previous was a single negated character ([^a] or similar), we use
3627 one of the special opcodes, replacing it. The code is shared with single-
3628 character repeats by setting op_type to add a suitable offset into
3629 repeat_type. We can also test for auto-possessification. OP_NOT is
3630 currently used only for single-byte chars. */
3631
3632 else if (*previous == OP_NOT)
3633 {
3634 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3635 c = previous[1];
3636 if (!possessive_quantifier &&
3637 repeat_max < 0 &&
3638 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3639 {
3640 repeat_type = 0; /* Force greedy */
3641 possessive_quantifier = TRUE;
3642 }
3643 goto OUTPUT_SINGLE_REPEAT;
3644 }
3645
3646 /* If previous was a character type match (\d or similar), abolish it and
3647 create a suitable repeat item. The code is shared with single-character
3648 repeats by setting op_type to add a suitable offset into repeat_type. Note
3649 that the Unicode property types will be present only when SUPPORT_UCP is
3650 defined, but we don't wrap the little bits of code here because it just
3651 makes it horribly messy. */
3652
3653 else if (*previous < OP_EODN)
3654 {
3655 uschar *oldcode;
3656 int prop_type, prop_value;
3657 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3658 c = *previous;
3659
3660 if (!possessive_quantifier &&
3661 repeat_max < 0 &&
3662 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3663 {
3664 repeat_type = 0; /* Force greedy */
3665 possessive_quantifier = TRUE;
3666 }
3667
3668 OUTPUT_SINGLE_REPEAT:
3669 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3670 {
3671 prop_type = previous[1];
3672 prop_value = previous[2];
3673 }
3674 else prop_type = prop_value = -1;
3675
3676 oldcode = code;
3677 code = previous; /* Usually overwrite previous item */
3678
3679 /* If the maximum is zero then the minimum must also be zero; Perl allows
3680 this case, so we do too - by simply omitting the item altogether. */
3681
3682 if (repeat_max == 0) goto END_REPEAT;
3683
3684 /* All real repeats make it impossible to handle partial matching (maybe
3685 one day we will be able to remove this restriction). */
3686
3687 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3688
3689 /* Combine the op_type with the repeat_type */
3690
3691 repeat_type += op_type;
3692
3693 /* A minimum of zero is handled either as the special case * or ?, or as
3694 an UPTO, with the maximum given. */
3695
3696 if (repeat_min == 0)
3697 {
3698 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3699 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3700 else
3701 {
3702 *code++ = OP_UPTO + repeat_type;
3703 PUT2INC(code, 0, repeat_max);
3704 }
3705 }
3706
3707 /* A repeat minimum of 1 is optimized into some special cases. If the
3708 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3709 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3710 one less than the maximum. */
3711
3712 else if (repeat_min == 1)
3713 {
3714 if (repeat_max == -1)
3715 *code++ = OP_PLUS + repeat_type;
3716 else
3717 {
3718 code = oldcode; /* leave previous item in place */
3719 if (repeat_max == 1) goto END_REPEAT;
3720 *code++ = OP_UPTO + repeat_type;
3721 PUT2INC(code, 0, repeat_max - 1);
3722 }
3723 }
3724
3725 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3726 handled as an EXACT followed by an UPTO. */
3727
3728 else
3729 {
3730 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3731 PUT2INC(code, 0, repeat_min);
3732
3733 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3734 we have to insert the character for the previous code. For a repeated
3735 Unicode property match, there are two extra bytes that define the
3736 required property. In UTF-8 mode, long characters have their length in
3737 c, with the 0x80 bit as a flag. */
3738
3739 if (repeat_max < 0)
3740 {
3741 #ifdef SUPPORT_UTF8
3742 if (utf8 && c >= 128)
3743 {
3744 memcpy(code, utf8_char, c & 7);
3745 code += c & 7;
3746 }
3747 else
3748 #endif
3749 {
3750 *code++ = c;
3751 if (prop_type >= 0)
3752 {
3753 *code++ = prop_type;
3754 *code++ = prop_value;
3755 }
3756 }
3757 *code++ = OP_STAR + repeat_type;
3758 }
3759
3760 /* Else insert an UPTO if the max is greater than the min, again
3761 preceded by the character, for the previously inserted code. If the
3762 UPTO is just for 1 instance, we can use QUERY instead. */
3763
3764 else if (repeat_max != repeat_min)
3765 {
3766 #ifdef SUPPORT_UTF8
3767 if (utf8 && c >= 128)
3768 {
3769 memcpy(code, utf8_char, c & 7);
3770 code += c & 7;
3771 }
3772 else
3773 #endif
3774 *code++ = c;
3775 if (prop_type >= 0)
3776 {
3777 *code++ = prop_type;
3778 *code++ = prop_value;
3779 }
3780 repeat_max -= repeat_min;
3781
3782 if (repeat_max == 1)
3783 {
3784 *code++ = OP_QUERY + repeat_type;
3785 }
3786 else
3787 {
3788 *code++ = OP_UPTO + repeat_type;
3789 PUT2INC(code, 0, repeat_max);
3790 }
3791 }
3792 }
3793
3794 /* The character or character type itself comes last in all cases. */
3795
3796 #ifdef SUPPORT_UTF8
3797 if (utf8 && c >= 128)
3798 {
3799 memcpy(code, utf8_char, c & 7);
3800 code += c & 7;
3801 }
3802 else
3803 #endif
3804 *code++ = c;
3805
3806 /* For a repeated Unicode property match, there are two extra bytes that
3807 define the required property. */
3808
3809 #ifdef SUPPORT_UCP
3810 if (prop_type >= 0)
3811 {
3812 *code++ = prop_type;
3813 *code++ = prop_value;
3814 }
3815 #endif
3816 }
3817
3818 /* If previous was a character class or a back reference, we put the repeat
3819 stuff after it, but just skip the item if the repeat was {0,0}. */
3820
3821 else if (*previous == OP_CLASS ||
3822 *previous == OP_NCLASS ||
3823 #ifdef SUPPORT_UTF8
3824 *previous == OP_XCLASS ||
3825 #endif
3826 *previous == OP_REF)
3827 {
3828 if (repeat_max == 0)
3829 {
3830 code = previous;
3831 goto END_REPEAT;
3832 }
3833
3834 /* All real repeats make it impossible to handle partial matching (maybe
3835 one day we will be able to remove this restriction). */
3836
3837 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3838
3839 if (repeat_min == 0 && repeat_max == -1)
3840 *code++ = OP_CRSTAR + repeat_type;
3841 else if (repeat_min == 1 && repeat_max == -1)
3842 *code++ = OP_CRPLUS + repeat_type;
3843 else if (repeat_min == 0 && repeat_max == 1)
3844 *code++ = OP_CRQUERY + repeat_type;
3845 else
3846 {
3847 *code++ = OP_CRRANGE + repeat_type;
3848 PUT2INC(code, 0, repeat_min);
3849 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3850 PUT2INC(code, 0, repeat_max);
3851 }
3852 }
3853
3854 /* If previous was a bracket group, we may have to replicate it in certain
3855 cases. */
3856
3857 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3858 *previous == OP_ONCE || *previous == OP_COND)
3859 {
3860 register int i;
3861 int ketoffset = 0;
3862 int len = code - previous;
3863 uschar *bralink = NULL;
3864
3865 /* Repeating a DEFINE group is pointless */
3866
3867 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3868 {
3869 *errorcodeptr = ERR55;
3870 goto FAILED;
3871 }
3872
3873 /* If the maximum repeat count is unlimited, find the end of the bracket
3874 by scanning through from the start, and compute the offset back to it
3875 from the current code pointer. There may be an OP_OPT setting following
3876 the final KET, so we can't find the end just by going back from the code
3877 pointer. */
3878
3879 if (repeat_max == -1)
3880 {
3881 register uschar *ket = previous;
3882 do ket += GET(ket, 1); while (*ket != OP_KET);
3883 ketoffset = code - ket;
3884 }
3885
3886 /* The case of a zero minimum is special because of the need to stick
3887 OP_BRAZERO in front of it, and because the group appears once in the
3888 data, whereas in other cases it appears the minimum number of times. For
3889 this reason, it is simplest to treat this case separately, as otherwise
3890 the code gets far too messy. There are several special subcases when the
3891 minimum is zero. */
3892
3893 if (repeat_min == 0)
3894 {
3895 /* If the maximum is also zero, we used to just omit the group from the
3896 output altogether, like this:
3897
3898 ** if (repeat_max == 0)
3899 ** {
3900 ** code = previous;
3901 ** goto END_REPEAT;
3902 ** }
3903
3904 However, that fails when a group is referenced as a subroutine from
3905 elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
3906 so that it is skipped on execution. As we don't have a list of which
3907 groups are referenced, we cannot do this selectively.
3908
3909 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
3910 and do no more at this point. However, we do need to adjust any
3911 OP_RECURSE calls inside the group that refer to the group itself or any
3912 internal or forward referenced group, because the offset is from the
3913 start of the whole regex. Temporarily terminate the pattern while doing
3914 this. */
3915
3916 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
3917 {
3918 *code = OP_END;
3919 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3920 memmove(previous+1, previous, len);
3921 code++;
3922 if (repeat_max == 0)
3923 {
3924 *previous++ = OP_SKIPZERO;
3925 goto END_REPEAT;
3926 }
3927 *previous++ = OP_BRAZERO + repeat_type;
3928 }
3929
3930 /* If the maximum is greater than 1 and limited, we have to replicate
3931 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3932 The first one has to be handled carefully because it's the original
3933 copy, which has to be moved up. The remainder can be handled by code
3934 that is common with the non-zero minimum case below. We have to
3935 adjust the value of repeat_max, since one less copy is required. Once
3936 again, we may have to adjust any OP_RECURSE calls inside the group. */
3937
3938 else
3939 {
3940 int offset;
3941 *code = OP_END;
3942 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3943 memmove(previous + 2 + LINK_SIZE, previous, len);
3944 code += 2 + LINK_SIZE;
3945 *previous++ = OP_BRAZERO + repeat_type;
3946 *previous++ = OP_BRA;
3947
3948 /* We chain together the bracket offset fields that have to be
3949 filled in later when the ends of the brackets are reached. */
3950
3951 offset = (bralink == NULL)? 0 : previous - bralink;
3952 bralink = previous;
3953 PUTINC(previous, 0, offset);
3954 }
3955
3956 repeat_max--;
3957 }
3958
3959 /* If the minimum is greater than zero, replicate the group as many
3960 times as necessary, and adjust the maximum to the number of subsequent
3961 copies that we need. If we set a first char from the group, and didn't
3962 set a required char, copy the latter from the former. If there are any
3963 forward reference subroutine calls in the group, there will be entries on
3964 the workspace list; replicate these with an appropriate increment. */
3965
3966 else
3967 {
3968 if (repeat_min > 1)
3969 {
3970 /* In the pre-compile phase, we don't actually do the replication. We
3971 just adjust the length as if we had. Do some paranoid checks for
3972 potential integer overflow. */
3973
3974 if (lengthptr != NULL)
3975 {
3976 int delta = (repeat_min - 1)*length_prevgroup;
3977 if ((double)(repeat_min - 1)*(double)length_prevgroup >
3978 (double)INT_MAX ||
3979 OFLOW_MAX - *lengthptr < delta)
3980 {
3981 *errorcodeptr = ERR20;
3982 goto FAILED;
3983 }
3984 *lengthptr += delta;
3985 }
3986
3987 /* This is compiling for real */
3988
3989 else
3990 {
3991 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3992 for (i = 1; i < repeat_min; i++)
3993 {
3994 uschar *hc;
3995 uschar *this_hwm = cd->hwm;
3996 memcpy(code, previous, len);
3997 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3998 {
3999 PUT(cd->hwm, 0, GET(hc, 0) + len);
4000 cd->hwm += LINK_SIZE;
4001 }
4002 save_hwm = this_hwm;
4003 code += len;
4004 }
4005 }
4006 }
4007
4008 if (repeat_max > 0) repeat_max -= repeat_min;
4009 }
4010
4011 /* This code is common to both the zero and non-zero minimum cases. If
4012 the maximum is limited, it replicates the group in a nested fashion,
4013 remembering the bracket starts on a stack. In the case of a zero minimum,
4014 the first one was set up above. In all cases the repeat_max now specifies
4015 the number of additional copies needed. Again, we must remember to
4016 replicate entries on the forward reference list. */
4017
4018 if (repeat_max >= 0)
4019 {
4020 /* In the pre-compile phase, we don't actually do the replication. We
4021 just adjust the length as if we had. For each repetition we must add 1
4022 to the length for BRAZERO and for all but the last repetition we must
4023 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
4024 paranoid checks to avoid integer overflow. */
4025
4026 if (lengthptr != NULL && repeat_max > 0)
4027 {
4028 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4029 2 - 2*LINK_SIZE; /* Last one doesn't nest */
4030 if ((double)repeat_max *
4031 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4032 > (double)INT_MAX ||
4033 OFLOW_MAX - *lengthptr < delta)
4034 {
4035 *errorcodeptr = ERR20;
4036 goto FAILED;
4037 }
4038 *lengthptr += delta;
4039 }
4040
4041 /* This is compiling for real */
4042
4043 else for (i = repeat_max - 1; i >= 0; i--)
4044 {
4045 uschar *hc;
4046 uschar *this_hwm = cd->hwm;
4047
4048 *code++ = OP_BRAZERO + repeat_type;
4049
4050 /* All but the final copy start a new nesting, maintaining the
4051 chain of brackets outstanding. */
4052
4053 if (i != 0)
4054 {
4055 int offset;
4056 *code++ = OP_BRA;
4057 offset = (bralink == NULL)? 0 : code - bralink;
4058 bralink = code;
4059 PUTINC(code, 0, offset);
4060 }
4061
4062 memcpy(code, previous, len);
4063 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4064 {
4065 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4066 cd->hwm += LINK_SIZE;
4067 }
4068 save_hwm = this_hwm;
4069 code += len;
4070 }
4071
4072 /* Now chain through the pending brackets, and fill in their length
4073 fields (which are holding the chain links pro tem). */
4074
4075 while (bralink != NULL)
4076 {
4077 int oldlinkoffset;
4078 int offset = code - bralink + 1;
4079 uschar *bra = code - offset;
4080 oldlinkoffset = GET(bra, 1);
4081 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4082 *code++ = OP_KET;
4083 PUTINC(code, 0, offset);
4084 PUT(bra, 1, offset);
4085 }
4086 }
4087
4088 /* If the maximum is unlimited, set a repeater in the final copy. We
4089 can't just offset backwards from the current code point, because we
4090 don't know if there's been an options resetting after the ket. The
4091 correct offset was computed above.
4092
4093 Then, when we are doing the actual compile phase, check to see whether
4094 this group is a non-atomic one that could match an empty string. If so,
4095 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4096 that runtime checking can be done. [This check is also applied to
4097 atomic groups at runtime, but in a different way.] */
4098
4099 else
4100 {
4101 uschar *ketcode = code - ketoffset;
4102 uschar *bracode = ketcode - GET(ketcode, 1);
4103 *ketcode = OP_KETRMAX + repeat_type;
4104 if (lengthptr == NULL && *bracode != OP_ONCE)
4105 {
4106 uschar *scode = bracode;
4107 do
4108 {
4109 if (could_be_empty_branch(scode, ketcode, utf8))
4110 {
4111 *bracode += OP_SBRA - OP_BRA;
4112 break;
4113 }
4114 scode += GET(scode, 1);
4115 }
4116 while (*scode == OP_ALT);
4117 }
4118 }
4119 }
4120
4121 /* If previous is OP_FAIL, it was generated by an empty class [] in
4122 JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4123 by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4124 error above. We can just ignore the repeat in JS case. */
4125
4126 else if (*previous == OP_FAIL) goto END_REPEAT;
4127
4128 /* Else there's some kind of shambles */
4129
4130 else
4131 {
4132 *errorcodeptr = ERR11;
4133 goto FAILED;
4134 }
4135
4136 /* If the character following a repeat is '+', or if certain optimization
4137 tests above succeeded, possessive_quantifier is TRUE. For some of the
4138 simpler opcodes, there is a special alternative opcode for this. For
4139 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4140 The '+' notation is just syntactic sugar, taken from Sun's Java package,
4141 but the special opcodes can optimize it a bit. The repeated item starts at
4142 tempcode, not at previous, which might be the first part of a string whose
4143 (former) last char we repeated.
4144
4145 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4146 an 'upto' may follow. We skip over an 'exact' item, and then test the
4147 length of what remains before proceeding. */
4148
4149 if (possessive_quantifier)
4150 {
4151 int len;
4152 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4153 *tempcode == OP_NOTEXACT)
4154 tempcode += _pcre_OP_lengths[*tempcode] +
4155 ((*tempcode == OP_TYPEEXACT &&
4156 (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4157 len = code - tempcode;
4158 if (len > 0) switch (*tempcode)
4159 {
4160 case OP_STAR: *tempcode = OP_POSSTAR; break;
4161 case OP_PLUS: *tempcode = OP_POSPLUS; break;
4162 case OP_QUERY: *tempcode = OP_POSQUERY; break;
4163 case OP_UPTO: *tempcode = OP_POSUPTO; break;
4164
4165 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4166 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4167 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4168 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4169
4170 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4171 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4172 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4173 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4174
4175 default:
4176 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4177 code += 1 + LINK_SIZE;
4178 len += 1 + LINK_SIZE;
4179 tempcode[0] = OP_ONCE;
4180 *code++ = OP_KET;
4181 PUTINC(code, 0, len);
4182 PUT(tempcode, 1, len);
4183 break;
4184 }
4185 }
4186
4187 /* In all cases we no longer have a previous item. We also set the
4188 "follows varying string" flag for subsequently encountered reqbytes if
4189 it isn't already set and we have just passed a varying length item. */
4190
4191 END_REPEAT:
4192 previous = NULL;
4193 cd->req_varyopt |= reqvary;
4194 break;
4195
4196
4197 /* ===================================================================*/
4198 /* Start of nested parenthesized sub-expression, or comment or lookahead or
4199 lookbehind or option setting or condition or all the other extended
4200 parenthesis forms. */
4201
4202 case '(':
4203 newoptions = options;
4204 skipbytes = 0;
4205 bravalue = OP_CBRA;
4206 save_hwm = cd->hwm;
4207 reset_bracount = FALSE;
4208
4209 /* First deal with various "verbs" that can be introduced by '*'. */
4210
4211 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4212 {
4213 int i, namelen;
4214 const char *vn = verbnames;
4215 const uschar *name = ++ptr;
4216 previous = NULL;
4217 while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
4218 if (*ptr == ':')
4219 {
4220 *errorcodeptr = ERR59; /* Not supported */
4221 goto FAILED;
4222 }
4223 if (*ptr != ')')
4224 {
4225 *errorcodeptr = ERR60;
4226 goto FAILED;
4227 }
4228 namelen = ptr - name;
4229 for (i = 0; i < verbcount; i++)
4230 {
4231 if (namelen == verbs[i].len &&
4232 strncmp((char *)name, vn, namelen) == 0)
4233 {
4234 *code = verbs[i].op;
4235 if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4236 break;
4237 }
4238 vn += verbs[i].len + 1;
4239 }
4240 if (i < verbcount) continue;
4241 *errorcodeptr = ERR60;
4242 goto FAILED;
4243 }
4244
4245 /* Deal with the extended parentheses; all are introduced by '?', and the
4246 appearance of any of them means that this is not a capturing group. */
4247
4248 else if (*ptr == '?')
4249 {
4250 int i, set, unset, namelen;
4251 int *optset;
4252 const uschar *name;
4253 uschar *slot;
4254
4255 switch (*(++ptr))
4256 {
4257 case '#': /* Comment; skip to ket */
4258 ptr++;
4259 while (*ptr != 0 && *ptr != ')') ptr++;
4260 if (*ptr == 0)
4261 {
4262 *errorcodeptr = ERR18;
4263 goto FAILED;
4264 }
4265 continue;
4266
4267
4268 /* ------------------------------------------------------------ */
4269 case '|': /* Reset capture count for each branch */
4270 reset_bracount = TRUE;
4271 /* Fall through */
4272
4273 /* ------------------------------------------------------------ */
4274 case ':': /* Non-capturing bracket */
4275 bravalue = OP_BRA;
4276 ptr++;
4277 break;
4278
4279
4280 /* ------------------------------------------------------------ */
4281 case '(':
4282 bravalue = OP_COND; /* Conditional group */
4283
4284 /* A condition can be an assertion, a number (referring to a numbered
4285 group), a name (referring to a named group), or 'R', referring to
4286 recursion. R<digits> and R&name are also permitted for recursion tests.
4287
4288 There are several syntaxes for testing a named group: (?(name)) is used
4289 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4290
4291 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4292 be the recursive thing or the name 'R' (and similarly for 'R' followed
4293 by digits), and (b) a number could be a name that consists of digits.
4294 In both cases, we look for a name first; if not found, we try the other
4295 cases. */
4296
4297 /* For conditions that are assertions, check the syntax, and then exit
4298 the switch. This will take control down to where bracketed groups,
4299 including assertions, are processed. */
4300
4301 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4302 break;
4303
4304 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4305 below), and all need to skip 3 bytes at the start of the group. */
4306
4307 code[1+LINK_SIZE] = OP_CREF;
4308 skipbytes = 3;
4309 refsign = -1;
4310
4311 /* Check for a test for recursion in a named group. */
4312
4313 if (ptr[1] == 'R' && ptr[2] == '&')
4314 {
4315 terminator = -1;
4316 ptr += 2;
4317 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4318 }
4319
4320 /* Check for a test for a named group's having been set, using the Perl
4321 syntax (?(<name>) or (?('name') */
4322
4323 else if (ptr[1] == '<')
4324 {
4325 terminator = '>';
4326 ptr++;
4327 }
4328 else if (ptr[1] == '\'')
4329 {
4330 terminator = '\'';
4331 ptr++;
4332 }
4333 else
4334 {
4335 terminator = 0;
4336 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4337 }
4338
4339 /* We now expect to read a name; anything else is an error */
4340
4341 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4342 {
4343 ptr += 1; /* To get the right offset */
4344 *errorcodeptr = ERR28;
4345 goto FAILED;
4346 }
4347
4348 /* Read the name, but also get it as a number if it's all digits */
4349
4350 recno = 0;
4351 name = ++ptr;
4352 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4353 {
4354 if (recno >= 0)
4355 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4356 recno * 10 + *ptr - '0' : -1;
4357 ptr++;
4358 }
4359 namelen = ptr - name;
4360
4361 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4362 {
4363 ptr--; /* Error offset */
4364 *errorcodeptr = ERR26;
4365 goto FAILED;
4366 }
4367
4368 /* Do no further checking in the pre-compile phase. */
4369
4370 if (lengthptr != NULL) break;
4371
4372 /* In the real compile we do the work of looking for the actual
4373 reference. If the string started with "+" or "-" we require the rest to
4374 be digits, in which case recno will be set. */
4375
4376 if (refsign > 0)
4377 {
4378 if (recno <= 0)
4379 {
4380 *errorcodeptr = ERR58;
4381 goto FAILED;
4382 }
4383 recno = (refsign == '-')?
4384 cd->bracount - recno + 1 : recno +cd->bracount;
4385 if (recno <= 0 || recno > cd->final_bracount)
4386 {
4387 *errorcodeptr = ERR15;
4388 goto FAILED;
4389 }
4390 PUT2(code, 2+LINK_SIZE, recno);
4391 break;
4392 }
4393
4394 /* Otherwise (did not start with "+" or "-"), start by looking for the
4395 name. */
4396
4397 slot = cd->name_table;
4398 for (i = 0; i < cd->names_found; i++)
4399 {
4400 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4401 slot += cd->name_entry_size;
4402 }
4403
4404 /* Found a previous named subpattern */
4405
4406 if (i < cd->names_found)
4407 {
4408 recno = GET2(slot, 0);
4409 PUT2(code, 2+LINK_SIZE, recno);
4410 }
4411
4412 /* Search the pattern for a forward reference */
4413
4414 else if ((i = find_parens(ptr, cd, name, namelen,
4415 (options & PCRE_EXTENDED) != 0)) > 0)
4416 {
4417 PUT2(code, 2+LINK_SIZE, i);
4418 }
4419
4420 /* If terminator == 0 it means that the name followed directly after
4421 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4422 some further alternatives to try. For the cases where terminator != 0
4423 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4424 now checked all the possibilities, so give an error. */
4425
4426 else if (terminator != 0)
4427 {
4428 *errorcodeptr = ERR15;
4429 goto FAILED;
4430 }
4431
4432 /* Check for (?(R) for recursion. Allow digits after R to specify a
4433 specific group number. */
4434
4435 else if (*name == 'R')
4436 {
4437 recno = 0;
4438 for (i = 1; i < namelen; i++)
4439 {
4440 if ((digitab[name[i]] & ctype_digit) == 0)
4441 {
4442 *errorcodeptr = ERR15;
4443 goto FAILED;
4444 }
4445 recno = recno * 10 + name[i] - '0';
4446 }
4447 if (recno == 0) recno = RREF_ANY;
4448 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4449 PUT2(code, 2+LINK_SIZE, recno);
4450 }
4451
4452 /* Similarly, check for the (?(DEFINE) "condition", which is always
4453 false. */
4454
4455 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4456 {
4457 code[1+LINK_SIZE] = OP_DEF;
4458 skipbytes = 1;
4459 }
4460
4461 /* Check for the "name" actually being a subpattern number. We are
4462 in the second pass here, so final_bracount is set. */
4463
4464 else if (recno > 0 && recno <= cd->final_bracount)
4465 {
4466 PUT2(code, 2+LINK_SIZE, recno);
4467 }
4468
4469 /* Either an unidentified subpattern, or a reference to (?(0) */
4470
4471 else
4472 {
4473 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4474 goto FAILED;
4475 }
4476 break;
4477
4478
4479 /* ------------------------------------------------------------ */
4480 case '=': /* Positive lookahead */
4481 bravalue = OP_ASSERT;
4482 ptr++;
4483 break;
4484
4485
4486 /* ------------------------------------------------------------ */
4487 case '!': /* Negative lookahead */
4488 ptr++;
4489 if (*ptr == ')') /* Optimize (?!) */
4490 {
4491 *code++ = OP_FAIL;
4492 previous = NULL;
4493 continue;
4494 }
4495 bravalue = OP_ASSERT_NOT;
4496 break;
4497
4498
4499 /* ------------------------------------------------------------ */
4500 case '<': /* Lookbehind or named define */
4501 switch (ptr[1])
4502 {
4503 case '=': /* Positive lookbehind */
4504 bravalue = OP_ASSERTBACK;
4505 ptr += 2;
4506 break;
4507
4508 case '!': /* Negative lookbehind */
4509 bravalue = OP_ASSERTBACK_NOT;
4510 ptr += 2;
4511 break;
4512
4513 default: /* Could be name define, else bad */
4514 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4515 ptr++; /* Correct offset for error */
4516 *errorcodeptr = ERR24;
4517 goto FAILED;
4518 }
4519 break;
4520
4521
4522 /* ------------------------------------------------------------ */
4523 case '>': /* One-time brackets */
4524 bravalue = OP_ONCE;
4525 ptr++;
4526 break;
4527
4528
4529 /* ------------------------------------------------------------ */
4530 case 'C': /* Callout - may be followed by digits; */
4531 previous_callout = code; /* Save for later completion */
4532 after_manual_callout = 1; /* Skip one item before completing */
4533 *code++ = OP_CALLOUT;
4534 {
4535 int n = 0;
4536 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4537 n = n * 10 + *ptr - '0';
4538 if (*ptr != ')')
4539 {
4540 *errorcodeptr = ERR39;
4541 goto FAILED;
4542 }
4543 if (n > 255)
4544 {
4545 *errorcodeptr = ERR38;
4546 goto FAILED;
4547 }
4548 *code++ = n;
4549 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4550 PUT(code, LINK_SIZE, 0); /* Default length */
4551 code += 2 * LINK_SIZE;
4552 }
4553 previous = NULL;
4554 continue;
4555
4556
4557 /* ------------------------------------------------------------ */
4558 case 'P': /* Python-style named subpattern handling */
4559 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4560 {
4561 is_recurse = *ptr == '>';
4562 terminator = ')';
4563 goto NAMED_REF_OR_RECURSE;
4564 }
4565 else if (*ptr != '<') /* Test for Python-style definition */
4566 {
4567 *errorcodeptr = ERR41;
4568 goto FAILED;
4569 }
4570 /* Fall through to handle (?P< as (?< is handled */
4571
4572
4573 /* ------------------------------------------------------------ */
4574 DEFINE_NAME: /* Come here from (?< handling */
4575 case '\'':
4576 {
4577 terminator = (*ptr == '<')? '>' : '\'';
4578 name = ++ptr;
4579
4580 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4581 namelen = ptr - name;
4582
4583 /* In the pre-compile phase, just do a syntax check. */
4584
4585 if (lengthptr != NULL)
4586 {
4587 if (*ptr != terminator)
4588 {
4589 *errorcodeptr = ERR42;
4590 goto FAILED;
4591 }
4592 if (cd->names_found >= MAX_NAME_COUNT)
4593 {
4594 *errorcodeptr = ERR49;
4595 goto FAILED;
4596 }
4597 if (namelen + 3 > cd->name_entry_size)
4598 {
4599 cd->name_entry_size = namelen + 3;
4600 if (namelen > MAX_NAME_SIZE)
4601 {
4602 *errorcodeptr = ERR48;
4603 goto FAILED;
4604 }
4605 }
4606 }
4607
4608 /* In the real compile, create the entry in the table */
4609
4610 else
4611 {
4612 slot = cd->name_table;
4613 for (i = 0; i < cd->names_found; i++)
4614 {
4615 int crc = memcmp(name, slot+2, namelen);
4616 if (crc == 0)
4617 {
4618 if (slot[2+namelen] == 0)
4619 {
4620 if ((options & PCRE_DUPNAMES) == 0)
4621 {
4622 *errorcodeptr = ERR43;
4623 goto FAILED;
4624 }
4625 }
4626 else crc = -1; /* Current name is substring */
4627 }
4628 if (crc < 0)
4629 {
4630 memmove(slot + cd->name_entry_size, slot,
4631 (cd->names_found - i) * cd->name_entry_size);
4632 break;
4633 }
4634 slot += cd->name_entry_size;
4635 }
4636
4637 PUT2(slot, 0, cd->bracount + 1);
4638 memcpy(slot + 2, name, namelen);
4639 slot[2+namelen] = 0;
4640 }
4641 }
4642
4643 /* In both cases, count the number of names we've encountered. */
4644
4645 ptr++; /* Move past > or ' */
4646 cd->names_found++;
4647 goto NUMBERED_GROUP;
4648
4649
4650 /* ------------------------------------------------------------ */
4651 case '&': /* Perl recursion/subroutine syntax */
4652 terminator = ')';
4653 is_recurse = TRUE;
4654 /* Fall through */
4655
4656 /* We come here from the Python syntax above that handles both
4657 references (?P=name) and recursion (?P>name), as well as falling
4658 through from the Perl recursion syntax (?&name). We also come here from
4659 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4660 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
4661
4662 NAMED_REF_OR_RECURSE:
4663 name = ++ptr;
4664 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4665 namelen = ptr - name;
4666
4667 /* In the pre-compile phase, do a syntax check and set a dummy
4668 reference number. */
4669
4670 if (lengthptr != NULL)
4671 {
4672 if (namelen == 0)
4673 {
4674 *errorcodeptr = ERR62;
4675 goto FAILED;
4676 }
4677 if (*ptr != terminator)
4678 {
4679 *errorcodeptr = ERR42;
4680 goto FAILED;
4681 }
4682 if (namelen > MAX_NAME_SIZE)
4683 {
4684 *errorcodeptr = ERR48;
4685 goto FAILED;
4686 }
4687 recno = 0;
4688 }
4689
4690 /* In the real compile, seek the name in the table. We check the name
4691 first, and then check that we have reached the end of the name in the
4692 table. That way, if the name is longer than any in the table,
4693 the comparison will fail without reading beyond the table entry. */
4694
4695 else
4696 {
4697 slot = cd->name_table;
4698 for (i = 0; i < cd->names_found; i++)
4699 {
4700 if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
4701 slot[2+namelen] == 0)
4702 break;
4703 slot += cd->name_entry_size;
4704 }
4705
4706 if (i < cd->names_found) /* Back reference */
4707 {
4708 recno = GET2(slot, 0);
4709 }
4710 else if ((recno = /* Forward back reference */
4711 find_parens(ptr, cd, name, namelen,
4712 (options & PCRE_EXTENDED) != 0)) <= 0)
4713 {
4714 *errorcodeptr = ERR15;
4715 goto FAILED;
4716 }
4717 }
4718
4719 /* In both phases, we can now go to the code that handles numerical
4720 recursion or backreferences. */
4721
4722 if (is_recurse) goto HANDLE_RECURSION;
4723 else goto HANDLE_REFERENCE;
4724
4725
4726 /* ------------------------------------------------------------ */
4727 case 'R': /* Recursion */
4728 ptr++; /* Same as (?0) */
4729 /* Fall through */
4730
4731
4732 /* ------------------------------------------------------------ */
4733 case '-': case '+':
4734 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4735 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4736 {
4737 const uschar *called;
4738 terminator = ')';
4739
4740 /* Come here from the \g<...> and \g'...' code (Oniguruma
4741 compatibility). However, the syntax has been checked to ensure that
4742 the ... are a (signed) number, so that neither ERR63 nor ERR29 will
4743 be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
4744 ever be taken. */
4745
4746 HANDLE_NUMERICAL_RECURSION:
4747
4748 if ((refsign = *ptr) == '+')
4749 {
4750 ptr++;
4751 if ((digitab[*ptr] & ctype_digit) == 0)
4752 {
4753 *errorcodeptr = ERR63;
4754 goto FAILED;
4755 }
4756 }
4757 else if (refsign == '-')
4758 {
4759 if ((digitab[ptr[1]] & ctype_digit) == 0)
4760 goto OTHER_CHAR_AFTER_QUERY;
4761 ptr++;
4762 }
4763
4764 recno = 0;
4765 while((digitab[*ptr] & ctype_digit) != 0)
4766 recno = recno * 10 + *ptr++ - '0';
4767
4768 if (*ptr != terminator)
4769 {
4770 *errorcodeptr = ERR29;
4771 goto FAILED;
4772 }
4773
4774 if (refsign == '-')
4775 {
4776 if (recno == 0)
4777 {
4778 *errorcodeptr = ERR58;
4779 goto FAILED;
4780 }
4781 recno = cd->bracount - recno + 1;
4782 if (recno <= 0)
4783 {
4784 *errorcodeptr = ERR15;
4785 goto FAILED;
4786 }
4787 }
4788 else if (refsign == '+')
4789 {
4790 if (recno == 0)
4791 {
4792 *errorcodeptr = ERR58;
4793 goto FAILED;
4794 }
4795 recno += cd->bracount;
4796 }
4797
4798 /* Come here from code above that handles a named recursion */
4799
4800 HANDLE_RECURSION:
4801
4802 previous = code;
4803 called = cd->start_code;
4804
4805 /* When we are actually compiling, find the bracket that is being
4806 referenced. Temporarily end the regex in case it doesn't exist before
4807 this point. If we end up with a forward reference, first check that
4808 the bracket does occur later so we can give the error (and position)
4809 now. Then remember this forward reference in the workspace so it can
4810 be filled in at the end. */
4811
4812 if (lengthptr == NULL)
4813 {
4814 *code = OP_END;
4815 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4816
4817 /* Forward reference */
4818
4819 if (called == NULL)
4820 {
4821 if (find_parens(ptr, cd, NULL, recno,
4822 (options & PCRE_EXTENDED) != 0) < 0)
4823 {
4824 *errorcodeptr = ERR15;
4825 goto FAILED;
4826 }
4827 called = cd->start_code + recno;
4828 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4829 }
4830
4831 /* If not a forward reference, and the subpattern is still open,
4832 this is a recursive call. We check to see if this is a left
4833 recursion that could loop for ever, and diagnose that case. */
4834
4835 else if (GET(called, 1) == 0 &&
4836 could_be_empty(called, code, bcptr, utf8))
4837 {
4838 *errorcodeptr = ERR40;
4839 goto FAILED;
4840 }
4841 }
4842
4843 /* Insert the recursion/subroutine item, automatically wrapped inside
4844 "once" brackets. Set up a "previous group" length so that a
4845 subsequent quantifier will work. */
4846
4847 *code = OP_ONCE;
4848 PUT(code, 1, 2 + 2*LINK_SIZE);
4849 code += 1 + LINK_SIZE;
4850
4851 *code = OP_RECURSE;
4852 PUT(code, 1, called - cd->start_code);
4853 code += 1 + LINK_SIZE;
4854
4855 *code = OP_KET;
4856 PUT(code, 1, 2 + 2*LINK_SIZE);
4857 code += 1 + LINK_SIZE;
4858
4859 length_prevgroup = 3 + 3*LINK_SIZE;
4860 }
4861
4862 /* Can't determine a first byte now */
4863
4864 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4865 continue;
4866
4867
4868 /* ------------------------------------------------------------ */
4869 default: /* Other characters: check option setting */
4870 OTHER_CHAR_AFTER_QUERY:
4871 set = unset = 0;
4872 optset = &set;
4873
4874 while (*ptr != ')' && *ptr != ':')
4875 {
4876 switch (*ptr++)
4877 {
4878 case '-': optset = &unset; break;
4879
4880 case 'J': /* Record that it changed in the external options */
4881 *optset |= PCRE_DUPNAMES;
4882 cd->external_flags |= PCRE_JCHANGED;
4883 break;
4884
4885 case 'i': *optset |= PCRE_CASELESS; break;
4886 case 'm': *optset |= PCRE_MULTILINE; break;
4887 case 's': *optset |= PCRE_DOTALL; break;
4888 case 'x': *optset |= PCRE_EXTENDED; break;
4889 case 'U': *optset |= PCRE_UNGREEDY; break;
4890 case 'X': *optset |= PCRE_EXTRA; break;
4891
4892 default: *errorcodeptr = ERR12;
4893 ptr--; /* Correct the offset */
4894 goto FAILED;
4895 }
4896 }
4897
4898 /* Set up the changed option bits, but don't change anything yet. */
4899
4900 newoptions = (options | set) & (~unset);
4901
4902 /* If the options ended with ')' this is not the start of a nested
4903 group with option changes, so the options change at this level. If this
4904 item is right at the start of the pattern, the options can be
4905 abstracted and made external in the pre-compile phase, and ignored in
4906 the compile phase. This can be helpful when matching -- for instance in
4907 caseless checking of required bytes.
4908
4909 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4910 definitely *not* at the start of the pattern because something has been
4911 compiled. In the pre-compile phase, however, the code pointer can have
4912 that value after the start, because it gets reset as code is discarded
4913 during the pre-compile. However, this can happen only at top level - if
4914 we are within parentheses, the starting BRA will still be present. At
4915 any parenthesis level, the length value can be used to test if anything
4916 has been compiled at that level. Thus, a test for both these conditions
4917 is necessary to ensure we correctly detect the start of the pattern in
4918 both phases.
4919
4920 If we are not at the pattern start, compile code to change the ims
4921 options if this setting actually changes any of them. We also pass the
4922 new setting back so that it can be put at the start of any following
4923 branches, and when this group ends (if we are in a group), a resetting
4924 item can be compiled. */
4925
4926 if (*ptr == ')')
4927 {
4928 if (code == cd->start_code + 1 + LINK_SIZE &&
4929 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4930 {
4931 cd->external_options = newoptions;
4932 options = newoptions;
4933 }
4934 else
4935 {
4936 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4937 {
4938 *code++ = OP_OPT;
4939 *code++ = newoptions & PCRE_IMS;
4940 }
4941
4942 /* Change options at this level, and pass them back for use
4943 in subsequent branches. Reset the greedy defaults and the case
4944 value for firstbyte and reqbyte. */
4945
4946 *optionsptr = options = newoptions;
4947 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4948 greedy_non_default = greedy_default ^ 1;
4949 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4950 }
4951
4952 previous = NULL; /* This item can't be repeated */
4953 continue; /* It is complete */
4954 }
4955
4956 /* If the options ended with ':' we are heading into a nested group
4957 with possible change of options. Such groups are non-capturing and are
4958 not assertions of any kind. All we need to do is skip over the ':';
4959 the newoptions value is handled below. */
4960
4961 bravalue = OP_BRA;
4962 ptr++;
4963 } /* End of switch for character following (? */
4964 } /* End of (? handling */
4965
4966 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4967 all unadorned brackets become non-capturing and behave like (?:...)
4968 brackets. */
4969
4970 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4971 {
4972 bravalue = OP_BRA;
4973 }
4974
4975 /* Else we have a capturing group. */
4976
4977 else
4978 {
4979 NUMBERED_GROUP:
4980 cd->bracount += 1;
4981 PUT2(code, 1+LINK_SIZE, cd->bracount);
4982 skipbytes = 2;
4983 }
4984
4985 /* Process nested bracketed regex. Assertions may not be repeated, but
4986 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4987 non-register variable in order to be able to pass its address because some
4988 compilers complain otherwise. Pass in a new setting for the ims options if
4989 they have changed. */
4990
4991 previous = (bravalue >= OP_ONCE)? code : NULL;
4992 *code = bravalue;
4993 tempcode = code;
4994 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4995 length_prevgroup = 0; /* Initialize for pre-compile phase */
4996
4997 if (!compile_regex(
4998 newoptions, /* The complete new option state */
4999 options & PCRE_IMS, /* The previous ims option state */
5000 &tempcode, /* Where to put code (updated) */
5001 &ptr, /* Input pointer (updated) */
5002 errorcodeptr, /* Where to put an error message */
5003 (bravalue == OP_ASSERTBACK ||
5004 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
5005 reset_bracount, /* True if (?| group */
5006 skipbytes, /* Skip over bracket number */
5007 &subfirstbyte, /* For possible first char */
5008 &subreqbyte, /* For possible last char */
5009 bcptr, /* Current branch chain */
5010 cd, /* Tables block */
5011 (lengthptr == NULL)? NULL : /* Actual compile phase */
5012 &length_prevgroup /* Pre-compile phase */
5013 ))
5014 goto FAILED;
5015
5016 /* At the end of compiling, code is still pointing to the start of the
5017 group, while tempcode has been updated to point past the end of the group
5018 and any option resetting that may follow it. The pattern pointer (ptr)
5019 is on the bracket. */
5020
5021 /* If this is a conditional bracket, check that there are no more than
5022 two branches in the group, or just one if it's a DEFINE group. We do this
5023 in the real compile phase, not in the pre-pass, where the whole group may
5024 not be available. */
5025
5026 if (bravalue == OP_COND && lengthptr == NULL)
5027 {
5028 uschar *tc = code;
5029 int condcount = 0;
5030
5031 do {
5032 condcount++;
5033 tc += GET(tc,1);
5034 }
5035 while (*tc != OP_KET);
5036
5037 /* A DEFINE group is never obeyed inline (the "condition" is always
5038 false). It must have only one branch. */
5039
5040 if (code[LINK_SIZE+1] == OP_DEF)
5041 {
5042 if (condcount > 1)
5043 {
5044 *errorcodeptr = ERR54;
5045 goto FAILED;
5046 }
5047 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
5048 }
5049
5050 /* A "normal" conditional group. If there is just one branch, we must not
5051 make use of its firstbyte or reqbyte, because this is equivalent to an
5052 empty second branch. */
5053
5054 else
5055 {
5056 if (condcount > 2)
5057 {
5058 *errorcodeptr = ERR27;
5059 goto FAILED;
5060 }
5061 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
5062 }
5063 }
5064
5065 /* Error if hit end of pattern */
5066
5067 if (*ptr != ')')
5068 {
5069 *errorcodeptr = ERR14;
5070 goto FAILED;
5071 }
5072
5073 /* In the pre-compile phase, update the length by the length of the group,
5074 less the brackets at either end. Then reduce the compiled code to just a
5075 set of non-capturing brackets so that it doesn't use much memory if it is
5076 duplicated by a quantifier.*/
5077
5078 if (lengthptr != NULL)
5079 {
5080 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
5081 {
5082 *errorcodeptr = ERR20;
5083 goto FAILED;
5084 }
5085 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5086 *code++ = OP_BRA;
5087 PUTINC(code, 0, 1 + LINK_SIZE);
5088 *code++ = OP_KET;
5089 PUTINC(code, 0, 1 + LINK_SIZE);
5090 break; /* No need to waste time with special character handling */
5091 }
5092
5093 /* Otherwise update the main code pointer to the end of the group. */
5094
5095 code = tempcode;
5096
5097 /* For a DEFINE group, required and first character settings are not
5098 relevant. */
5099
5100 if (bravalue == OP_DEF) break;
5101
5102 /* Handle updating of the required and first characters for other types of
5103 group. Update for normal brackets of all kinds, and conditions with two
5104 branches (see code above). If the bracket is followed by a quantifier with
5105 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5106 zerofirstbyte outside the main loop so that they can be accessed for the
5107 back off. */
5108
5109 zeroreqbyte = reqbyte;
5110 zerofirstbyte = firstbyte;
5111 groupsetfirstbyte = FALSE;
5112
5113 if (bravalue >= OP_ONCE)
5114 {
5115 /* If we have not yet set a firstbyte in this branch, take it from the
5116 subpattern, remembering that it was set here so that a repeat of more
5117 than one can replicate it as reqbyte if necessary. If the subpattern has
5118 no firstbyte, set "none" for the whole branch. In both cases, a zero
5119 repeat forces firstbyte to "none". */
5120
5121 if (firstbyte == REQ_UNSET)
5122 {
5123 if (subfirstbyte >= 0)
5124 {
5125 firstbyte = subfirstbyte;
5126 groupsetfirstbyte = TRUE;
5127 }
5128 else firstbyte = REQ_NONE;
5129 zerofirstbyte = REQ_NONE;
5130 }
5131
5132 /* If firstbyte was previously set, convert the subpattern's firstbyte
5133 into reqbyte if there wasn't one, using the vary flag that was in
5134 existence beforehand. */
5135
5136 else if (subfirstbyte >= 0 && subreqbyte < 0)
5137 subreqbyte = subfirstbyte | tempreqvary;
5138
5139 /* If the subpattern set a required byte (or set a first byte that isn't
5140 really the first byte - see above), set it. */
5141
5142 if (subreqbyte >= 0) reqbyte = subreqbyte;
5143 }
5144
5145 /* For a forward assertion, we take the reqbyte, if set. This can be
5146 helpful if the pattern that follows the assertion doesn't set a different
5147 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5148 for an assertion, however because it leads to incorrect effect for patterns
5149 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5150 of a firstbyte. This is overcome by a scan at the end if there's no
5151 firstbyte, looking for an asserted first char. */
5152
5153 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5154 break; /* End of processing '(' */
5155
5156
5157 /* ===================================================================*/
5158 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5159 are arranged to be the negation of the corresponding OP_values. For the
5160 back references, the values are ESC_REF plus the reference number. Only
5161 back references and those types that consume a character may be repeated.
5162 We can test for values between ESC_b and ESC_Z for the latter; this may
5163 have to change if any new ones are ever created. */
5164
5165 case '\\':
5166 tempptr = ptr;
5167 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5168 if (*errorcodeptr != 0) goto FAILED;
5169
5170 if (c < 0)
5171 {
5172 if (-c == ESC_Q) /* Handle start of quoted string */
5173 {
5174 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
5175 else inescq = TRUE;
5176 continue;
5177 }
5178
5179 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
5180
5181 /* For metasequences that actually match a character, we disable the
5182 setting of a first character if it hasn't already been set. */
5183
5184 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5185 firstbyte = REQ_NONE;
5186
5187 /* Set values to reset to if this is followed by a zero repeat. */
5188
5189 zerofirstbyte = firstbyte;
5190 zeroreqbyte = reqbyte;
5191
5192 /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5193 is a subroutine call by number (Oniguruma syntax). In fact, the value
5194 -ESC_g is returned only for these cases. So we don't need to check for <
5195 or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5196 -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5197 that is a synonym for a named back reference). */
5198
5199 if (-c == ESC_g)
5200 {
5201 const uschar *p;
5202 save_hwm = cd->hwm; /* Normally this is set when '(' is read */
5203 terminator = (*(++ptr) == '<')? '>' : '\'';
5204
5205 /* These two statements stop the compiler from warning about possibly
5206 unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5207 fact, because we actually check for a number below, the paths that
5208 would actually be in error are never taken. */
5209
5210 skipbytes = 0;
5211 reset_bracount = FALSE;
5212
5213 /* Test for a name */
5214
5215 if (ptr[1] != '+' && ptr[1] != '-')
5216 {
5217 BOOL isnumber = TRUE;
5218 for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5219 {
5220 if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5221 if ((cd->ctypes[*p] & ctype_word) == 0) break;
5222 }
5223 if (*p != terminator)
5224 {
5225 *errorcodeptr = ERR57;
5226 break;
5227 }
5228 if (isnumber)
5229 {
5230 ptr++;
5231 goto HANDLE_NUMERICAL_RECURSION;
5232 }
5233 is_recurse = TRUE;
5234 goto NAMED_REF_OR_RECURSE;
5235 }
5236
5237 /* Test a signed number in angle brackets or quotes. */
5238
5239 p = ptr + 2;
5240 while ((digitab[*p] & ctype_digit) != 0) p++;
5241 if (*p != terminator)
5242 {
5243 *errorcodeptr = ERR57;
5244 break;
5245 }
5246 ptr++;
5247 goto HANDLE_NUMERICAL_RECURSION;
5248 }
5249
5250 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5251 We also support \k{name} (.NET syntax) */
5252
5253 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
5254 {
5255 is_recurse = FALSE;
5256 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
5257 goto NAMED_REF_OR_RECURSE;
5258 }
5259
5260 /* Back references are handled specially; must disable firstbyte if
5261 not set to cope with cases like (?=(\w+))\1: which would otherwise set
5262 ':' later. */
5263
5264 if (-c >= ESC_REF)
5265 {
5266 recno = -c - ESC_REF;
5267
5268 HANDLE_REFERENCE: /* Come here from named backref handling */
5269 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5270 previous = code;
5271 *code++ = OP_REF;
5272 PUT2INC(code, 0, recno);
5273 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5274 if (recno > cd->top_backref) cd->top_backref = recno;
5275 }
5276
5277 /* So are Unicode property matches, if supported. */
5278
5279 #ifdef SUPPORT_UCP
5280 else if (-c == ESC_P || -c == ESC_p)
5281 {
5282 BOOL negated;
5283 int pdata;
5284 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5285 if (ptype < 0) goto FAILED;
5286 previous = code;
5287 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5288 *code++ = ptype;
5289 *code++ = pdata;
5290 }
5291 #else
5292
5293 /* If Unicode properties are not supported, \X, \P, and \p are not
5294 allowed. */
5295
5296 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5297 {
5298 *errorcodeptr = ERR45;
5299 goto FAILED;
5300 }
5301 #endif
5302
5303 /* For the rest (including \X when Unicode properties are supported), we
5304 can obtain the OP value by negating the escape value. */
5305
5306 else
5307 {
5308 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5309 *code++ = -c;
5310 }
5311 continue;
5312 }
5313
5314 /* We have a data character whose value is in c. In UTF-8 mode it may have
5315 a value > 127. We set its representation in the length/buffer, and then
5316 handle it as a data character. */
5317
5318 #ifdef SUPPORT_UTF8
5319 if (utf8 && c > 127)
5320 mclength = _pcre_ord2utf8(c, mcbuffer);
5321 else
5322 #endif
5323
5324 {
5325 mcbuffer[0] = c;
5326 mclength = 1;
5327 }
5328 goto ONE_CHAR;
5329
5330
5331 /* ===================================================================*/
5332 /* Handle a literal character. It is guaranteed not to be whitespace or #
5333 when the extended flag is set. If we are in UTF-8 mode, it may be a
5334 multi-byte literal character. */
5335
5336 default:
5337 NORMAL_CHAR:
5338 mclength = 1;
5339 mcbuffer[0] = c;
5340
5341 #ifdef SUPPORT_UTF8
5342 if (utf8 && c >= 0xc0)
5343 {
5344 while ((ptr[1] & 0xc0) == 0x80)
5345 mcbuffer[mclength++] = *(++ptr);
5346 }
5347 #endif
5348
5349 /* At this point we have the character's bytes in mcbuffer, and the length
5350 in mclength. When not in UTF-8 mode, the length is always 1. */
5351
5352 ONE_CHAR:
5353 previous = code;
5354 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5355 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5356
5357 /* Remember if \r or \n were seen */
5358
5359 if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')
5360 cd->external_flags |= PCRE_HASCRORLF;
5361
5362 /* Set the first and required bytes appropriately. If no previous first
5363 byte, set it from this character, but revert to none on a zero repeat.
5364 Otherwise, leave the firstbyte value alone, and don't change it on a zero
5365 repeat. */
5366
5367 if (firstbyte == REQ_UNSET)
5368 {
5369 zerofirstbyte = REQ_NONE;
5370 zeroreqbyte = reqbyte;
5371
5372 /* If the character is more than one byte long, we can set firstbyte
5373 only if it is not to be matched caselessly. */
5374
5375 if (mclength == 1 || req_caseopt == 0)
5376 {
5377 firstbyte = mcbuffer[0] | req_caseopt;
5378 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5379 }
5380 else firstbyte = reqbyte = REQ_NONE;
5381 }
5382
5383 /* firstbyte was previously set; we can set reqbyte only if the length is
5384 1 or the matching is caseful. */
5385
5386 else
5387 {
5388 zerofirstbyte = firstbyte;
5389 zeroreqbyte = reqbyte;
5390 if (mclength == 1 || req_caseopt == 0)
5391 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5392 }
5393
5394 break; /* End of literal character handling */
5395 }
5396 } /* end of big loop */
5397
5398
5399 /* Control never reaches here by falling through, only by a goto for all the
5400 error states. Pass back the position in the pattern so that it can be displayed
5401 to the user for diagnosing the error. */
5402
5403 FAILED:
5404 *ptrptr = ptr;
5405 return FALSE;
5406 }
5407
5408
5409
5410
5411 /*************************************************
5412 * Compile sequence of alternatives *
5413 *************************************************/
5414
5415 /* On entry, ptr is pointing past the bracket character, but on return it
5416 points to the closing bracket, or vertical bar, or end of string. The code
5417 variable is pointing at the byte into which the BRA operator has been stored.
5418 If the ims options are changed at the start (for a (?ims: group) or during any
5419 branch, we need to insert an OP_OPT item at the start of every following branch
5420 to ensure they get set correctly at run time, and also pass the new options
5421 into every subsequent branch compile.
5422
5423 This function is used during the pre-compile phase when we are trying to find
5424 out the amount of memory needed, as well as during the real compile phase. The
5425 value of lengthptr distinguishes the two phases.
5426
5427 Arguments:
5428 options option bits, including any changes for this subpattern
5429 oldims previous settings of ims option bits
5430 codeptr -> the address of the current code pointer
5431 ptrptr -> the address of the current pattern pointer
5432 errorcodeptr -> pointer to error code variable
5433 lookbehind TRUE if this is a lookbehind assertion
5434 reset_bracount TRUE to reset the count for each branch
5435 skipbytes skip this many bytes at start (for brackets and OP_COND)
5436 firstbyteptr place to put the first required character, or a negative number
5437 reqbyteptr place to put the last required character, or a negative number
5438 bcptr pointer to the chain of currently open branches
5439 cd points to the data block with tables pointers etc.
5440 lengthptr NULL during the real compile phase
5441 points to length accumulator during pre-compile phase
5442
5443 Returns: TRUE on success
5444 */
5445
5446 static BOOL
5447 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5448 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5449 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5450 int *lengthptr)
5451 {
5452 const uschar *ptr = *ptrptr;
5453 uschar *code = *codeptr;
5454 uschar *last_branch = code;
5455 uschar *start_bracket = code;
5456 uschar *reverse_count = NULL;
5457 int firstbyte, reqbyte;
5458 int branchfirstbyte, branchreqbyte;
5459 int length;
5460 int orig_bracount;
5461 int max_bracount;
5462 branch_chain bc;
5463
5464 bc.outer = bcptr;
5465 bc.current = code;
5466
5467 firstbyte = reqbyte = REQ_UNSET;
5468
5469 /* Accumulate the length for use in the pre-compile phase. Start with the
5470 length of the BRA and KET and any extra bytes that are required at the
5471 beginning. We accumulate in a local variable to save frequent testing of
5472 lenthptr for NULL. We cannot do this by looking at the value of code at the
5473 start and end of each alternative, because compiled items are discarded during
5474 the pre-compile phase so that the work space is not exceeded. */
5475
5476 length = 2 + 2*LINK_SIZE + skipbytes;
5477
5478 /* WARNING: If the above line is changed for any reason, you must also change
5479 the code that abstracts option settings at the start of the pattern and makes
5480 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5481 pre-compile phase to find out whether anything has yet been compiled or not. */
5482
5483 /* Offset is set zero to mark that this bracket is still open */
5484
5485 PUT(code, 1, 0);
5486 code += 1 + LINK_SIZE + skipbytes;
5487
5488 /* Loop for each alternative branch */
5489
5490 orig_bracount = max_bracount = cd->bracount;
5491 for (;;)
5492 {
5493 /* For a (?| group, reset the capturing bracket count so that each branch
5494 uses the same numbers. */
5495
5496 if (reset_bracount) cd->bracount = orig_bracount;
5497
5498 /* Handle a change of ims options at the start of the branch */
5499
5500 if ((options & PCRE_IMS) != oldims)
5501 {
5502 *code++ = OP_OPT;
5503 *code++ = options & PCRE_IMS;
5504 length += 2;
5505 }
5506
5507 /* Set up dummy OP_REVERSE if lookbehind assertion */
5508
5509 if (lookbehind)
5510 {
5511 *code++ = OP_REVERSE;
5512 reverse_count = code;
5513 PUTINC(code, 0, 0);
5514 length += 1 + LINK_SIZE;
5515 }
5516
5517 /* Now compile the branch; in the pre-compile phase its length gets added
5518 into the length. */
5519
5520 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5521 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5522 {
5523 *ptrptr = ptr;
5524 return FALSE;
5525 }
5526
5527 /* Keep the highest bracket count in case (?| was used and some branch
5528 has fewer than the rest. */
5529
5530 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5531
5532 /* In the real compile phase, there is some post-processing to be done. */
5533
5534 if (lengthptr == NULL)
5535 {
5536 /* If this is the first branch, the firstbyte and reqbyte values for the
5537 branch become the values for the regex. */
5538
5539 if (*last_branch != OP_ALT)
5540 {
5541 firstbyte = branchfirstbyte;
5542 reqbyte = branchreqbyte;
5543 }
5544
5545 /* If this is not the first branch, the first char and reqbyte have to
5546 match the values from all the previous branches, except that if the
5547 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5548 and we set REQ_VARY for the regex. */
5549
5550 else
5551 {
5552 /* If we previously had a firstbyte, but it doesn't match the new branch,
5553 we have to abandon the firstbyte for the regex, but if there was
5554 previously no reqbyte, it takes on the value of the old firstbyte. */
5555
5556 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5557 {
5558 if (reqbyte < 0) reqbyte = firstbyte;
5559 firstbyte = REQ_NONE;
5560 }
5561
5562 /* If we (now or from before) have no firstbyte, a firstbyte from the
5563 branch becomes a reqbyte if there isn't a branch reqbyte. */
5564
5565 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5566 branchreqbyte = branchfirstbyte;
5567
5568 /* Now ensure that the reqbytes match */
5569
5570 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5571 reqbyte = REQ_NONE;
5572 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5573 }
5574
5575 /* If lookbehind, check that this branch matches a fixed-length string, and
5576 put the length into the OP_REVERSE item. Temporarily mark the end of the
5577 branch with OP_END. */
5578
5579 if (lookbehind)
5580 {
5581 int fixed_length;
5582 *code = OP_END;
5583 fixed_length = find_fixedlength(last_branch, options);
5584 DPRINTF(("fixed length = %d\n", fixed_length));
5585 if (fixed_length < 0)
5586 {
5587 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5588 *ptrptr = ptr;
5589 return FALSE;
5590 }
5591 PUT(reverse_count, 0, fixed_length);
5592 }
5593 }
5594
5595 /* Reached end of expression, either ')' or end of pattern. In the real
5596 compile phase, go back through the alternative branches and reverse the chain
5597 of offsets, with the field in the BRA item now becoming an offset to the
5598 first alternative. If there are no alternatives, it points to the end of the
5599 group. The length in the terminating ket is always the length of the whole
5600 bracketed item. If any of the ims options were changed inside the group,
5601 compile a resetting op-code following, except at the very end of the pattern.
5602 Return leaving the pointer at the terminating char. */
5603
5604 if (*ptr != '|')
5605 {
5606 if (lengthptr == NULL)
5607 {
5608 int branch_length = code - last_branch;
5609 do
5610 {
5611 int prev_length = GET(last_branch, 1);
5612 PUT(last_branch, 1, branch_length);
5613 branch_length = prev_length;
5614 last_branch -= branch_length;
5615 }
5616 while (branch_length > 0);
5617 }
5618
5619 /* Fill in the ket */
5620
5621 *code = OP_KET;
5622 PUT(code, 1, code - start_bracket);
5623 code += 1 + LINK_SIZE;
5624
5625 /* Resetting option if needed */
5626
5627 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5628 {
5629 *code++ = OP_OPT;
5630 *code++ = oldims;
5631 length += 2;
5632 }
5633
5634 /* Retain the highest bracket number, in case resetting was used. */
5635
5636 cd->bracount = max_bracount;
5637
5638 /* Set values to pass back */
5639
5640 *codeptr = code;
5641 *ptrptr = ptr;
5642 *firstbyteptr = firstbyte;
5643 *reqbyteptr = reqbyte;
5644 if (lengthptr != NULL)
5645 {
5646 if (OFLOW_MAX - *lengthptr < length)
5647 {
5648 *errorcodeptr = ERR20;
5649 return FALSE;
5650 }
5651 *lengthptr += length;
5652 }
5653 return TRUE;
5654 }
5655
5656 /* Another branch follows. In the pre-compile phase, we can move the code
5657 pointer back to where it was for the start of the first branch. (That is,
5658 pretend that each branch is the only one.)
5659
5660 In the real compile phase, insert an ALT node. Its length field points back
5661 to the previous branch while the bracket remains open. At the end the chain
5662 is reversed. It's done like this so that the start of the bracket has a
5663 zero offset until it is closed, making it possible to detect recursion. */
5664
5665 if (lengthptr != NULL)
5666 {
5667 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5668 length += 1 + LINK_SIZE;
5669 }
5670 else
5671 {
5672 *code = OP_ALT;
5673 PUT(code, 1, code - last_branch);
5674 bc.current = last_branch = code;
5675 code += 1 + LINK_SIZE;
5676 }
5677
5678 ptr++;
5679 }
5680 /* Control never reaches here */
5681 }
5682
5683
5684
5685
5686 /*************************************************
5687 * Check for anchored expression *
5688 *************************************************/
5689
5690 /* Try to find out if this is an anchored regular expression. Consider each
5691 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5692 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5693 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5694 counts, since OP_CIRC can match in the middle.
5695
5696 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5697 This is the code for \G, which means "match at start of match position, taking
5698 into account the match offset".
5699
5700 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5701 because that will try the rest of the pattern at all possible matching points,
5702 so there is no point trying again.... er ....
5703
5704 .... except when the .* appears inside capturing parentheses, and there is a
5705 subsequent back reference to those parentheses. We haven't enough information
5706 to catch that case precisely.
5707
5708 At first, the best we could do was to detect when .* was in capturing brackets
5709 and the highest back reference was greater than or equal to that level.
5710 However, by keeping a bitmap of the first 31 back references, we can catch some
5711 of the more common cases more precisely.
5712
5713 Arguments:
5714 code points to start of expression (the bracket)
5715 options points to the options setting
5716 bracket_map a bitmap of which brackets we are inside while testing; this
5717 handles up to substring 31; after that we just have to take
5718 the less precise approach
5719 backref_map the back reference bitmap
5720
5721 Returns: TRUE or FALSE
5722 */
5723
5724 static BOOL
5725 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5726 unsigned int backref_map)
5727 {
5728 do {
5729 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5730 options, PCRE_MULTILINE, FALSE);
5731 register int op = *scode;
5732
5733 /* Non-capturing brackets */
5734
5735 if (op == OP_BRA)
5736 {
5737 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5738 }
5739
5740 /* Capturing brackets */
5741
5742 else if (op == OP_CBRA)
5743 {
5744 int n = GET2(scode, 1+LINK_SIZE);
5745 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5746 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5747 }
5748
5749 /* Other brackets */
5750
5751 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5752 {
5753 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5754 }
5755
5756 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5757 are or may be referenced. */
5758
5759 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5760 op == OP_TYPEPOSSTAR) &&
5761 (*options & PCRE_DOTALL) != 0)
5762 {
5763 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5764 }
5765
5766 /* Check for explicit anchoring */
5767
5768 else if (op != OP_SOD && op != OP_SOM &&
5769 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5770 return FALSE;
5771 code += GET(code, 1);
5772 }
5773 while (*code == OP_ALT); /* Loop for each alternative */
5774 return TRUE;
5775 }
5776
5777
5778
5779 /*************************************************
5780 * Check for starting with ^ or .* *
5781 *************************************************/
5782
5783 /* This is called to find out if every branch starts with ^ or .* so that
5784 "first char" processing can be done to speed things up in multiline
5785 matching and for non-DOTALL patterns that start with .* (which must start at
5786 the beginning or after \n). As in the case of is_anchored() (see above), we
5787 have to take account of back references to capturing brackets that contain .*
5788 because in that case we can't make the assumption.
5789
5790 Arguments:
5791 code points to start of expression (the bracket)
5792 bracket_map a bitmap of which brackets we are inside while testing; this
5793 handles up to substring 31; after that we just have to take
5794 the less precise approach
5795 backref_map the back reference bitmap
5796
5797 Returns: TRUE or FALSE
5798 */
5799
5800 static BOOL
5801 is_startline(const uschar *code, unsigned int bracket_map,
5802 unsigned int backref_map)
5803 {
5804 do {
5805 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5806 NULL, 0, FALSE);
5807 register int op = *scode;
5808
5809 /* Non-capturing brackets */
5810
5811 if (op == OP_BRA)
5812 {
5813 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5814 }
5815
5816 /* Capturing brackets */
5817
5818 else if (op == OP_CBRA)
5819 {
5820 int n = GET2(scode, 1+LINK_SIZE);
5821 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5822 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5823 }
5824
5825 /* Other brackets */
5826
5827 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5828 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5829
5830 /* .* means "start at start or after \n" if it isn't in brackets that
5831 may be referenced. */
5832
5833 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5834 {
5835 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5836 }
5837
5838 /* Check for explicit circumflex */
5839
5840 else if (op != OP_CIRC) return FALSE;
5841
5842 /* Move on to the next alternative */
5843
5844 code += GET(code, 1);
5845 }
5846 while (*code == OP_ALT); /* Loop for each alternative */
5847 return TRUE;
5848 }
5849
5850
5851
5852 /*************************************************
5853 * Check for asserted fixed first char *
5854 *************************************************/
5855
5856 /* During compilation, the "first char" settings from forward assertions are
5857 discarded, because they can cause conflicts with actual literals that follow.
5858 However, if we end up without a first char setting for an unanchored pattern,
5859 it is worth scanning the regex to see if there is an initial asserted first
5860 char. If all branches start with the same asserted char, or with a bracket all
5861 of whose alternatives start with the same asserted char (recurse ad lib), then
5862 we return that char, otherwise -1.
5863
5864 Arguments:
5865 code points to start of expression (the bracket)
5866 options pointer to the options (used to check casing changes)
5867 inassert TRUE if in an assertion
5868
5869 Returns: -1 or the fixed first char
5870 */
5871
5872 static int
5873 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5874 {
5875 register int c = -1;
5876 do {
5877 int d;
5878 const uschar *scode =
5879 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5880 register int op = *scode;
5881
5882 switch(op)
5883 {
5884 default:
5885 return -1;
5886
5887 case OP_BRA:
5888 case OP_CBRA:
5889 case OP_ASSERT:
5890 case OP_ONCE:
5891 case OP_COND:
5892 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5893 return -1;
5894 if (c < 0) c = d; else if (c != d) return -1;
5895 break;
5896
5897 case OP_EXACT: /* Fall through */
5898 scode += 2;
5899
5900 case OP_CHAR:
5901 case OP_CHARNC:
5902 case OP_PLUS:
5903 case OP_MINPLUS:
5904 case OP_POSPLUS:
5905 if (!inassert) return -1;
5906 if (c < 0)
5907 {
5908 c = scode[1];
5909 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5910 }
5911 else if (c != scode[1]) return -1;
5912 break;
5913 }
5914
5915 code += GET(code, 1);
5916 }
5917 while (*code == OP_ALT);
5918 return c;
5919 }
5920
5921
5922
5923 /*************************************************
5924 * Compile a Regular Expression *
5925 *************************************************/
5926
5927 /* This function takes a string and returns a pointer to a block of store
5928 holding a compiled version of the expression. The original API for this
5929 function had no error code return variable; it is retained for backwards
5930 compatibility. The new function is given a new name.
5931
5932 Arguments:
5933 pattern the regular expression
5934 options various option bits
5935 errorcodeptr pointer to error code variable (pcre_compile2() only)
5936 can be NULL if you don't want a code value
5937 errorptr pointer to pointer to error text
5938 erroroffset ptr offset in pattern where error was detected
5939 tables pointer to character tables or NULL
5940
5941 Returns: pointer to compiled data block, or NULL on error,
5942 with errorptr and erroroffset set
5943 */
5944
5945 PCRE_EXP_DEFN pcre *
5946 pcre_compile(const char *pattern, int options, const char **errorptr,
5947 int *erroroffset, const unsigned char *tables)
5948 {
5949 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5950 }
5951
5952
5953 PCRE_EXP_DEFN pcre *
5954 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5955 const char **errorptr, int *erroroffset, const unsigned char *tables)
5956 {
5957 real_pcre *re;
5958 int length = 1; /* For final END opcode */
5959 int firstbyte, reqbyte, newline;
5960 int errorcode = 0;
5961 int skipatstart = 0;
5962 #ifdef SUPPORT_UTF8
5963 BOOL utf8;
5964 #endif
5965 size_t size;
5966 uschar *code;
5967 const uschar *codestart;
5968 const uschar *ptr;
5969 compile_data compile_block;
5970 compile_data *cd = &compile_block;
5971
5972 /* This space is used for "compiling" into during the first phase, when we are
5973 computing the amount of memory that is needed. Compiled items are thrown away
5974 as soon as possible, so that a fairly large buffer should be sufficient for
5975 this purpose. The same space is used in the second phase for remembering where
5976 to fill in forward references to subpatterns. */
5977
5978 uschar cworkspace[COMPILE_WORK_SIZE];
5979
5980 /* Set this early so that early errors get offset 0. */
5981
5982 ptr = (const uschar *)pattern;
5983
5984 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5985 can do is just return NULL, but we can set a code value if there is a code
5986 pointer. */
5987
5988 if (errorptr == NULL)
5989 {
5990 if (errorcodeptr != NULL) *errorcodeptr = 99;
5991 return NULL;
5992 }
5993
5994 *errorptr = NULL;
5995 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5996
5997 /* However, we can give a message for this error */
5998
5999 if (erroroffset == NULL)
6000 {
6001 errorcode = ERR16;
6002 goto PCRE_EARLY_ERROR_RETURN2;
6003 }
6004
6005 *erroroffset = 0;
6006
6007 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
6008
6009 #ifdef SUPPORT_UTF8
6010 utf8 = (options & PCRE_UTF8) != 0;
6011 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
6012 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
6013 {
6014 errorcode = ERR44;
6015 goto PCRE_EARLY_ERROR_RETURN2;
6016 }
6017 #else
6018 if ((options & PCRE_UTF8) != 0)
6019 {
6020 errorcode = ERR32;
6021 goto PCRE_EARLY_ERROR_RETURN;
6022 }
6023 #endif
6024
6025 if ((options & ~PUBLIC_OPTIONS) != 0)
6026 {
6027 errorcode = ERR17;
6028 goto PCRE_EARLY_ERROR_RETURN;
6029 }
6030
6031 /* Set up pointers to the individual character tables */
6032
6033 if (tables == NULL) tables = _pcre_default_tables;
6034 cd->lcc = tables + lcc_offset;
6035 cd->fcc = tables + fcc_offset;
6036 cd->cbits = tables + cbits_offset;
6037 cd->ctypes = tables + ctypes_offset;
6038
6039 /* Check for global one-time settings at the start of the pattern, and remember
6040 the offset for later. */
6041
6042 while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')
6043 {
6044 int newnl = 0;
6045 int newbsr = 0;
6046
6047 if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0)
6048 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
6049 else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3) == 0)
6050 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
6051 else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5) == 0)
6052 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
6053 else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0)
6054 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
6055 else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8) == 0)
6056 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
6057
6058 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0)
6059 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
6060 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0)
6061 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
6062
6063 if (newnl != 0)
6064 options = (options & ~PCRE_NEWLINE_BITS) | newnl;
6065 else if (newbsr != 0)
6066 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
6067 else break;
6068 }
6069
6070 /* Check validity of \R options. */
6071
6072 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6073 {
6074 case 0:
6075 case PCRE_BSR_ANYCRLF:
6076 case PCRE_BSR_UNICODE:
6077 break;
6078 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6079 }
6080
6081 /* Handle different types of newline. The three bits give seven cases. The
6082 current code allows for fixed one- or two-byte sequences, plus "any" and
6083 "anycrlf". */
6084
6085 switch (options & PCRE_NEWLINE_BITS)
6086 {
6087 case 0: newline = NEWLINE; break; /* Build-time default */
6088 case PCRE_NEWLINE_CR: newline = '\r'; break;
6089 case PCRE_NEWLINE_LF: newline = '\n'; break;
6090 case PCRE_NEWLINE_CR+
6091 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
6092 case PCRE_NEWLINE_ANY: newline = -1; break;
6093 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6094 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6095 }
6096
6097 if (newline == -2)
6098 {
6099 cd->nltype = NLTYPE_ANYCRLF;
6100 }
6101 else if (newline < 0)
6102 {
6103 cd->nltype = NLTYPE_ANY;
6104 }
6105 else
6106 {
6107 cd->nltype = NLTYPE_FIXED;
6108 if (newline > 255)
6109 {
6110 cd->nllen = 2;
6111 cd->nl[0] = (newline >> 8) & 255;
6112 cd->nl[1] = newline & 255;
6113 }
6114 else
6115 {
6116 cd->nllen = 1;
6117 cd->nl[0] = newline;
6118 }
6119 }
6120
6121 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
6122 references to help in deciding whether (.*) can be treated as anchored or not.
6123 */
6124
6125 cd->top_backref = 0;
6126 cd->backref_map = 0;
6127
6128 /* Reflect pattern for debugging output */
6129
6130 DPRINTF(("------------------------------------------------------------------\n"));
6131 DPRINTF(("%s\n", pattern));
6132
6133 /* Pretend to compile the pattern while actually just accumulating the length
6134 of memory required. This behaviour is triggered by passing a non-NULL final
6135 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
6136 to compile parts of the pattern into; the compiled code is discarded when it is
6137 no longer needed, so hopefully this workspace will never overflow, though there
6138 is a test for its doing so. */
6139
6140 cd->bracount = cd->final_bracount = 0;
6141 cd->names_found = 0;
6142 cd->name_entry_size = 0;
6143 cd->name_table = NULL;
6144 cd->start_workspace = cworkspace;
6145 cd->start_code = cworkspace;
6146 cd->hwm = cworkspace;
6147 cd->start_pattern = (const uschar *)pattern;
6148 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
6149 cd->req_varyopt = 0;
6150 cd->external_options = options;
6151 cd->external_flags = 0;
6152
6153 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
6154 don't need to look at the result of the function here. The initial options have
6155 been put into the cd block so that they can be changed if an option setting is
6156 found within the regex right at the beginning. Bringing initial option settings
6157 outside can help speed up starting point checks. */
6158
6159 ptr += skipatstart;
6160 code = cworkspace;
6161 *code = OP_BRA;
6162 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
6163 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
6164 &length);
6165 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
6166
6167 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
6168 cd->hwm - cworkspace));
6169
6170 if (length > MAX_PATTERN_SIZE)
6171 {
6172 errorcode = ERR20;
6173 goto PCRE_EARLY_ERROR_RETURN;
6174 }
6175
6176 /* Compute the size of data block needed and get it, either from malloc or
6177 externally provided function. Integer overflow should no longer be possible
6178 because nowadays we limit the maximum value of cd->names_found and
6179 cd->name_entry_size. */
6180
6181 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
6182 re = (real_pcre *)(pcre_malloc)(size);
6183
6184 if (re == NULL)
6185 {
6186 errorcode = ERR21;
6187 goto PCRE_EARLY_ERROR_RETURN;
6188 }
6189
6190 /* Put in the magic number, and save the sizes, initial options, internal
6191 flags, and character table pointer. NULL is used for the default character
6192 tables. The nullpad field is at the end; it's there to help in the case when a
6193 regex compiled on a system with 4-byte pointers is run on another with 8-byte
6194 pointers. */
6195
6196 re->magic_number = MAGIC_NUMBER;
6197 re->size = size;
6198 re->options = cd->external_options;
6199 re->flags = cd->external_flags;
6200 re->dummy1 = 0;
6201 re->first_byte = 0;
6202 re->req_byte = 0;
6203 re->name_table_offset = sizeof(real_pcre);
6204 re->name_entry_size = cd->name_entry_size;
6205 re->name_count = cd->names_found;
6206 re->ref_count = 0;
6207 re->tables = (tables == _pcre_default_tables)? NULL : tables;
6208 re->nullpad = NULL;
6209
6210 /* The starting points of the name/number translation table and of the code are
6211 passed around in the compile data block. The start/end pattern and initial
6212 options are already set from the pre-compile phase, as is the name_entry_size
6213 field. Reset the bracket count and the names_found field. Also reset the hwm
6214 field; this time it's used for remembering forward references to subpatterns.
6215 */
6216
6217 cd->final_bracount = cd->bracount; /* Save for checking forward references */
6218 cd->bracount = 0;
6219 cd->names_found = 0;
6220 cd->name_table = (uschar *)re + re->name_table_offset;
6221 codestart = cd->name_table + re->name_entry_size * re->name_count;
6222 cd->start_code = codestart;
6223 cd->hwm = cworkspace;
6224 cd->req_varyopt = 0;
6225 cd->had_accept = FALSE;
6226
6227 /* Set up a starting, non-extracting bracket, then compile the expression. On
6228 error, errorcode will be set non-zero, so we don't need to look at the result
6229 of the function here. */
6230
6231 ptr = (const uschar *)pattern + skipatstart;
6232 code = (uschar *)codestart;
6233 *code = OP_BRA;
6234 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6235 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6236 re->top_bracket = cd->bracount;
6237 re->top_backref = cd->top_backref;
6238 re->flags = cd->external_flags;
6239
6240 if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
6241
6242 /* If not reached end of pattern on success, there's an excess bracket. */
6243
6244 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6245
6246 /* Fill in the terminating state and check for disastrous overflow, but
6247 if debugging, leave the test till after things are printed out. */
6248
6249 *code++ = OP_END;
6250
6251 #ifndef DEBUG
6252 if (code - codestart > length) errorcode = ERR23;
6253 #endif
6254
6255 /* Fill in any forward references that are required. */
6256
6257 while (errorcode == 0 && cd->hwm > cworkspace)
6258 {
6259 int offset, recno;
6260 const uschar *groupptr;
6261 cd->hwm -= LINK_SIZE;
6262 offset = GET(cd->hwm, 0);
6263 recno = GET(codestart, offset);
6264 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
6265 if (groupptr == NULL) errorcode = ERR53;
6266 else PUT(((uschar *)codestart), offset, groupptr - codestart);
6267 }
6268
6269 /* Give an error if there's a back reference to a non-existent capturing
6270 subpattern. */
6271
6272 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6273
6274 /* Failed to compile, or error while post-processing */
6275
6276 if (errorcode != 0)
6277 {
6278 (pcre_free)(re);
6279 PCRE_EARLY_ERROR_RETURN:
6280 *erroroffset = ptr - (const uschar *)pattern;
6281 PCRE_EARLY_ERROR_RETURN2:
6282 *errorptr = find_error_text(errorcode);
6283 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6284 return NULL;
6285 }
6286
6287 /* If the anchored option was not passed, set the flag if we can determine that
6288 the pattern is anchored by virtue of ^ characters or \A or anything else (such
6289 as starting with .* when DOTALL is set).
6290
6291 Otherwise, if we know what the first byte has to be, save it, because that
6292 speeds up unanchored matches no end. If not, see if we can set the
6293 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6294 start with ^, and also when all branches start with .* for non-DOTALL matches.
6295 */
6296
6297 if ((re->options & PCRE_ANCHORED) == 0)
6298 {
6299 int temp_options = re->options; /* May get changed during these scans */
6300 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
6301 re->options |= PCRE_ANCHORED;
6302 else
6303 {
6304 if (firstbyte < 0)
6305 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
6306 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
6307 {
6308 int ch = firstbyte & 255;
6309 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6310 cd->fcc[ch] == ch)? ch : firstbyte;
6311 re->flags |= PCRE_FIRSTSET;
6312 }
6313 else if (is_startline(codestart, 0, cd->backref_map))
6314 re->flags |= PCRE_STARTLINE;
6315 }
6316 }
6317
6318 /* For an anchored pattern, we use the "required byte" only if it follows a
6319 variable length item in the regex. Remove the caseless flag for non-caseable
6320 bytes. */
6321
6322 if (reqbyte >= 0 &&
6323 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6324 {
6325 int ch = reqbyte & 255;
6326 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6327 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6328 re->flags |= PCRE_REQCHSET;
6329 }
6330
6331 /* Print out the compiled data if debugging is enabled. This is never the
6332 case when building a production library. */
6333
6334 #ifdef DEBUG
6335
6336 printf("Length = %d top_bracket = %d top_backref = %d\n",
6337 length, re->top_bracket, re->top_backref);
6338
6339 printf("Options=%08x\n", re->options);
6340
6341 if ((re->flags & PCRE_FIRSTSET) != 0)
6342 {
6343 int ch = re->first_byte & 255;
6344 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6345 "" : " (caseless)";
6346 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
6347 else printf("First char = \\x%02x%s\n", ch, caseless);
6348 }
6349
6350 if ((re->flags & PCRE_REQCHSET) != 0)
6351 {
6352 int ch = re->req_byte & 255;
6353 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6354 "" : " (caseless)";
6355 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
6356 else printf("Req char = \\x%02x%s\n", ch, caseless);
6357 }
6358
6359 pcre_printint(re, stdout, TRUE);
6360
6361 /* This check is done here in the debugging case so that the code that
6362 was compiled can be seen. */
6363
6364 if (code - codestart > length)
6365 {
6366 (pcre_free)(re);
6367 *errorptr = find_error_text(ERR23);
6368 *erroroffset = ptr - (uschar *)pattern;
6369 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6370 return NULL;
6371 }
6372 #endif /* DEBUG */
6373
6374 return (pcre *)re;
6375 }
6376
6377 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5