/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 276 - (show annotations)
Wed Nov 21 15:39:20 2007 UTC (7 years, 6 months ago) by ph10
File MIME type: text/plain
File size: 195240 byte(s)
Error occurred while calculating annotation data.
Remove two redundant, never-reachable lines of code whose function had been
moved.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57 used by pcretest. DEBUG is not defined when building a production library. */
58
59 #ifdef DEBUG
60 #include "pcre_printint.src"
61 #endif
62
63
/* Macro for setting individual bits in class bitmaps. Bit b lives in byte b/8,
at position b%8 within that byte. Both arguments are parenthesized so that an
expression such as "c + 1" can be passed safely. Note that b is still evaluated
twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67
68 /* Maximum length value to check against when making sure that the integer that
69 holds the compiled pattern length does not overflow. We make it a bit less than
70 INT_MAX to allow for adding in group terminating bytes, so that we don't have
71 to check them every time. */
72
73 #define OFLOW_MAX (INT_MAX - 20)
74
75
76 /*************************************************
77 * Code parameters and static tables *
78 *************************************************/
79
80 /* This value specifies the size of stack workspace that is used during the
81 first pre-compile phase that determines how much memory is required. The regex
82 is partly compiled into this space, but the compiled parts are discarded as
83 soon as they can be, so that hopefully there will never be an overrun. The code
84 does, however, check for an overrun. The largest amount I've seen used is 218,
85 so this number is very generous.
86
87 The same workspace is used during the second, actual compile phase for
88 remembering forward references to groups so that they can be filled in at the
89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90 is 4 there is plenty of room. */
91
92 #define COMPILE_WORK_SIZE (4096)
93
94
95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96 are simple data values; negative values are for special things like \d and so
97 on. Zero means further processing is needed (for things like \x), or the escape
98 is invalid. */
99
100 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 static const short int escapes[] = {
102 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 0, 0, -ESC_z /* x - z */
112 };
113
114 #else /* This is the "abnormal" table for EBCDIC systems */
115 static const short int escapes[] = {
116 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139 };
140 #endif
141
142
143 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144 searched linearly. Put all the names into a single string, in order to reduce
145 the number of relocations when a shared library is dynamically linked. */
146
147 typedef struct verbitem {
148 int len;
149 int op;
150 } verbitem;
151
152 static const char verbnames[] =
153 "ACCEPT\0"
154 "COMMIT\0"
155 "F\0"
156 "FAIL\0"
157 "PRUNE\0"
158 "SKIP\0"
159 "THEN";
160
161 static verbitem verbs[] = {
162 { 6, OP_ACCEPT },
163 { 6, OP_COMMIT },
164 { 1, OP_FAIL },
165 { 4, OP_FAIL },
166 { 5, OP_PRUNE },
167 { 4, OP_SKIP },
168 { 4, OP_THEN }
169 };
170
171 static int verbcount = sizeof(verbs)/sizeof(verbitem);
172
173
174 /* Tables of names of POSIX character classes and their lengths. The names are
175 now all in a single string, to reduce the number of relocations when a shared
176 library is dynamically loaded. The list of lengths is terminated by a zero
177 length entry. The first three must be alpha, lower, upper, as this is assumed
178 for handling case independence. */
179
180 static const char posix_names[] =
181 "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0"
182 "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0"
183 "word\0" "xdigit";
184
185 static const uschar posix_name_lengths[] = {
186 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
187
188 /* Table of class bit maps for each POSIX class. Each class is formed from a
189 base map, with an optional addition or removal of another map. Then, for some
190 classes, there is some additional tweaking: for [:blank:] the vertical space
191 characters are removed, and for [:alpha:] and [:alnum:] the underscore
192 character is removed. The triples in the table consist of the base map offset,
193 second map offset or -1 if no second map, and a non-negative value for map
194 addition or a negative value for map subtraction (if there are two maps). The
195 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
196 remove vertical space characters, 2 => remove underscore. */
197
198 static const int posix_class_maps[] = {
199 cbit_word, cbit_digit, -2, /* alpha */
200 cbit_lower, -1, 0, /* lower */
201 cbit_upper, -1, 0, /* upper */
202 cbit_word, -1, 2, /* alnum - word without underscore */
203 cbit_print, cbit_cntrl, 0, /* ascii */
204 cbit_space, -1, 1, /* blank - a GNU extension */
205 cbit_cntrl, -1, 0, /* cntrl */
206 cbit_digit, -1, 0, /* digit */
207 cbit_graph, -1, 0, /* graph */
208 cbit_print, -1, 0, /* print */
209 cbit_punct, -1, 0, /* punct */
210 cbit_space, -1, 0, /* space */
211 cbit_word, -1, 0, /* word - a Perl extension */
212 cbit_xdigit,-1, 0 /* xdigit */
213 };
214
215
216 #define STRING(a) # a
217 #define XSTRING(s) STRING(s)
218
219 /* The texts of compile-time error messages. These are "char *" because they
220 are passed to the outside world. Do not ever re-use any error number, because
221 they are documented. Always add a new error instead. Messages marked DEAD below
222 are no longer used. This used to be a table of strings, but in order to reduce
223 the number of relocations needed when a shared library is loaded dynamically,
224 it is now one long string. We cannot use a table of offsets, because the
225 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
226 simply count through to the one we want - this isn't a performance issue
227 because these strings are used only when there is a compilation error. */
228
229 static const char error_texts[] =
230 "no error\0"
231 "\\ at end of pattern\0"
232 "\\c at end of pattern\0"
233 "unrecognized character follows \\\0"
234 "numbers out of order in {} quantifier\0"
235 /* 5 */
236 "number too big in {} quantifier\0"
237 "missing terminating ] for character class\0"
238 "invalid escape sequence in character class\0"
239 "range out of order in character class\0"
240 "nothing to repeat\0"
241 /* 10 */
242 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
243 "internal error: unexpected repeat\0"
244 "unrecognized character after (? or (?-\0"
245 "POSIX named classes are supported only within a class\0"
246 "missing )\0"
247 /* 15 */
248 "reference to non-existent subpattern\0"
249 "erroffset passed as NULL\0"
250 "unknown option bit(s) set\0"
251 "missing ) after comment\0"
252 "parentheses nested too deeply\0" /** DEAD **/
253 /* 20 */
254 "regular expression is too large\0"
255 "failed to get memory\0"
256 "unmatched parentheses\0"
257 "internal error: code overflow\0"
258 "unrecognized character after (?<\0"
259 /* 25 */
260 "lookbehind assertion is not fixed length\0"
261 "malformed number or name after (?(\0"
262 "conditional group contains more than two branches\0"
263 "assertion expected after (?(\0"
264 "(?R or (?[+-]digits must be followed by )\0"
265 /* 30 */
266 "unknown POSIX class name\0"
267 "POSIX collating elements are not supported\0"
268 "this version of PCRE is not compiled with PCRE_UTF8 support\0"
269 "spare error\0" /** DEAD **/
270 "character value in \\x{...} sequence is too large\0"
271 /* 35 */
272 "invalid condition (?(0)\0"
273 "\\C not allowed in lookbehind assertion\0"
274 "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
275 "number after (?C is > 255\0"
276 "closing ) for (?C expected\0"
277 /* 40 */
278 "recursive call could loop indefinitely\0"
279 "unrecognized character after (?P\0"
280 "syntax error in subpattern name (missing terminator)\0"
281 "two named subpatterns have the same name\0"
282 "invalid UTF-8 string\0"
283 /* 45 */
284 "support for \\P, \\p, and \\X has not been compiled\0"
285 "malformed \\P or \\p sequence\0"
286 "unknown property name after \\P or \\p\0"
287 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
288 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
289 /* 50 */
290 "repeated subpattern is too long\0" /** DEAD **/
291 "octal value is greater than \\377 (not in UTF-8 mode)\0"
292 "internal error: overran compiling workspace\0"
293 "internal error: previously-checked referenced subpattern not found\0"
294 "DEFINE group contains more than one branch\0"
295 /* 55 */
296 "repeating a DEFINE group is not allowed\0"
297 "inconsistent NEWLINE options\0"
298 "\\g is not followed by a braced name or an optionally braced non-zero number\0"
299 "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0"
300 "(*VERB) with an argument is not supported\0"
301 /* 60 */
302 "(*VERB) not recognized\0"
303 "number is too big\0"
304 "subpattern name expected\0"
305 "digit expected after (?+";
306
307
308 /* Table to identify digits and hex digits. This is used when compiling
309 patterns. Note that the tables in chartables are dependent on the locale, and
310 may mark arbitrary characters as digits - but the PCRE compiling code expects
311 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
312 a private table here. It costs 256 bytes, but it is a lot faster than doing
313 character value tests (at least in some simple cases I timed), and in some
314 applications one wants PCRE to compile efficiently as well as match
315 efficiently.
316
317 For convenience, we use the same bit definitions as in chartables:
318
319 0x04 decimal digit
320 0x08 hexadecimal digit
321
322 Then we can use ctype_digit and ctype_xdigit in the code. */
323
324 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
325 static const unsigned char digitab[] =
326 {
327 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
328 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
329 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
330 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
331 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
332 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
333 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
334 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
335 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
336 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
337 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
338 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
339 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
340 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
341 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
342 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
343 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
344 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
345 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
346 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
347 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
348 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
349 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
350 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
351 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
352 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
353 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
354 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
355 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
356 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
357 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
358 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
359
360 #else /* This is the "abnormal" case, for EBCDIC systems */
361 static const unsigned char digitab[] =
362 {
363 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
364 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
365 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
366 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
367 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
368 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
369 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
370 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
371 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
372 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
373 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
374 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
375 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
376 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
377 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
378 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
379 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
380 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
381 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
382 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
383 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
384 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
385 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
386 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
387 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
388 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
389 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
390 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
391 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
392 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
393 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
394 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
395
396 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
397 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
398 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
399 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
400 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
401 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
402 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
403 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
404 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
405 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
406 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
407 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
408 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
409 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
410 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
411 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
412 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
413 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
414 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
415 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
416 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
417 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
418 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
419 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
420 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
421 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
422 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
423 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
424 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
425 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
426 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
427 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
428 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
429 #endif
430
431
432 /* Definition to allow mutual recursion */
433
434 static BOOL
435 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
436 int *, int *, branch_chain *, compile_data *, int *);
437
438
439
440 /*************************************************
441 * Find an error text *
442 *************************************************/
443
444 /* The error texts are now all in one long string, to save on relocations. As
445 some of the text is of unknown length, we can't use a table of offsets.
446 Instead, just count through the strings. This is not a performance issue
447 because it happens only when there has been a compilation error.
448
449 Argument: the error number
450 Returns: pointer to the error string
451 */
452
453 static const char *
454 find_error_text(int n)
455 {
456 const char *s = error_texts;
457 for (; n > 0; n--) while (*s++ != 0);
458 return s;
459 }
460
461
462 /*************************************************
463 * Handle escapes *
464 *************************************************/
465
466 /* This function is called when a \ has been encountered. It either returns a
467 positive value for a simple escape such as \n, or a negative value which
468 encodes one of the more complicated things such as \d. A backreference to group
469 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
470 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
471 ptr is pointing at the \. On exit, it is on the final character of the escape
472 sequence.
473
474 Arguments:
475 ptrptr points to the pattern position pointer
476 errorcodeptr points to the errorcode variable
477 bracount number of previous extracting brackets
478 options the options bits
479 isclass TRUE if inside a character class
480
481 Returns: zero or positive => a data character
482 negative => a special escape sequence
483 on error, errorcodeptr is set
484 */
485
486 static int
487 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
488 int options, BOOL isclass)
489 {
490 BOOL utf8 = (options & PCRE_UTF8) != 0;
491 const uschar *ptr = *ptrptr + 1;
492 int c, i;
493
494 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
495 ptr--; /* Set pointer back to the last byte */
496
497 /* If backslash is at the end of the pattern, it's an error. */
498
499 if (c == 0) *errorcodeptr = ERR1;
500
501 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
502 in a table. A non-zero result is something that can be returned immediately.
503 Otherwise further processing may be required. */
504
505 #ifndef EBCDIC /* ASCII coding */
506 else if (c < '0' || c > 'z') {} /* Not alphanumeric */
507 else if ((i = escapes[c - '0']) != 0) c = i;
508
509 #else /* EBCDIC coding */
510 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
511 else if ((i = escapes[c - 0x48]) != 0) c = i;
512 #endif
513
514 /* Escapes that need further processing, or are illegal. */
515
516 else
517 {
518 const uschar *oldptr;
519 BOOL braced, negated;
520
521 switch (c)
522 {
523 /* A number of Perl escapes are not handled by PCRE. We give an explicit
524 error. */
525
526 case 'l':
527 case 'L':
528 case 'N':
529 case 'u':
530 case 'U':
531 *errorcodeptr = ERR37;
532 break;
533
534 /* \g must be followed by a number, either plain or braced. If positive, it
535 is an absolute backreference. If negative, it is a relative backreference.
536 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
537 reference to a named group. This is part of Perl's movement towards a
538 unified syntax for back references. As this is synonymous with \k{name}, we
539 fudge it up by pretending it really was \k. */
540
541 case 'g':
542 if (ptr[1] == '{')
543 {
544 const uschar *p;
545 for (p = ptr+2; *p != 0 && *p != '}'; p++)
546 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
547 if (*p != 0 && *p != '}')
548 {
549 c = -ESC_k;
550 break;
551 }
552 braced = TRUE;
553 ptr++;
554 }
555 else braced = FALSE;
556
557 if (ptr[1] == '-')
558 {
559 negated = TRUE;
560 ptr++;
561 }
562 else negated = FALSE;
563
564 c = 0;
565 while ((digitab[ptr[1]] & ctype_digit) != 0)
566 c = c * 10 + *(++ptr) - '0';
567
568 if (c < 0)
569 {
570 *errorcodeptr = ERR61;
571 break;
572 }
573
574 if (c == 0 || (braced && *(++ptr) != '}'))
575 {
576 *errorcodeptr = ERR57;
577 break;
578 }
579
580 if (negated)
581 {
582 if (c > bracount)
583 {
584 *errorcodeptr = ERR15;
585 break;
586 }
587 c = bracount - (c - 1);
588 }
589
590 c = -(ESC_REF + c);
591 break;
592
593 /* The handling of escape sequences consisting of a string of digits
594 starting with one that is not zero is not straightforward. By experiment,
595 the way Perl works seems to be as follows:
596
597 Outside a character class, the digits are read as a decimal number. If the
598 number is less than 10, or if there are that many previous extracting
599 left brackets, then it is a back reference. Otherwise, up to three octal
600 digits are read to form an escaped byte. Thus \123 is likely to be octal
601 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
602 value is greater than 377, the least significant 8 bits are taken. Inside a
603 character class, \ followed by a digit is always an octal number. */
604
605 case '1': case '2': case '3': case '4': case '5':
606 case '6': case '7': case '8': case '9':
607
608 if (!isclass)
609 {
610 oldptr = ptr;
611 c -= '0';
612 while ((digitab[ptr[1]] & ctype_digit) != 0)
613 c = c * 10 + *(++ptr) - '0';
614 if (c < 0)
615 {
616 *errorcodeptr = ERR61;
617 break;
618 }
619 if (c < 10 || c <= bracount)
620 {
621 c = -(ESC_REF + c);
622 break;
623 }
624 ptr = oldptr; /* Put the pointer back and fall through */
625 }
626
627 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
628 generates a binary zero byte and treats the digit as a following literal.
629 Thus we have to pull back the pointer by one. */
630
631 if ((c = *ptr) >= '8')
632 {
633 ptr--;
634 c = 0;
635 break;
636 }
637
638 /* \0 always starts an octal number, but we may drop through to here with a
639 larger first octal digit. The original code used just to take the least
640 significant 8 bits of octal numbers (I think this is what early Perls used
641 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
642 than 3 octal digits. */
643
644 case '0':
645 c -= '0';
646 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
647 c = c * 8 + *(++ptr) - '0';
648 if (!utf8 && c > 255) *errorcodeptr = ERR51;
649 break;
650
651 /* \x is complicated. \x{ddd} is a character number which can be greater
652 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
653 treated as a data character. */
654
655 case 'x':
656 if (ptr[1] == '{')
657 {
658 const uschar *pt = ptr + 2;
659 int count = 0;
660
661 c = 0;
662 while ((digitab[*pt] & ctype_xdigit) != 0)
663 {
664 register int cc = *pt++;
665 if (c == 0 && cc == '0') continue; /* Leading zeroes */
666 count++;
667
668 #ifndef EBCDIC /* ASCII coding */
669 if (cc >= 'a') cc -= 32; /* Convert to upper case */
670 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
671 #else /* EBCDIC coding */
672 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
673 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
674 #endif
675 }
676
677 if (*pt == '}')
678 {
679 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
680 ptr = pt;
681 break;
682 }
683
684 /* If the sequence of hex digits does not end with '}', then we don't
685 recognize this construct; fall through to the normal \x handling. */
686 }
687
688 /* Read just a single-byte hex-defined char */
689
690 c = 0;
691 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
692 {
693 int cc; /* Some compilers don't like ++ */
694 cc = *(++ptr); /* in initializers */
695 #ifndef EBCDIC /* ASCII coding */
696 if (cc >= 'a') cc -= 32; /* Convert to upper case */
697 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
698 #else /* EBCDIC coding */
699 if (cc <= 'z') cc += 64; /* Convert to upper case */
700 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
701 #endif
702 }
703 break;
704
705 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
706 This coding is ASCII-specific, but then the whole concept of \cx is
707 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
708
709 case 'c':
710 c = *(++ptr);
711 if (c == 0)
712 {
713 *errorcodeptr = ERR2;
714 break;
715 }
716
717 #ifndef EBCDIC /* ASCII coding */
718 if (c >= 'a' && c <= 'z') c -= 32;
719 c ^= 0x40;
720 #else /* EBCDIC coding */
721 if (c >= 'a' && c <= 'z') c += 64;
722 c ^= 0xC0;
723 #endif
724 break;
725
726 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
727 other alphanumeric following \ is an error if PCRE_EXTRA was set;
728 otherwise, for Perl compatibility, it is a literal. This code looks a bit
729 odd, but there used to be some cases other than the default, and there may
730 be again in future, so I haven't "optimized" it. */
731
732 default:
733 if ((options & PCRE_EXTRA) != 0) switch(c)
734 {
735 default:
736 *errorcodeptr = ERR3;
737 break;
738 }
739 break;
740 }
741 }
742
743 *ptrptr = ptr;
744 return c;
745 }
746
747
748
749 #ifdef SUPPORT_UCP
750 /*************************************************
751 * Handle \P and \p *
752 *************************************************/
753
754 /* This function is called after \P or \p has been encountered, provided that
755 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
756 pointing at the P or p. On exit, it is pointing at the final character of the
757 escape sequence.
758
759 Argument:
760 ptrptr points to the pattern position pointer
761 negptr points to a boolean that is set TRUE for negation else FALSE
762 dptr points to an int that is set to the detailed property value
763 errorcodeptr points to the error code variable
764
765 Returns: type value from ucp_type_table, or -1 for an invalid type
766 */
767
768 static int
769 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
770 {
771 int c, i, bot, top;
772 const uschar *ptr = *ptrptr;
773 char name[32];
774
775 c = *(++ptr);
776 if (c == 0) goto ERROR_RETURN;
777
778 *negptr = FALSE;
779
780 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
781 negation. */
782
783 if (c == '{')
784 {
785 if (ptr[1] == '^')
786 {
787 *negptr = TRUE;
788 ptr++;
789 }
790 for (i = 0; i < (int)sizeof(name) - 1; i++)
791 {
792 c = *(++ptr);
793 if (c == 0) goto ERROR_RETURN;
794 if (c == '}') break;
795 name[i] = c;
796 }
797 if (c !='}') goto ERROR_RETURN;
798 name[i] = 0;
799 }
800
801 /* Otherwise there is just one following character */
802
803 else
804 {
805 name[0] = c;
806 name[1] = 0;
807 }
808
809 *ptrptr = ptr;
810
811 /* Search for a recognized property name using binary chop */
812
813 bot = 0;
814 top = _pcre_utt_size;
815
816 while (bot < top)
817 {
818 i = (bot + top) >> 1;
819 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
820 if (c == 0)
821 {
822 *dptr = _pcre_utt[i].value;
823 return _pcre_utt[i].type;
824 }
825 if (c > 0) bot = i + 1; else top = i;
826 }
827
828 *errorcodeptr = ERR47;
829 *ptrptr = ptr;
830 return -1;
831
832 ERROR_RETURN:
833 *errorcodeptr = ERR46;
834 *ptrptr = ptr;
835 return -1;
836 }
837 #endif
838
839
840
841
842 /*************************************************
843 * Check for counted repeat *
844 *************************************************/
845
846 /* This function is called when a '{' is encountered in a place where it might
847 start a quantifier. It looks ahead to see if it really is a quantifier or not.
848 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
849 where the ddds are digits.
850
851 Arguments:
852 p pointer to the first char after '{'
853
854 Returns: TRUE or FALSE
855 */
856
857 static BOOL
858 is_counted_repeat(const uschar *p)
859 {
860 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
861 while ((digitab[*p] & ctype_digit) != 0) p++;
862 if (*p == '}') return TRUE;
863
864 if (*p++ != ',') return FALSE;
865 if (*p == '}') return TRUE;
866
867 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
868 while ((digitab[*p] & ctype_digit) != 0) p++;
869
870 return (*p == '}');
871 }
872
873
874
875 /*************************************************
876 * Read repeat counts *
877 *************************************************/
878
879 /* Read an item of the form {n,m} and return the values. This is called only
880 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
881 so the syntax is guaranteed to be correct, but we need to check the values.
882
883 Arguments:
884 p pointer to first char after '{'
885 minp pointer to int for min
886 maxp pointer to int for max
887 returned as -1 if no max
888 errorcodeptr points to error code variable
889
890 Returns: pointer to '}' on success;
891 current ptr on error, with errorcodeptr set non-zero
892 */
893
894 static const uschar *
895 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
896 {
897 int min = 0;
898 int max = -1;
899
900 /* Read the minimum value and do a paranoid check: a negative value indicates
901 an integer overflow. */
902
903 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
904 if (min < 0 || min > 65535)
905 {
906 *errorcodeptr = ERR5;
907 return p;
908 }
909
910 /* Read the maximum value if there is one, and again do a paranoid on its size.
911 Also, max must not be less than min. */
912
913 if (*p == '}') max = min; else
914 {
915 if (*(++p) != '}')
916 {
917 max = 0;
918 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
919 if (max < 0 || max > 65535)
920 {
921 *errorcodeptr = ERR5;
922 return p;
923 }
924 if (max < min)
925 {
926 *errorcodeptr = ERR4;
927 return p;
928 }
929 }
930 }
931
932 /* Fill in the required variables, and pass back the pointer to the terminating
933 '}'. */
934
935 *minp = min;
936 *maxp = max;
937 return p;
938 }
939
940
941
942 /*************************************************
943 * Find forward referenced subpattern *
944 *************************************************/
945
946 /* This function scans along a pattern's text looking for capturing
947 subpatterns, and counting them. If it finds a named pattern that matches the
948 name it is given, it returns its number. Alternatively, if the name is NULL, it
949 returns when it reaches a given numbered subpattern. This is used for forward
950 references to subpatterns. We know that if (?P< is encountered, the name will
951 be terminated by '>' because that is checked in the first pass.
952
953 Arguments:
954 ptr current position in the pattern
955 count current count of capturing parens so far encountered
956 name name to seek, or NULL if seeking a numbered subpattern
957 lorn name length, or subpattern number if name is NULL
958 xmode TRUE if we are in /x mode
959
960 Returns: the number of the named subpattern, or -1 if not found
961 */
962
963 static int
964 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
965 BOOL xmode)
966 {
967 const uschar *thisname;
968
969 for (; *ptr != 0; ptr++)
970 {
971 int term;
972
973 /* Skip over backslashed characters and also entire \Q...\E */
974
975 if (*ptr == '\\')
976 {
977 if (*(++ptr) == 0) return -1;
978 if (*ptr == 'Q') for (;;)
979 {
980 while (*(++ptr) != 0 && *ptr != '\\');
981 if (*ptr == 0) return -1;
982 if (*(++ptr) == 'E') break;
983 }
984 continue;
985 }
986
987 /* Skip over character classes */
988
989 if (*ptr == '[')
990 {
991 while (*(++ptr) != ']')
992 {
993 if (*ptr == 0) return -1;
994 if (*ptr == '\\')
995 {
996 if (*(++ptr) == 0) return -1;
997 if (*ptr == 'Q') for (;;)
998 {
999 while (*(++ptr) != 0 && *ptr != '\\');
1000 if (*ptr == 0) return -1;
1001 if (*(++ptr) == 'E') break;
1002 }
1003 continue;
1004 }
1005 }
1006 continue;
1007 }
1008
1009 /* Skip comments in /x mode */
1010
1011 if (xmode && *ptr == '#')
1012 {
1013 while (*(++ptr) != 0 && *ptr != '\n');
1014 if (*ptr == 0) return -1;
1015 continue;
1016 }
1017
1018 /* An opening parens must now be a real metacharacter */
1019
1020 if (*ptr != '(') continue;
1021 if (ptr[1] != '?' && ptr[1] != '*')
1022 {
1023 count++;
1024 if (name == NULL && count == lorn) return count;
1025 continue;
1026 }
1027
1028 ptr += 2;
1029 if (*ptr == 'P') ptr++; /* Allow optional P */
1030
1031 /* We have to disambiguate (?<! and (?<= from (?<name> */
1032
1033 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1034 *ptr != '\'')
1035 continue;
1036
1037 count++;
1038
1039 if (name == NULL && count == lorn) return count;
1040 term = *ptr++;
1041 if (term == '<') term = '>';
1042 thisname = ptr;
1043 while (*ptr != term) ptr++;
1044 if (name != NULL && lorn == ptr - thisname &&
1045 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1046 return count;
1047 }
1048
1049 return -1;
1050 }
1051
1052
1053
1054 /*************************************************
1055 * Find first significant op code *
1056 *************************************************/
1057
1058 /* This is called by several functions that scan a compiled expression looking
1059 for a fixed first character, or an anchoring op code etc. It skips over things
1060 that do not influence this. For some calls, a change of option is important.
1061 For some calls, it makes sense to skip negative forward and all backward
1062 assertions, and also the \b assertion; for others it does not.
1063
1064 Arguments:
1065 code pointer to the start of the group
1066 options pointer to external options
1067 optbit the option bit whose changing is significant, or
1068 zero if none are
1069 skipassert TRUE if certain assertions are to be skipped
1070
1071 Returns: pointer to the first significant opcode
1072 */
1073
1074 static const uschar*
1075 first_significant_code(const uschar *code, int *options, int optbit,
1076 BOOL skipassert)
1077 {
1078 for (;;)
1079 {
1080 switch ((int)*code)
1081 {
1082 case OP_OPT:
1083 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1084 *options = (int)code[1];
1085 code += 2;
1086 break;
1087
1088 case OP_ASSERT_NOT:
1089 case OP_ASSERTBACK:
1090 case OP_ASSERTBACK_NOT:
1091 if (!skipassert) return code;
1092 do code += GET(code, 1); while (*code == OP_ALT);
1093 code += _pcre_OP_lengths[*code];
1094 break;
1095
1096 case OP_WORD_BOUNDARY:
1097 case OP_NOT_WORD_BOUNDARY:
1098 if (!skipassert) return code;
1099 /* Fall through */
1100
1101 case OP_CALLOUT:
1102 case OP_CREF:
1103 case OP_RREF:
1104 case OP_DEF:
1105 code += _pcre_OP_lengths[*code];
1106 break;
1107
1108 default:
1109 return code;
1110 }
1111 }
1112 /* Control never reaches here */
1113 }
1114
1115
1116
1117
1118 /*************************************************
1119 * Find the fixed length of a pattern *
1120 *************************************************/
1121
1122 /* Scan a pattern and compute the fixed length of subject that will match it,
1123 if the length is fixed. This is needed for dealing with backward assertions.
1124 In UTF8 mode, the result is in characters rather than bytes.
1125
1126 Arguments:
1127 code points to the start of the pattern (the bracket)
1128 options the compiling options
1129
1130 Returns: the fixed length, or -1 if there is no fixed length,
1131 or -2 if \C was encountered
1132 */
1133
1134 static int
1135 find_fixedlength(uschar *code, int options)
1136 {
1137 int length = -1;
1138
1139 register int branchlength = 0;
1140 register uschar *cc = code + 1 + LINK_SIZE;
1141
1142 /* Scan along the opcodes for this branch. If we get to the end of the
1143 branch, check the length against that of the other branches. */
1144
1145 for (;;)
1146 {
1147 int d;
1148 register int op = *cc;
1149 switch (op)
1150 {
1151 case OP_CBRA:
1152 case OP_BRA:
1153 case OP_ONCE:
1154 case OP_COND:
1155 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1156 if (d < 0) return d;
1157 branchlength += d;
1158 do cc += GET(cc, 1); while (*cc == OP_ALT);
1159 cc += 1 + LINK_SIZE;
1160 break;
1161
1162 /* Reached end of a branch; if it's a ket it is the end of a nested
1163 call. If it's ALT it is an alternation in a nested call. If it is
1164 END it's the end of the outer call. All can be handled by the same code. */
1165
1166 case OP_ALT:
1167 case OP_KET:
1168 case OP_KETRMAX:
1169 case OP_KETRMIN:
1170 case OP_END:
1171 if (length < 0) length = branchlength;
1172 else if (length != branchlength) return -1;
1173 if (*cc != OP_ALT) return length;
1174 cc += 1 + LINK_SIZE;
1175 branchlength = 0;
1176 break;
1177
1178 /* Skip over assertive subpatterns */
1179
1180 case OP_ASSERT:
1181 case OP_ASSERT_NOT:
1182 case OP_ASSERTBACK:
1183 case OP_ASSERTBACK_NOT:
1184 do cc += GET(cc, 1); while (*cc == OP_ALT);
1185 /* Fall through */
1186
1187 /* Skip over things that don't match chars */
1188
1189 case OP_REVERSE:
1190 case OP_CREF:
1191 case OP_RREF:
1192 case OP_DEF:
1193 case OP_OPT:
1194 case OP_CALLOUT:
1195 case OP_SOD:
1196 case OP_SOM:
1197 case OP_EOD:
1198 case OP_EODN:
1199 case OP_CIRC:
1200 case OP_DOLL:
1201 case OP_NOT_WORD_BOUNDARY:
1202 case OP_WORD_BOUNDARY:
1203 cc += _pcre_OP_lengths[*cc];
1204 break;
1205
1206 /* Handle literal characters */
1207
1208 case OP_CHAR:
1209 case OP_CHARNC:
1210 case OP_NOT:
1211 branchlength++;
1212 cc += 2;
1213 #ifdef SUPPORT_UTF8
1214 if ((options & PCRE_UTF8) != 0)
1215 {
1216 while ((*cc & 0xc0) == 0x80) cc++;
1217 }
1218 #endif
1219 break;
1220
1221 /* Handle exact repetitions. The count is already in characters, but we
1222 need to skip over a multibyte character in UTF8 mode. */
1223
1224 case OP_EXACT:
1225 branchlength += GET2(cc,1);
1226 cc += 4;
1227 #ifdef SUPPORT_UTF8
1228 if ((options & PCRE_UTF8) != 0)
1229 {
1230 while((*cc & 0x80) == 0x80) cc++;
1231 }
1232 #endif
1233 break;
1234
1235 case OP_TYPEEXACT:
1236 branchlength += GET2(cc,1);
1237 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1238 cc += 4;
1239 break;
1240
1241 /* Handle single-char matchers */
1242
1243 case OP_PROP:
1244 case OP_NOTPROP:
1245 cc += 2;
1246 /* Fall through */
1247
1248 case OP_NOT_DIGIT:
1249 case OP_DIGIT:
1250 case OP_NOT_WHITESPACE:
1251 case OP_WHITESPACE:
1252 case OP_NOT_WORDCHAR:
1253 case OP_WORDCHAR:
1254 case OP_ANY:
1255 branchlength++;
1256 cc++;
1257 break;
1258
1259 /* The single-byte matcher isn't allowed */
1260
1261 case OP_ANYBYTE:
1262 return -2;
1263
1264 /* Check a class for variable quantification */
1265
1266 #ifdef SUPPORT_UTF8
1267 case OP_XCLASS:
1268 cc += GET(cc, 1) - 33;
1269 /* Fall through */
1270 #endif
1271
1272 case OP_CLASS:
1273 case OP_NCLASS:
1274 cc += 33;
1275
1276 switch (*cc)
1277 {
1278 case OP_CRSTAR:
1279 case OP_CRMINSTAR:
1280 case OP_CRQUERY:
1281 case OP_CRMINQUERY:
1282 return -1;
1283
1284 case OP_CRRANGE:
1285 case OP_CRMINRANGE:
1286 if (GET2(cc,1) != GET2(cc,3)) return -1;
1287 branchlength += GET2(cc,1);
1288 cc += 5;
1289 break;
1290
1291 default:
1292 branchlength++;
1293 }
1294 break;
1295
1296 /* Anything else is variable length */
1297
1298 default:
1299 return -1;
1300 }
1301 }
1302 /* Control never gets here */
1303 }
1304
1305
1306
1307
1308 /*************************************************
1309 * Scan compiled regex for numbered bracket *
1310 *************************************************/
1311
1312 /* This little function scans through a compiled pattern until it finds a
1313 capturing bracket with the given number.
1314
1315 Arguments:
1316 code points to start of expression
1317 utf8 TRUE in UTF-8 mode
1318 number the required bracket number
1319
1320 Returns: pointer to the opcode for the bracket, or NULL if not found
1321 */
1322
1323 static const uschar *
1324 find_bracket(const uschar *code, BOOL utf8, int number)
1325 {
1326 for (;;)
1327 {
1328 register int c = *code;
1329 if (c == OP_END) return NULL;
1330
1331 /* XCLASS is used for classes that cannot be represented just by a bit
1332 map. This includes negated single high-valued characters. The length in
1333 the table is zero; the actual length is stored in the compiled code. */
1334
1335 if (c == OP_XCLASS) code += GET(code, 1);
1336
1337 /* Handle capturing bracket */
1338
1339 else if (c == OP_CBRA)
1340 {
1341 int n = GET2(code, 1+LINK_SIZE);
1342 if (n == number) return (uschar *)code;
1343 code += _pcre_OP_lengths[c];
1344 }
1345
1346 /* Otherwise, we can get the item's length from the table, except that for
1347 repeated character types, we have to test for \p and \P, which have an extra
1348 two bytes of parameters. */
1349
1350 else
1351 {
1352 switch(c)
1353 {
1354 case OP_TYPESTAR:
1355 case OP_TYPEMINSTAR:
1356 case OP_TYPEPLUS:
1357 case OP_TYPEMINPLUS:
1358 case OP_TYPEQUERY:
1359 case OP_TYPEMINQUERY:
1360 case OP_TYPEPOSSTAR:
1361 case OP_TYPEPOSPLUS:
1362 case OP_TYPEPOSQUERY:
1363 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1364 break;
1365
1366 case OP_TYPEUPTO:
1367 case OP_TYPEMINUPTO:
1368 case OP_TYPEEXACT:
1369 case OP_TYPEPOSUPTO:
1370 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1371 break;
1372 }
1373
1374 /* Add in the fixed length from the table */
1375
1376 code += _pcre_OP_lengths[c];
1377
1378 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1379 a multi-byte character. The length in the table is a minimum, so we have to
1380 arrange to skip the extra bytes. */
1381
1382 #ifdef SUPPORT_UTF8
1383 if (utf8) switch(c)
1384 {
1385 case OP_CHAR:
1386 case OP_CHARNC:
1387 case OP_EXACT:
1388 case OP_UPTO:
1389 case OP_MINUPTO:
1390 case OP_POSUPTO:
1391 case OP_STAR:
1392 case OP_MINSTAR:
1393 case OP_POSSTAR:
1394 case OP_PLUS:
1395 case OP_MINPLUS:
1396 case OP_POSPLUS:
1397 case OP_QUERY:
1398 case OP_MINQUERY:
1399 case OP_POSQUERY:
1400 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1401 break;
1402 }
1403 #endif
1404 }
1405 }
1406 }
1407
1408
1409
1410 /*************************************************
1411 * Scan compiled regex for recursion reference *
1412 *************************************************/
1413
1414 /* This little function scans through a compiled pattern until it finds an
1415 instance of OP_RECURSE.
1416
1417 Arguments:
1418 code points to start of expression
1419 utf8 TRUE in UTF-8 mode
1420
1421 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1422 */
1423
1424 static const uschar *
1425 find_recurse(const uschar *code, BOOL utf8)
1426 {
1427 for (;;)
1428 {
1429 register int c = *code;
1430 if (c == OP_END) return NULL;
1431 if (c == OP_RECURSE) return code;
1432
1433 /* XCLASS is used for classes that cannot be represented just by a bit
1434 map. This includes negated single high-valued characters. The length in
1435 the table is zero; the actual length is stored in the compiled code. */
1436
1437 if (c == OP_XCLASS) code += GET(code, 1);
1438
1439 /* Otherwise, we can get the item's length from the table, except that for
1440 repeated character types, we have to test for \p and \P, which have an extra
1441 two bytes of parameters. */
1442
1443 else
1444 {
1445 switch(c)
1446 {
1447 case OP_TYPESTAR:
1448 case OP_TYPEMINSTAR:
1449 case OP_TYPEPLUS:
1450 case OP_TYPEMINPLUS:
1451 case OP_TYPEQUERY:
1452 case OP_TYPEMINQUERY:
1453 case OP_TYPEPOSSTAR:
1454 case OP_TYPEPOSPLUS:
1455 case OP_TYPEPOSQUERY:
1456 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1457 break;
1458
1459 case OP_TYPEPOSUPTO:
1460 case OP_TYPEUPTO:
1461 case OP_TYPEMINUPTO:
1462 case OP_TYPEEXACT:
1463 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1464 break;
1465 }
1466
1467 /* Add in the fixed length from the table */
1468
1469 code += _pcre_OP_lengths[c];
1470
1471 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1472 by a multi-byte character. The length in the table is a minimum, so we have
1473 to arrange to skip the extra bytes. */
1474
1475 #ifdef SUPPORT_UTF8
1476 if (utf8) switch(c)
1477 {
1478 case OP_CHAR:
1479 case OP_CHARNC:
1480 case OP_EXACT:
1481 case OP_UPTO:
1482 case OP_MINUPTO:
1483 case OP_POSUPTO:
1484 case OP_STAR:
1485 case OP_MINSTAR:
1486 case OP_POSSTAR:
1487 case OP_PLUS:
1488 case OP_MINPLUS:
1489 case OP_POSPLUS:
1490 case OP_QUERY:
1491 case OP_MINQUERY:
1492 case OP_POSQUERY:
1493 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1494 break;
1495 }
1496 #endif
1497 }
1498 }
1499 }
1500
1501
1502
1503 /*************************************************
1504 * Scan compiled branch for non-emptiness *
1505 *************************************************/
1506
1507 /* This function scans through a branch of a compiled pattern to see whether it
1508 can match the empty string or not. It is called from could_be_empty()
1509 below and from compile_branch() when checking for an unlimited repeat of a
1510 group that can match nothing. Note that first_significant_code() skips over
1511 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1512 struck an inner bracket whose current branch will already have been scanned.
1513
1514 Arguments:
1515 code points to start of search
1516 endcode points to where to stop
1517 utf8 TRUE if in UTF8 mode
1518
1519 Returns: TRUE if what is matched could be empty
1520 */
1521
1522 static BOOL
1523 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1524 {
1525 register int c;
1526 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1527 code < endcode;
1528 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1529 {
1530 const uschar *ccode;
1531
1532 c = *code;
1533
1534 /* Groups with zero repeats can of course be empty; skip them. */
1535
1536 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1537 {
1538 code += _pcre_OP_lengths[c];
1539 do code += GET(code, 1); while (*code == OP_ALT);
1540 c = *code;
1541 continue;
1542 }
1543
1544 /* For other groups, scan the branches. */
1545
1546 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1547 {
1548 BOOL empty_branch;
1549 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1550
1551 /* Scan a closed bracket */
1552
1553 empty_branch = FALSE;
1554 do
1555 {
1556 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1557 empty_branch = TRUE;
1558 code += GET(code, 1);
1559 }
1560 while (*code == OP_ALT);
1561 if (!empty_branch) return FALSE; /* All branches are non-empty */
1562 c = *code;
1563 continue;
1564 }
1565
1566 /* Handle the other opcodes */
1567
1568 switch (c)
1569 {
1570 /* Check for quantifiers after a class. XCLASS is used for classes that
1571 cannot be represented just by a bit map. This includes negated single
1572 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1573 actual length is stored in the compiled code, so we must update "code"
1574 here. */
1575
1576 #ifdef SUPPORT_UTF8
1577 case OP_XCLASS:
1578 ccode = code += GET(code, 1);
1579 goto CHECK_CLASS_REPEAT;
1580 #endif
1581
1582 case OP_CLASS:
1583 case OP_NCLASS:
1584 ccode = code + 33;
1585
1586 #ifdef SUPPORT_UTF8
1587 CHECK_CLASS_REPEAT:
1588 #endif
1589
1590 switch (*ccode)
1591 {
1592 case OP_CRSTAR: /* These could be empty; continue */
1593 case OP_CRMINSTAR:
1594 case OP_CRQUERY:
1595 case OP_CRMINQUERY:
1596 break;
1597
1598 default: /* Non-repeat => class must match */
1599 case OP_CRPLUS: /* These repeats aren't empty */
1600 case OP_CRMINPLUS:
1601 return FALSE;
1602
1603 case OP_CRRANGE:
1604 case OP_CRMINRANGE:
1605 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1606 break;
1607 }
1608 break;
1609
1610 /* Opcodes that must match a character */
1611
1612 case OP_PROP:
1613 case OP_NOTPROP:
1614 case OP_EXTUNI:
1615 case OP_NOT_DIGIT:
1616 case OP_DIGIT:
1617 case OP_NOT_WHITESPACE:
1618 case OP_WHITESPACE:
1619 case OP_NOT_WORDCHAR:
1620 case OP_WORDCHAR:
1621 case OP_ANY:
1622 case OP_ANYBYTE:
1623 case OP_CHAR:
1624 case OP_CHARNC:
1625 case OP_NOT:
1626 case OP_PLUS:
1627 case OP_MINPLUS:
1628 case OP_POSPLUS:
1629 case OP_EXACT:
1630 case OP_NOTPLUS:
1631 case OP_NOTMINPLUS:
1632 case OP_NOTPOSPLUS:
1633 case OP_NOTEXACT:
1634 case OP_TYPEPLUS:
1635 case OP_TYPEMINPLUS:
1636 case OP_TYPEPOSPLUS:
1637 case OP_TYPEEXACT:
1638 return FALSE;
1639
1640 /* These are going to continue, as they may be empty, but we have to
1641 fudge the length for the \p and \P cases. */
1642
1643 case OP_TYPESTAR:
1644 case OP_TYPEMINSTAR:
1645 case OP_TYPEPOSSTAR:
1646 case OP_TYPEQUERY:
1647 case OP_TYPEMINQUERY:
1648 case OP_TYPEPOSQUERY:
1649 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1650 break;
1651
1652 /* Same for these */
1653
1654 case OP_TYPEUPTO:
1655 case OP_TYPEMINUPTO:
1656 case OP_TYPEPOSUPTO:
1657 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1658 break;
1659
1660 /* End of branch */
1661
1662 case OP_KET:
1663 case OP_KETRMAX:
1664 case OP_KETRMIN:
1665 case OP_ALT:
1666 return TRUE;
1667
1668 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1669 MINUPTO, and POSUPTO may be followed by a multibyte character */
1670
1671 #ifdef SUPPORT_UTF8
1672 case OP_STAR:
1673 case OP_MINSTAR:
1674 case OP_POSSTAR:
1675 case OP_QUERY:
1676 case OP_MINQUERY:
1677 case OP_POSQUERY:
1678 case OP_UPTO:
1679 case OP_MINUPTO:
1680 case OP_POSUPTO:
1681 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1682 break;
1683 #endif
1684 }
1685 }
1686
1687 return TRUE;
1688 }
1689
1690
1691
1692 /*************************************************
1693 * Scan compiled regex for non-emptiness *
1694 *************************************************/
1695
1696 /* This function is called to check for left recursive calls. We want to check
1697 the current branch of the current pattern to see if it could match the empty
1698 string. If it could, we must look outwards for branches at other levels,
1699 stopping when we pass beyond the bracket which is the subject of the recursion.
1700
1701 Arguments:
1702 code points to start of the recursion
1703 endcode points to where to stop (current RECURSE item)
1704 bcptr points to the chain of current (unclosed) branch starts
1705 utf8 TRUE if in UTF-8 mode
1706
1707 Returns: TRUE if what is matched could be empty
1708 */
1709
1710 static BOOL
1711 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1712 BOOL utf8)
1713 {
1714 while (bcptr != NULL && bcptr->current >= code)
1715 {
1716 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1717 bcptr = bcptr->outer;
1718 }
1719 return TRUE;
1720 }
1721
1722
1723
1724 /*************************************************
1725 * Check for POSIX class syntax *
1726 *************************************************/
1727
1728 /* This function is called when the sequence "[:" or "[." or "[=" is
1729 encountered in a character class. It checks whether this is followed by an
1730 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1731 ".]" or "=]".
1732
1733 Arguments:
1734 ptr pointer to the initial [
1735 endptr where to return the end pointer
1736 cd pointer to compile data
1737
1738 Returns: TRUE or FALSE
1739 */
1740
1741 static BOOL
1742 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1743 {
1744 int terminator; /* Don't combine these lines; the Solaris cc */
1745 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1746 if (*(++ptr) == '^') ptr++;
1747 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1748 if (*ptr == terminator && ptr[1] == ']')
1749 {
1750 *endptr = ptr;
1751 return TRUE;
1752 }
1753 return FALSE;
1754 }
1755
1756
1757
1758
1759 /*************************************************
1760 * Check POSIX class name *
1761 *************************************************/
1762
1763 /* This function is called to check the name given in a POSIX-style class entry
1764 such as [:alnum:].
1765
1766 Arguments:
1767 ptr points to the first letter
1768 len the length of the name
1769
1770 Returns: a value representing the name, or -1 if unknown
1771 */
1772
1773 static int
1774 check_posix_name(const uschar *ptr, int len)
1775 {
1776 const char *pn = posix_names;
1777 register int yield = 0;
1778 while (posix_name_lengths[yield] != 0)
1779 {
1780 if (len == posix_name_lengths[yield] &&
1781 strncmp((const char *)ptr, pn, len) == 0) return yield;
1782 pn += posix_name_lengths[yield] + 1;
1783 yield++;
1784 }
1785 return -1;
1786 }
1787
1788
1789 /*************************************************
1790 * Adjust OP_RECURSE items in repeated group *
1791 *************************************************/
1792
1793 /* OP_RECURSE items contain an offset from the start of the regex to the group
1794 that is referenced. This means that groups can be replicated for fixed
1795 repetition simply by copying (because the recursion is allowed to refer to
1796 earlier groups that are outside the current group). However, when a group is
1797 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1798 it, after it has been compiled. This means that any OP_RECURSE items within it
1799 that refer to the group itself or any contained groups have to have their
1800 offsets adjusted. That is one of the jobs of this function. Before it is called,
1801 the partially compiled regex must be temporarily terminated with OP_END.
1802
1803 This function has been extended with the possibility of forward references for
1804 recursions and subroutine calls. It must also check the list of such references
1805 for the group we are dealing with. If it finds that one of the recursions in
1806 the current group is on this list, it adjusts the offset in the list, not the
1807 value in the reference (which is a group number).
1808
1809 Arguments:
1810 group points to the start of the group
1811 adjust the amount by which the group is to be moved
1812 utf8 TRUE in UTF-8 mode
1813 cd contains pointers to tables etc.
1814 save_hwm the hwm forward reference pointer at the start of the group
1815
1816 Returns: nothing
1817 */
1818
1819 static void
1820 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1821 uschar *save_hwm)
1822 {
1823 uschar *ptr = group;
1824
1825 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1826 {
1827 int offset;
1828 uschar *hc;
1829
1830 /* See if this recursion is on the forward reference list. If so, adjust the
1831 reference. */
1832
1833 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1834 {
1835 offset = GET(hc, 0);
1836 if (cd->start_code + offset == ptr + 1)
1837 {
1838 PUT(hc, 0, offset + adjust);
1839 break;
1840 }
1841 }
1842
1843 /* Otherwise, adjust the recursion offset if it's after the start of this
1844 group. */
1845
1846 if (hc >= cd->hwm)
1847 {
1848 offset = GET(ptr, 1);
1849 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1850 }
1851
1852 ptr += 1 + LINK_SIZE;
1853 }
1854 }
1855
1856
1857
1858 /*************************************************
1859 * Insert an automatic callout point *
1860 *************************************************/
1861
1862 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1863 callout points before each pattern item.
1864
1865 Arguments:
1866 code current code pointer
1867 ptr current pattern pointer
1868 cd pointers to tables etc
1869
1870 Returns: new code pointer
1871 */
1872
1873 static uschar *
1874 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1875 {
1876 *code++ = OP_CALLOUT;
1877 *code++ = 255;
1878 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1879 PUT(code, LINK_SIZE, 0); /* Default length */
1880 return code + 2*LINK_SIZE;
1881 }
1882
1883
1884
1885 /*************************************************
1886 * Complete a callout item *
1887 *************************************************/
1888
1889 /* A callout item contains the length of the next item in the pattern, which
1890 we can't fill in till after we have reached the relevant point. This is used
1891 for both automatic and manual callouts.
1892
1893 Arguments:
1894 previous_callout points to previous callout item
1895 ptr current pattern pointer
1896 cd pointers to tables etc
1897
1898 Returns: nothing
1899 */
1900
1901 static void
1902 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1903 {
1904 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1905 PUT(previous_callout, 2 + LINK_SIZE, length);
1906 }
1907
1908
1909
1910 #ifdef SUPPORT_UCP
1911 /*************************************************
1912 * Get othercase range *
1913 *************************************************/
1914
1915 /* This function is passed the start and end of a class range, in UTF-8 mode
1916 with UCP support. It searches up the characters, looking for internal ranges of
1917 characters in the "other" case. Each call returns the next one, updating the
1918 start address.
1919
1920 Arguments:
1921 cptr points to starting character value; updated
1922 d end value
1923 ocptr where to put start of othercase range
1924 odptr where to put end of othercase range
1925
1926 Yield: TRUE when range returned; FALSE when no more
1927 */
1928
1929 static BOOL
1930 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1931 unsigned int *odptr)
1932 {
1933 unsigned int c, othercase, next;
1934
1935 for (c = *cptr; c <= d; c++)
1936 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1937
1938 if (c > d) return FALSE;
1939
1940 *ocptr = othercase;
1941 next = othercase + 1;
1942
1943 for (++c; c <= d; c++)
1944 {
1945 if (_pcre_ucp_othercase(c) != next) break;
1946 next++;
1947 }
1948
1949 *odptr = next - 1;
1950 *cptr = c;
1951
1952 return TRUE;
1953 }
1954 #endif /* SUPPORT_UCP */
1955
1956
1957
1958 /*************************************************
1959 * Check if auto-possessifying is possible *
1960 *************************************************/
1961
1962 /* This function is called for unlimited repeats of certain items, to see
1963 whether the next thing could possibly match the repeated item. If not, it makes
1964 sense to automatically possessify the repeated item.
1965
1966 Arguments:
1967 op_code the repeated op code
1968 item data for this item, depends on the opcode
1969 utf8 TRUE in UTF-8 mode
1970 utf8_char used for utf8 character bytes, NULL if not relevant
1971 ptr next character in pattern
1972 options options bits
1973 cd contains pointers to tables etc.
1974
1975 Returns: TRUE if possessifying is wanted
1976 */
1977
1978 static BOOL
1979 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1980 const uschar *ptr, int options, compile_data *cd)
1981 {
1982 int next;
1983
1984 /* Skip whitespace and comments in extended mode */
1985
1986 if ((options & PCRE_EXTENDED) != 0)
1987 {
1988 for (;;)
1989 {
1990 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1991 if (*ptr == '#')
1992 {
1993 while (*(++ptr) != 0)
1994 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1995 }
1996 else break;
1997 }
1998 }
1999
2000 /* If the next item is one that we can handle, get its value. A non-negative
2001 value is a character, a negative value is an escape value. */
2002
2003 if (*ptr == '\\')
2004 {
2005 int temperrorcode = 0;
2006 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2007 if (temperrorcode != 0) return FALSE;
2008 ptr++; /* Point after the escape sequence */
2009 }
2010
2011 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2012 {
2013 #ifdef SUPPORT_UTF8
2014 if (utf8) { GETCHARINC(next, ptr); } else
2015 #endif
2016 next = *ptr++;
2017 }
2018
2019 else return FALSE;
2020
2021 /* Skip whitespace and comments in extended mode */
2022
2023 if ((options & PCRE_EXTENDED) != 0)
2024 {
2025 for (;;)
2026 {
2027 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2028 if (*ptr == '#')
2029 {
2030 while (*(++ptr) != 0)
2031 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2032 }
2033 else break;
2034 }
2035 }
2036
2037 /* If the next thing is itself optional, we have to give up. */
2038
2039 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2040 return FALSE;
2041
2042 /* Now compare the next item with the previous opcode. If the previous is a
2043 positive single character match, "item" either contains the character or, if
2044 "item" is greater than 127 in utf8 mode, the character's bytes are in
2045 utf8_char. */
2046
2047
2048 /* Handle cases when the next item is a character. */
2049
2050 if (next >= 0) switch(op_code)
2051 {
2052 case OP_CHAR:
2053 #ifdef SUPPORT_UTF8
2054 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2055 #endif
2056 return item != next;
2057
2058 /* For CHARNC (caseless character) we must check the other case. If we have
2059 Unicode property support, we can use it to test the other case of
2060 high-valued characters. */
2061
2062 case OP_CHARNC:
2063 #ifdef SUPPORT_UTF8
2064 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2065 #endif
2066 if (item == next) return FALSE;
2067 #ifdef SUPPORT_UTF8
2068 if (utf8)
2069 {
2070 unsigned int othercase;
2071 if (next < 128) othercase = cd->fcc[next]; else
2072 #ifdef SUPPORT_UCP
2073 othercase = _pcre_ucp_othercase((unsigned int)next);
2074 #else
2075 othercase = NOTACHAR;
2076 #endif
2077 return (unsigned int)item != othercase;
2078 }
2079 else
2080 #endif /* SUPPORT_UTF8 */
2081 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2082
2083 /* For OP_NOT, "item" must be a single-byte character. */
2084
2085 case OP_NOT:
2086 if (next < 0) return FALSE; /* Not a character */
2087 if (item == next) return TRUE;
2088 if ((options & PCRE_CASELESS) == 0) return FALSE;
2089 #ifdef SUPPORT_UTF8
2090 if (utf8)
2091 {
2092 unsigned int othercase;
2093 if (next < 128) othercase = cd->fcc[next]; else
2094 #ifdef SUPPORT_UCP
2095 othercase = _pcre_ucp_othercase(next);
2096 #else
2097 othercase = NOTACHAR;
2098 #endif
2099 return (unsigned int)item == othercase;
2100 }
2101 else
2102 #endif /* SUPPORT_UTF8 */
2103 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2104
2105 case OP_DIGIT:
2106 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2107
2108 case OP_NOT_DIGIT:
2109 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2110
2111 case OP_WHITESPACE:
2112 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2113
2114 case OP_NOT_WHITESPACE:
2115 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2116
2117 case OP_WORDCHAR:
2118 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2119
2120 case OP_NOT_WORDCHAR:
2121 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2122
2123 case OP_HSPACE:
2124 case OP_NOT_HSPACE:
2125 switch(next)
2126 {
2127 case 0x09:
2128 case 0x20:
2129 case 0xa0:
2130 case 0x1680:
2131 case 0x180e:
2132 case 0x2000:
2133 case 0x2001:
2134 case 0x2002:
2135 case 0x2003:
2136 case 0x2004:
2137 case 0x2005:
2138 case 0x2006:
2139 case 0x2007:
2140 case 0x2008:
2141 case 0x2009:
2142 case 0x200A:
2143 case 0x202f:
2144 case 0x205f:
2145 case 0x3000:
2146 return op_code != OP_HSPACE;
2147 default:
2148 return op_code == OP_HSPACE;
2149 }
2150
2151 case OP_VSPACE:
2152 case OP_NOT_VSPACE:
2153 switch(next)
2154 {
2155 case 0x0a:
2156 case 0x0b:
2157 case 0x0c:
2158 case 0x0d:
2159 case 0x85:
2160 case 0x2028:
2161 case 0x2029:
2162 return op_code != OP_VSPACE;
2163 default:
2164 return op_code == OP_VSPACE;
2165 }
2166
2167 default:
2168 return FALSE;
2169 }
2170
2171
2172 /* Handle the case when the next item is \d, \s, etc. */
2173
2174 switch(op_code)
2175 {
2176 case OP_CHAR:
2177 case OP_CHARNC:
2178 #ifdef SUPPORT_UTF8
2179 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2180 #endif
2181 switch(-next)
2182 {
2183 case ESC_d:
2184 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2185
2186 case ESC_D:
2187 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2188
2189 case ESC_s:
2190 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2191
2192 case ESC_S:
2193 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2194
2195 case ESC_w:
2196 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2197
2198 case ESC_W:
2199 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2200
2201 case ESC_h:
2202 case ESC_H:
2203 switch(item)
2204 {
2205 case 0x09:
2206 case 0x20:
2207 case 0xa0:
2208 case 0x1680:
2209 case 0x180e:
2210 case 0x2000:
2211 case 0x2001:
2212 case 0x2002:
2213 case 0x2003:
2214 case 0x2004:
2215 case 0x2005:
2216 case 0x2006:
2217 case 0x2007:
2218 case 0x2008:
2219 case 0x2009:
2220 case 0x200A:
2221 case 0x202f:
2222 case 0x205f:
2223 case 0x3000:
2224 return -next != ESC_h;
2225 default:
2226 return -next == ESC_h;
2227 }
2228
2229 case ESC_v:
2230 case ESC_V:
2231 switch(item)
2232 {
2233 case 0x0a:
2234 case 0x0b:
2235 case 0x0c:
2236 case 0x0d:
2237 case 0x85:
2238 case 0x2028:
2239 case 0x2029:
2240 return -next != ESC_v;
2241 default:
2242 return -next == ESC_v;
2243 }
2244
2245 default:
2246 return FALSE;
2247 }
2248
2249 case OP_DIGIT:
2250 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2251 next == -ESC_h || next == -ESC_v;
2252
2253 case OP_NOT_DIGIT:
2254 return next == -ESC_d;
2255
2256 case OP_WHITESPACE:
2257 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2258
2259 case OP_NOT_WHITESPACE:
2260 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2261
2262 case OP_HSPACE:
2263 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2264
2265 case OP_NOT_HSPACE:
2266 return next == -ESC_h;
2267
2268 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2269 case OP_VSPACE:
2270 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2271
2272 case OP_NOT_VSPACE:
2273 return next == -ESC_v;
2274
2275 case OP_WORDCHAR:
2276 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2277
2278 case OP_NOT_WORDCHAR:
2279 return next == -ESC_w || next == -ESC_d;
2280
2281 default:
2282 return FALSE;
2283 }
2284
2285 /* Control does not reach here */
2286 }
2287
2288
2289
2290 /*************************************************
2291 * Compile one branch *
2292 *************************************************/
2293
2294 /* Scan the pattern, compiling it into a vector. If the options are
2295 changed during the branch, the pointer is used to change the external options
2296 bits. This function is used during the pre-compile phase when we are trying
2297 to find out the amount of memory needed, as well as during the real compile
2298 phase. The value of lengthptr distinguishes the two phases.
2299
2300 Arguments:
2301 optionsptr pointer to the option bits
2302 codeptr points to the pointer to the current code point
2303 ptrptr points to the current pattern pointer
2304 errorcodeptr points to error code variable
2305 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2306 reqbyteptr set to the last literal character required, else < 0
2307 bcptr points to current branch chain
2308 cd contains pointers to tables etc.
2309 lengthptr NULL during the real compile phase
2310 points to length accumulator during pre-compile phase
2311
2312 Returns: TRUE on success
2313 FALSE, with *errorcodeptr set non-zero on error
2314 */
2315
2316 static BOOL
2317 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2318 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2319 compile_data *cd, int *lengthptr)
2320 {
2321 int repeat_type, op_type;
2322 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2323 int bravalue = 0;
2324 int greedy_default, greedy_non_default;
2325 int firstbyte, reqbyte;
2326 int zeroreqbyte, zerofirstbyte;
2327 int req_caseopt, reqvary, tempreqvary;
2328 int options = *optionsptr;
2329 int after_manual_callout = 0;
2330 int length_prevgroup = 0;
2331 register int c;
2332 register uschar *code = *codeptr;
2333 uschar *last_code = code;
2334 uschar *orig_code = code;
2335 uschar *tempcode;
2336 BOOL inescq = FALSE;
2337 BOOL groupsetfirstbyte = FALSE;
2338 const uschar *ptr = *ptrptr;
2339 const uschar *tempptr;
2340 uschar *previous = NULL;
2341 uschar *previous_callout = NULL;
2342 uschar *save_hwm = NULL;
2343 uschar classbits[32];
2344
2345 #ifdef SUPPORT_UTF8
2346 BOOL class_utf8;
2347 BOOL utf8 = (options & PCRE_UTF8) != 0;
2348 uschar *class_utf8data;
2349 uschar utf8_char[6];
2350 #else
2351 BOOL utf8 = FALSE;
2352 uschar *utf8_char = NULL;
2353 #endif
2354
2355 #ifdef DEBUG
2356 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2357 #endif
2358
2359 /* Set up the default and non-default settings for greediness */
2360
2361 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2362 greedy_non_default = greedy_default ^ 1;
2363
2364 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2365 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2366 matches a non-fixed char first char; reqbyte just remains unset if we never
2367 find one.
2368
2369 When we hit a repeat whose minimum is zero, we may have to adjust these values
2370 to take the zero repeat into account. This is implemented by setting them to
2371 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2372 item types that can be repeated set these backoff variables appropriately. */
2373
2374 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2375
2376 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2377 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2378 value > 255. It is added into the firstbyte or reqbyte variables to record the
2379 case status of the value. This is used only for ASCII characters. */
2380
2381 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2382
2383 /* Switch on next character until the end of the branch */
2384
2385 for (;; ptr++)
2386 {
2387 BOOL negate_class;
2388 BOOL should_flip_negation;
2389 BOOL possessive_quantifier;
2390 BOOL is_quantifier;
2391 BOOL is_recurse;
2392 BOOL reset_bracount;
2393 int class_charcount;
2394 int class_lastchar;
2395 int newoptions;
2396 int recno;
2397 int refsign;
2398 int skipbytes;
2399 int subreqbyte;
2400 int subfirstbyte;
2401 int terminator;
2402 int mclength;
2403 uschar mcbuffer[8];
2404
2405 /* Get next byte in the pattern */
2406
2407 c = *ptr;
2408
2409 /* If we are in the pre-compile phase, accumulate the length used for the
2410 previous cycle of this loop. */
2411
2412 if (lengthptr != NULL)
2413 {
2414 #ifdef DEBUG
2415 if (code > cd->hwm) cd->hwm = code; /* High water info */
2416 #endif
2417 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2418 {
2419 *errorcodeptr = ERR52;
2420 goto FAILED;
2421 }
2422
2423 /* There is at least one situation where code goes backwards: this is the
2424 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2425 the class is simply eliminated. However, it is created first, so we have to
2426 allow memory for it. Therefore, don't ever reduce the length at this point.
2427 */
2428
2429 if (code < last_code) code = last_code;
2430
2431 /* Paranoid check for integer overflow */
2432
2433 if (OFLOW_MAX - *lengthptr < code - last_code)
2434 {
2435 *errorcodeptr = ERR20;
2436 goto FAILED;
2437 }
2438
2439 *lengthptr += code - last_code;
2440 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2441
2442 /* If "previous" is set and it is not at the start of the work space, move
2443 it back to there, in order to avoid filling up the work space. Otherwise,
2444 if "previous" is NULL, reset the current code pointer to the start. */
2445
2446 if (previous != NULL)
2447 {
2448 if (previous > orig_code)
2449 {
2450 memmove(orig_code, previous, code - previous);
2451 code -= previous - orig_code;
2452 previous = orig_code;
2453 }
2454 }
2455 else code = orig_code;
2456
2457 /* Remember where this code item starts so we can pick up the length
2458 next time round. */
2459
2460 last_code = code;
2461 }
2462
2463 /* In the real compile phase, just check the workspace used by the forward
2464 reference list. */
2465
2466 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2467 {
2468 *errorcodeptr = ERR52;
2469 goto FAILED;
2470 }
2471
2472 /* If in \Q...\E, check for the end; if not, we have a literal */
2473
2474 if (inescq && c != 0)
2475 {
2476 if (c == '\\' && ptr[1] == 'E')
2477 {
2478 inescq = FALSE;
2479 ptr++;
2480 continue;
2481 }
2482 else
2483 {
2484 if (previous_callout != NULL)
2485 {
2486 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2487 complete_callout(previous_callout, ptr, cd);
2488 previous_callout = NULL;
2489 }
2490 if ((options & PCRE_AUTO_CALLOUT) != 0)
2491 {
2492 previous_callout = code;
2493 code = auto_callout(code, ptr, cd);
2494 }
2495 goto NORMAL_CHAR;
2496 }
2497 }
2498
2499 /* Fill in length of a previous callout, except when the next thing is
2500 a quantifier. */
2501
2502 is_quantifier = c == '*' || c == '+' || c == '?' ||
2503 (c == '{' && is_counted_repeat(ptr+1));
2504
2505 if (!is_quantifier && previous_callout != NULL &&
2506 after_manual_callout-- <= 0)
2507 {
2508 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2509 complete_callout(previous_callout, ptr, cd);
2510 previous_callout = NULL;
2511 }
2512
2513 /* In extended mode, skip white space and comments */
2514
2515 if ((options & PCRE_EXTENDED) != 0)
2516 {
2517 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2518 if (c == '#')
2519 {
2520 while (*(++ptr) != 0)
2521 {
2522 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2523 }
2524 if (*ptr != 0) continue;
2525
2526 /* Else fall through to handle end of string */
2527 c = 0;
2528 }
2529 }
2530
2531 /* No auto callout for quantifiers. */
2532
2533 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2534 {
2535 previous_callout = code;
2536 code = auto_callout(code, ptr, cd);
2537 }
2538
2539 switch(c)
2540 {
2541 /* ===================================================================*/
2542 case 0: /* The branch terminates at string end */
2543 case '|': /* or | or ) */
2544 case ')':
2545 *firstbyteptr = firstbyte;
2546 *reqbyteptr = reqbyte;
2547 *codeptr = code;
2548 *ptrptr = ptr;
2549 if (lengthptr != NULL)
2550 {
2551 if (OFLOW_MAX - *lengthptr < code - last_code)
2552 {
2553 *errorcodeptr = ERR20;
2554 goto FAILED;
2555 }
2556 *lengthptr += code - last_code; /* To include callout length */
2557 DPRINTF((">> end branch\n"));
2558 }
2559 return TRUE;
2560
2561
2562 /* ===================================================================*/
2563 /* Handle single-character metacharacters. In multiline mode, ^ disables
2564 the setting of any following char as a first character. */
2565
2566 case '^':
2567 if ((options & PCRE_MULTILINE) != 0)
2568 {
2569 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2570 }
2571 previous = NULL;
2572 *code++ = OP_CIRC;
2573 break;
2574
2575 case '$':
2576 previous = NULL;
2577 *code++ = OP_DOLL;
2578 break;
2579
2580 /* There can never be a first char if '.' is first, whatever happens about
2581 repeats. The value of reqbyte doesn't change either. */
2582
2583 case '.':
2584 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2585 zerofirstbyte = firstbyte;
2586 zeroreqbyte = reqbyte;
2587 previous = code;
2588 *code++ = OP_ANY;
2589 break;
2590
2591
2592 /* ===================================================================*/
2593 /* Character classes. If the included characters are all < 256, we build a
2594 32-byte bitmap of the permitted characters, except in the special case
2595 where there is only one such character. For negated classes, we build the
2596 map as usual, then invert it at the end. However, we use a different opcode
2597 so that data characters > 255 can be handled correctly.
2598
2599 If the class contains characters outside the 0-255 range, a different
2600 opcode is compiled. It may optionally have a bit map for characters < 256,
2601 but those above are are explicitly listed afterwards. A flag byte tells
2602 whether the bitmap is present, and whether this is a negated class or not.
2603 */
2604
2605 case '[':
2606 previous = code;
2607
2608 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2609 they are encountered at the top level, so we'll do that too. */
2610
2611 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2612 check_posix_syntax(ptr, &tempptr, cd))
2613 {
2614 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2615 goto FAILED;
2616 }
2617
2618 /* If the first character is '^', set the negation flag and skip it. Also,
2619 if the first few characters (either before or after ^) are \Q\E or \E we
2620 skip them too. This makes for compatibility with Perl. */
2621
2622 negate_class = FALSE;
2623 for (;;)
2624 {
2625 c = *(++ptr);
2626 if (c == '\\')
2627 {
2628 if (ptr[1] == 'E') ptr++;
2629 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2630 else break;
2631 }
2632 else if (!negate_class && c == '^')
2633 negate_class = TRUE;
2634 else break;
2635 }
2636
2637 /* If a class contains a negative special such as \S, we need to flip the
2638 negation flag at the end, so that support for characters > 255 works
2639 correctly (they are all included in the class). */
2640
2641 should_flip_negation = FALSE;
2642
2643 /* Keep a count of chars with values < 256 so that we can optimize the case
2644 of just a single character (as long as it's < 256). However, For higher
2645 valued UTF-8 characters, we don't yet do any optimization. */
2646
2647 class_charcount = 0;
2648 class_lastchar = -1;
2649
2650 /* Initialize the 32-char bit map to all zeros. We build the map in a
2651 temporary bit of memory, in case the class contains only 1 character (less
2652 than 256), because in that case the compiled code doesn't use the bit map.
2653 */
2654
2655 memset(classbits, 0, 32 * sizeof(uschar));
2656
2657 #ifdef SUPPORT_UTF8
2658 class_utf8 = FALSE; /* No chars >= 256 */
2659 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2660 #endif
2661
2662 /* Process characters until ] is reached. By writing this as a "do" it
2663 means that an initial ] is taken as a data character. At the start of the
2664 loop, c contains the first byte of the character. */
2665
2666 if (c != 0) do
2667 {
2668 const uschar *oldptr;
2669
2670 #ifdef SUPPORT_UTF8
2671 if (utf8 && c > 127)
2672 { /* Braces are required because the */
2673 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2674 }
2675 #endif
2676
2677 /* Inside \Q...\E everything is literal except \E */
2678
2679 if (inescq)
2680 {
2681 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2682 {
2683 inescq = FALSE; /* Reset literal state */
2684 ptr++; /* Skip the 'E' */
2685 continue; /* Carry on with next */
2686 }
2687 goto CHECK_RANGE; /* Could be range if \E follows */
2688 }
2689
2690 /* Handle POSIX class names. Perl allows a negation extension of the
2691 form [:^name:]. A square bracket that doesn't match the syntax is
2692 treated as a literal. We also recognize the POSIX constructions
2693 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2694 5.6 and 5.8 do. */
2695
2696 if (c == '[' &&
2697 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2698 check_posix_syntax(ptr, &tempptr, cd))
2699 {
2700 BOOL local_negate = FALSE;
2701 int posix_class, taboffset, tabopt;
2702 register const uschar *cbits = cd->cbits;
2703 uschar pbits[32];
2704
2705 if (ptr[1] != ':')
2706 {
2707 *errorcodeptr = ERR31;
2708 goto FAILED;
2709 }
2710
2711 ptr += 2;
2712 if (*ptr == '^')
2713 {
2714 local_negate = TRUE;
2715 should_flip_negation = TRUE; /* Note negative special */
2716 ptr++;
2717 }
2718
2719 posix_class = check_posix_name(ptr, tempptr - ptr);
2720 if (posix_class < 0)
2721 {
2722 *errorcodeptr = ERR30;
2723 goto FAILED;
2724 }
2725
2726 /* If matching is caseless, upper and lower are converted to
2727 alpha. This relies on the fact that the class table starts with
2728 alpha, lower, upper as the first 3 entries. */
2729
2730 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2731 posix_class = 0;
2732
2733 /* We build the bit map for the POSIX class in a chunk of local store
2734 because we may be adding and subtracting from it, and we don't want to
2735 subtract bits that may be in the main map already. At the end we or the
2736 result into the bit map that is being built. */
2737
2738 posix_class *= 3;
2739
2740 /* Copy in the first table (always present) */
2741
2742 memcpy(pbits, cbits + posix_class_maps[posix_class],
2743 32 * sizeof(uschar));
2744
2745 /* If there is a second table, add or remove it as required. */
2746
2747 taboffset = posix_class_maps[posix_class + 1];
2748 tabopt = posix_class_maps[posix_class + 2];
2749
2750 if (taboffset >= 0)
2751 {
2752 if (tabopt >= 0)
2753 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2754 else
2755 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2756 }
2757
2758 /* Not see if we need to remove any special characters. An option
2759 value of 1 removes vertical space and 2 removes underscore. */
2760
2761 if (tabopt < 0) tabopt = -tabopt;
2762 if (tabopt == 1) pbits[1] &= ~0x3c;
2763 else if (tabopt == 2) pbits[11] &= 0x7f;
2764
2765 /* Add the POSIX table or its complement into the main table that is
2766 being built and we are done. */
2767
2768 if (local_negate)
2769 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2770 else
2771 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2772
2773 ptr = tempptr + 1;
2774 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2775 continue; /* End of POSIX syntax handling */
2776 }
2777
2778 /* Backslash may introduce a single character, or it may introduce one
2779 of the specials, which just set a flag. The sequence \b is a special
2780 case. Inside a class (and only there) it is treated as backspace.
2781 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2782 to 'or' into the one we are building. We assume they have more than one
2783 character in them, so set class_charcount bigger than one. */
2784
2785 if (c == '\\')
2786 {
2787 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2788 if (*errorcodeptr != 0) goto FAILED;
2789
2790 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2791 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2792 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2793 else if (-c == ESC_Q) /* Handle start of quoted string */
2794 {
2795 if (ptr[1] == '\\' && ptr[2] == 'E')
2796 {
2797 ptr += 2; /* avoid empty string */
2798 }
2799 else inescq = TRUE;
2800 continue;
2801 }
2802 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2803
2804 if (c < 0)
2805 {
2806 register const uschar *cbits = cd->cbits;
2807 class_charcount += 2; /* Greater than 1 is what matters */
2808
2809 /* Save time by not doing this in the pre-compile phase. */
2810
2811 if (lengthptr == NULL) switch (-c)
2812 {
2813 case ESC_d:
2814 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2815 continue;
2816
2817 case ESC_D:
2818 should_flip_negation = TRUE;
2819 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2820 continue;
2821
2822 case ESC_w:
2823 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2824 continue;
2825
2826 case ESC_W:
2827 should_flip_negation = TRUE;
2828 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2829 continue;
2830
2831 case ESC_s:
2832 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2833 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2834 continue;
2835
2836 case ESC_S:
2837 should_flip_negation = TRUE;
2838 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2839 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2840 continue;
2841
2842 default: /* Not recognized; fall through */
2843 break; /* Need "default" setting to stop compiler warning. */
2844 }
2845
2846 /* In the pre-compile phase, just do the recognition. */
2847
2848 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2849 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2850
2851 /* We need to deal with \H, \h, \V, and \v in both phases because
2852 they use extra memory. */
2853
2854 if (-c == ESC_h)
2855 {
2856 SETBIT(classbits, 0x09); /* VT */
2857 SETBIT(classbits, 0x20); /* SPACE */
2858 SETBIT(classbits, 0xa0); /* NSBP */
2859 #ifdef SUPPORT_UTF8
2860 if (utf8)
2861 {
2862 class_utf8 = TRUE;
2863 *class_utf8data++ = XCL_SINGLE;
2864 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2865 *class_utf8data++ = XCL_SINGLE;
2866 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2867 *class_utf8data++ = XCL_RANGE;
2868 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2869 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2870 *class_utf8data++ = XCL_SINGLE;
2871 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2872 *class_utf8data++ = XCL_SINGLE;
2873 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2874 *class_utf8data++ = XCL_SINGLE;
2875 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2876 }
2877 #endif
2878 continue;
2879 }
2880
2881 if (-c == ESC_H)
2882 {
2883 for (c = 0; c < 32; c++)
2884 {
2885 int x = 0xff;
2886 switch (c)
2887 {
2888 case 0x09/8: x ^= 1 << (0x09%8); break;
2889 case 0x20/8: x ^= 1 << (0x20%8); break;
2890 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2891 default: break;
2892 }
2893 classbits[c] |= x;
2894 }
2895
2896 #ifdef SUPPORT_UTF8
2897 if (utf8)
2898 {
2899 class_utf8 = TRUE;
2900 *class_utf8data++ = XCL_RANGE;
2901 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2902 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2903 *class_utf8data++ = XCL_RANGE;
2904 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2905 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2906 *class_utf8data++ = XCL_RANGE;
2907 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2908 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2909 *class_utf8data++ = XCL_RANGE;
2910 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2911 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2912 *class_utf8data++ = XCL_RANGE;
2913 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2914 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2915 *class_utf8data++ = XCL_RANGE;
2916 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2917 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2918 *class_utf8data++ = XCL_RANGE;
2919 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2920 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2921 }
2922 #endif
2923 continue;
2924 }
2925
2926 if (-c == ESC_v)
2927 {
2928 SETBIT(classbits, 0x0a); /* LF */
2929 SETBIT(classbits, 0x0b); /* VT */
2930 SETBIT(classbits, 0x0c); /* FF */
2931 SETBIT(classbits, 0x0d); /* CR */
2932 SETBIT(classbits, 0x85); /* NEL */
2933 #ifdef SUPPORT_UTF8
2934 if (utf8)
2935 {
2936 class_utf8 = TRUE;
2937 *class_utf8data++ = XCL_RANGE;
2938 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2939 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2940 }
2941 #endif
2942 continue;
2943 }
2944
2945 if (-c == ESC_V)
2946 {
2947 for (c = 0; c < 32; c++)
2948 {
2949 int x = 0xff;
2950 switch (c)
2951 {
2952 case 0x0a/8: x ^= 1 << (0x0a%8);
2953 x ^= 1 << (0x0b%8);
2954 x ^= 1 << (0x0c%8);
2955 x ^= 1 << (0x0d%8);
2956 break;
2957 case 0x85/8: x ^= 1 << (0x85%8); break;
2958 default: break;
2959 }
2960 classbits[c] |= x;
2961 }
2962
2963 #ifdef SUPPORT_UTF8
2964 if (utf8)
2965 {
2966 class_utf8 = TRUE;
2967 *class_utf8data++ = XCL_RANGE;
2968 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2969 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2970 *class_utf8data++ = XCL_RANGE;
2971 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2972 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2973 }
2974 #endif
2975 continue;
2976 }
2977
2978 /* We need to deal with \P and \p in both phases. */
2979
2980 #ifdef SUPPORT_UCP
2981 if (-c == ESC_p || -c == ESC_P)
2982 {
2983 BOOL negated;
2984 int pdata;
2985 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2986 if (ptype < 0) goto FAILED;
2987 class_utf8 = TRUE;
2988 *class_utf8data++ = ((-c == ESC_p) != negated)?
2989 XCL_PROP : XCL_NOTPROP;
2990 *class_utf8data++ = ptype;
2991 *class_utf8data++ = pdata;
2992 class_charcount -= 2; /* Not a < 256 character */
2993 continue;
2994 }
2995 #endif
2996 /* Unrecognized escapes are faulted if PCRE is running in its
2997 strict mode. By default, for compatibility with Perl, they are
2998 treated as literals. */
2999
3000 if ((options & PCRE_EXTRA) != 0)
3001 {
3002 *errorcodeptr = ERR7;
3003 goto FAILED;
3004 }
3005
3006 class_charcount -= 2; /* Undo the default count from above */
3007 c = *ptr; /* Get the final character and fall through */
3008 }
3009
3010 /* Fall through if we have a single character (c >= 0). This may be
3011 greater than 256 in UTF-8 mode. */
3012
3013 } /* End of backslash handling */
3014
3015 /* A single character may be followed by '-' to form a range. However,
3016 Perl does not permit ']' to be the end of the range. A '-' character
3017 at the end is treated as a literal. Perl ignores orphaned \E sequences
3018 entirely. The code for handling \Q and \E is messy. */
3019
3020 CHECK_RANGE:
3021 while (ptr[1] == '\\' && ptr[2] == 'E')
3022 {
3023 inescq = FALSE;
3024 ptr += 2;
3025 }
3026
3027 oldptr = ptr;
3028
3029 /* Remember \r or \n */
3030
3031 if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3032
3033 /* Check for range */
3034
3035 if (!inescq && ptr[1] == '-')
3036 {
3037 int d;
3038 ptr += 2;
3039 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3040
3041 /* If we hit \Q (not followed by \E) at this point, go into escaped
3042 mode. */
3043
3044 while (*ptr == '\\' && ptr[1] == 'Q')
3045 {
3046 ptr += 2;
3047 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3048 inescq = TRUE;
3049 break;
3050 }
3051
3052 if (*ptr == 0 || (!inescq && *ptr == ']'))
3053 {
3054 ptr = oldptr;
3055 goto LONE_SINGLE_CHARACTER;
3056 }
3057
3058 #ifdef SUPPORT_UTF8
3059 if (utf8)
3060 { /* Braces are required because the */
3061 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3062 }
3063 else
3064 #endif
3065 d = *ptr; /* Not UTF-8 mode */
3066
3067 /* The second part of a range can be a single-character escape, but
3068 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3069 in such circumstances. */
3070
3071 if (!inescq && d == '\\')
3072 {
3073 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3074 if (*errorcodeptr != 0) goto FAILED;
3075
3076 /* \b is backspace; \X is literal X; \R is literal R; any other
3077 special means the '-' was literal */
3078
3079 if (d < 0)
3080 {
3081 if (d == -ESC_b) d = '\b';
3082 else if (d == -ESC_X) d = 'X';
3083 else if (d == -ESC_R) d = 'R'; else
3084 {
3085 ptr = oldptr;
3086 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3087 }
3088 }
3089 }
3090
3091 /* Check that the two values are in the correct order. Optimize
3092 one-character ranges */
3093
3094 if (d < c)
3095 {
3096 *errorcodeptr = ERR8;
3097 goto FAILED;
3098 }
3099
3100 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3101
3102 /* Remember \r or \n */
3103
3104 if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3105
3106 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3107 matching, we have to use an XCLASS with extra data items. Caseless
3108 matching for characters > 127 is available only if UCP support is
3109 available. */
3110
3111 #ifdef SUPPORT_UTF8
3112 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3113 {
3114 class_utf8 = TRUE;
3115
3116 /* With UCP support, we can find the other case equivalents of
3117 the relevant characters. There may be several ranges. Optimize how
3118 they fit with the basic range. */
3119
3120 #ifdef SUPPORT_UCP
3121 if ((options & PCRE_CASELESS) != 0)
3122 {
3123 unsigned int occ, ocd;
3124 unsigned int cc = c;
3125 unsigned int origd = d;
3126 while (get_othercase_range(&cc, origd, &occ, &ocd))
3127 {
3128 if (occ >= (unsigned int)c &&
3129 ocd <= (unsigned int)d)
3130 continue; /* Skip embedded ranges */
3131
3132 if (occ < (unsigned int)c &&
3133 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3134 { /* if there is overlap, */
3135 c = occ; /* noting that if occ < c */
3136 continue; /* we can't have ocd > d */
3137 } /* because a subrange is */
3138 if (ocd > (unsigned int)d &&
3139 occ <= (unsigned int)d + 1) /* always shorter than */
3140 { /* the basic range. */
3141 d = ocd;
3142 continue;
3143 }
3144
3145 if (occ == ocd)
3146 {
3147 *class_utf8data++ = XCL_SINGLE;
3148 }
3149 else
3150 {
3151 *class_utf8data++ = XCL_RANGE;
3152 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3153 }
3154 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3155 }
3156 }
3157 #endif /* SUPPORT_UCP */
3158
3159 /* Now record the original range, possibly modified for UCP caseless
3160 overlapping ranges. */
3161
3162 *class_utf8data++ = XCL_RANGE;
3163 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3164 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3165
3166 /* With UCP support, we are done. Without UCP support, there is no
3167 caseless matching for UTF-8 characters > 127; we can use the bit map
3168 for the smaller ones. */
3169
3170 #ifdef SUPPORT_UCP
3171 continue; /* With next character in the class */
3172 #else
3173 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3174
3175 /* Adjust upper limit and fall through to set up the map */
3176
3177 d = 127;
3178
3179 #endif /* SUPPORT_UCP */
3180 }
3181 #endif /* SUPPORT_UTF8 */
3182
3183 /* We use the bit map for all cases when not in UTF-8 mode; else
3184 ranges that lie entirely within 0-127 when there is UCP support; else
3185 for partial ranges without UCP support. */
3186
3187 class_charcount += d - c + 1;
3188 class_lastchar = d;
3189
3190 /* We can save a bit of time by skipping this in the pre-compile. */
3191
3192 if (lengthptr == NULL) for (; c <= d; c++)
3193 {
3194 classbits[c/8] |= (1 << (c&7));
3195 if ((options & PCRE_CASELESS) != 0)
3196 {
3197 int uc = cd->fcc[c]; /* flip case */
3198 classbits[uc/8] |= (1 << (uc&7));
3199 }
3200 }
3201
3202 continue; /* Go get the next char in the class */
3203 }
3204
3205 /* Handle a lone single character - we can get here for a normal
3206 non-escape char, or after \ that introduces a single character or for an
3207 apparent range that isn't. */
3208
3209 LONE_SINGLE_CHARACTER:
3210
3211 /* Handle a character that cannot go in the bit map */
3212
3213 #ifdef SUPPORT_UTF8
3214 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3215 {
3216 class_utf8 = TRUE;
3217 *class_utf8data++ = XCL_SINGLE;
3218 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3219
3220 #ifdef SUPPORT_UCP
3221 if ((options & PCRE_CASELESS) != 0)
3222 {
3223 unsigned int othercase;
3224 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3225 {
3226 *class_utf8data++ = XCL_SINGLE;
3227 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3228 }
3229 }
3230 #endif /* SUPPORT_UCP */
3231
3232 }
3233 else
3234 #endif /* SUPPORT_UTF8 */
3235
3236 /* Handle a single-byte character */
3237 {
3238 classbits[c/8] |= (1 << (c&7));
3239 if ((options & PCRE_CASELESS) != 0)
3240 {
3241 c = cd->fcc[c]; /* flip case */
3242 classbits[c/8] |= (1 << (c&7));
3243 }
3244 class_charcount++;
3245 class_lastchar = c;
3246 }
3247 }
3248
3249 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3250
3251 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3252
3253 if (c == 0) /* Missing terminating ']' */
3254 {
3255 *errorcodeptr = ERR6;
3256 goto FAILED;
3257 }
3258
3259
3260 /* This code has been disabled because it would mean that \s counts as
3261 an explicit \r or \n reference, and that's not really what is wanted. Now
3262 we set the flag only if there is a literal "\r" or "\n" in the class. */
3263
3264 #if 0
3265 /* Remember whether \r or \n are in this class */
3266
3267 if (negate_class)
3268 {
3269 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3270 }
3271 else
3272 {
3273 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3274 }
3275 #endif
3276
3277
3278 /* If class_charcount is 1, we saw precisely one character whose value is
3279 less than 256. As long as there were no characters >= 128 and there was no
3280 use of \p or \P, in other words, no use of any XCLASS features, we can
3281 optimize.
3282
3283 In UTF-8 mode, we can optimize the negative case only if there were no
3284 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3285 operate on single-bytes only. This is an historical hangover. Maybe one day
3286 we can tidy these opcodes to handle multi-byte characters.
3287
3288 The optimization throws away the bit map. We turn the item into a
3289 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3290 that OP_NOT does not support multibyte characters. In the positive case, it
3291 can cause firstbyte to be set. Otherwise, there can be no first char if
3292 this item is first, whatever repeat count may follow. In the case of
3293 reqbyte, save the previous value for reinstating. */
3294
3295 #ifdef SUPPORT_UTF8
3296 if (class_charcount == 1 && !class_utf8 &&
3297 (!utf8 || !negate_class || class_lastchar < 128))
3298 #else
3299 if (class_charcount == 1)
3300 #endif
3301 {
3302 zeroreqbyte = reqbyte;
3303
3304 /* The OP_NOT opcode works on one-byte characters only. */
3305
3306 if (negate_class)
3307 {
3308 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3309 zerofirstbyte = firstbyte;
3310 *code++ = OP_NOT;
3311 *code++ = class_lastchar;
3312 break;
3313 }
3314
3315 /* For a single, positive character, get the value into mcbuffer, and
3316 then we can handle this with the normal one-character code. */
3317
3318 #ifdef SUPPORT_UTF8
3319 if (utf8 && class_lastchar > 127)
3320 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3321 else
3322 #endif
3323 {
3324 mcbuffer[0] = class_lastchar;
3325 mclength = 1;
3326 }
3327 goto ONE_CHAR;
3328 } /* End of 1-char optimization */
3329
3330 /* The general case - not the one-char optimization. If this is the first
3331 thing in the branch, there can be no first char setting, whatever the
3332 repeat count. Any reqbyte setting must remain unchanged after any kind of
3333 repeat. */
3334
3335 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3336 zerofirstbyte = firstbyte;
3337 zeroreqbyte = reqbyte;
3338
3339 /* If there are characters with values > 255, we have to compile an
3340 extended class, with its own opcode, unless there was a negated special
3341 such as \S in the class, because in that case all characters > 255 are in
3342 the class, so any that were explicitly given as well can be ignored. If
3343 (when there are explicit characters > 255 that must be listed) there are no
3344 characters < 256, we can omit the bitmap in the actual compiled code. */
3345
3346 #ifdef SUPPORT_UTF8
3347 if (class_utf8 && !should_flip_negation)
3348 {
3349 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3350 *code++ = OP_XCLASS;
3351 code += LINK_SIZE;
3352 *code = negate_class? XCL_NOT : 0;
3353
3354 /* If the map is required, move up the extra data to make room for it;
3355 otherwise just move the code pointer to the end of the extra data. */
3356
3357 if (class_charcount > 0)
3358 {
3359 *code++ |= XCL_MAP;
3360 memmove(code + 32, code, class_utf8data - code);
3361 memcpy(code, classbits, 32);
3362 code = class_utf8data + 32;
3363 }
3364 else code = class_utf8data;
3365
3366 /* Now fill in the complete length of the item */
3367
3368 PUT(previous, 1, code - previous);
3369 break; /* End of class handling */
3370 }
3371 #endif
3372
3373 /* If there are no characters > 255, set the opcode to OP_CLASS or
3374 OP_NCLASS, depending on whether the whole class was negated and whether
3375 there were negative specials such as \S in the class. Then copy the 32-byte
3376 map into the code vector, negating it if necessary. */
3377
3378 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3379 if (negate_class)
3380 {
3381 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3382 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3383 }
3384 else
3385 {
3386 memcpy(code, classbits, 32);
3387 }
3388 code += 32;
3389 break;
3390
3391
3392 /* ===================================================================*/
3393 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3394 has been tested above. */
3395
3396 case '{':
3397 if (!is_quantifier) goto NORMAL_CHAR;
3398 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3399 if (*errorcodeptr != 0) goto FAILED;
3400 goto REPEAT;
3401
3402 case '*':
3403 repeat_min = 0;
3404 repeat_max = -1;
3405 goto REPEAT;
3406
3407 case '+':
3408 repeat_min = 1;
3409 repeat_max = -1;
3410 goto REPEAT;
3411
3412 case '?':
3413 repeat_min = 0;
3414 repeat_max = 1;
3415
3416 REPEAT:
3417 if (previous == NULL)
3418 {
3419 *errorcodeptr = ERR9;
3420 goto FAILED;
3421 }
3422
3423 if (repeat_min == 0)
3424 {
3425 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3426 reqbyte = zeroreqbyte; /* Ditto */
3427 }
3428
3429 /* Remember whether this is a variable length repeat */
3430
3431 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3432
3433 op_type = 0; /* Default single-char op codes */
3434 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3435
3436 /* Save start of previous item, in case we have to move it up to make space
3437 for an inserted OP_ONCE for the additional '+' extension. */
3438
3439 tempcode = previous;
3440
3441 /* If the next character is '+', we have a possessive quantifier. This
3442 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3443 If the next character is '?' this is a minimizing repeat, by default,
3444 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3445 repeat type to the non-default. */
3446
3447 if (ptr[1] == '+')
3448 {
3449 repeat_type = 0; /* Force greedy */
3450 possessive_quantifier = TRUE;
3451 ptr++;
3452 }
3453 else if (ptr[1] == '?')
3454 {
3455 repeat_type = greedy_non_default;
3456 ptr++;
3457 }
3458 else repeat_type = greedy_default;
3459
3460 /* If previous was a character match, abolish the item and generate a
3461 repeat item instead. If a char item has a minimum of more than one, ensure
3462 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3463 the first thing in a branch because the x will have gone into firstbyte
3464 instead. */
3465
3466 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3467 {
3468 /* Deal with UTF-8 characters that take up more than one byte. It's
3469 easier to write this out separately than try to macrify it. Use c to
3470 hold the length of the character in bytes, plus 0x80 to flag that it's a
3471 length rather than a small character. */
3472
3473 #ifdef SUPPORT_UTF8
3474 if (utf8 && (code[-1] & 0x80) != 0)
3475 {
3476 uschar *lastchar = code - 1;
3477 while((*lastchar & 0xc0) == 0x80) lastchar--;
3478 c = code - lastchar; /* Length of UTF-8 character */
3479 memcpy(utf8_char, lastchar, c); /* Save the char */
3480 c |= 0x80; /* Flag c as a length */
3481 }
3482 else
3483 #endif
3484
3485 /* Handle the case of a single byte - either with no UTF8 support, or
3486 with UTF-8 disabled, or for a UTF-8 character < 128. */
3487
3488 {
3489 c = code[-1];
3490 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3491 }
3492
3493 /* If the repetition is unlimited, it pays to see if the next thing on
3494 the line is something that cannot possibly match this character. If so,
3495 automatically possessifying this item gains some performance in the case
3496 where the match fails. */
3497
3498 if (!possessive_quantifier &&
3499 repeat_max < 0 &&
3500 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3501 options, cd))
3502 {
3503 repeat_type = 0; /* Force greedy */
3504 possessive_quantifier = TRUE;
3505 }
3506
3507 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3508 }
3509
3510 /* If previous was a single negated character ([^a] or similar), we use
3511 one of the special opcodes, replacing it. The code is shared with single-
3512 character repeats by setting op_type to add a suitable offset into
3513 repeat_type. We can also test for auto-possessification. OP_NOT is
3514 currently used only for single-byte chars. */
3515
3516 else if (*previous == OP_NOT)
3517 {
3518 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3519 c = previous[1];
3520 if (!possessive_quantifier &&
3521 repeat_max < 0 &&
3522 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3523 {
3524 repeat_type = 0; /* Force greedy */
3525 possessive_quantifier = TRUE;
3526 }
3527 goto OUTPUT_SINGLE_REPEAT;
3528 }
3529
3530 /* If previous was a character type match (\d or similar), abolish it and
3531 create a suitable repeat item. The code is shared with single-character
3532 repeats by setting op_type to add a suitable offset into repeat_type. Note
3533 that the Unicode property types will be present only when SUPPORT_UCP is
3534 defined, but we don't wrap the little bits of code here because it just
3535 makes it horribly messy. */
3536
3537 else if (*previous < OP_EODN)
3538 {
3539 uschar *oldcode;
3540 int prop_type, prop_value;
3541 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3542 c = *previous;
3543
3544 if (!possessive_quantifier &&
3545 repeat_max < 0 &&
3546 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3547 {
3548 repeat_type = 0; /* Force greedy */
3549 possessive_quantifier = TRUE;
3550 }
3551
3552 OUTPUT_SINGLE_REPEAT:
3553 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3554 {
3555 prop_type = previous[1];
3556 prop_value = previous[2];
3557 }
3558 else prop_type = prop_value = -1;
3559
3560 oldcode = code;
3561 code = previous; /* Usually overwrite previous item */
3562
3563 /* If the maximum is zero then the minimum must also be zero; Perl allows
3564 this case, so we do too - by simply omitting the item altogether. */
3565
3566 if (repeat_max == 0) goto END_REPEAT;
3567
3568 /* All real repeats make it impossible to handle partial matching (maybe
3569 one day we will be able to remove this restriction). */
3570
3571 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3572
3573 /* Combine the op_type with the repeat_type */
3574
3575 repeat_type += op_type;
3576
3577 /* A minimum of zero is handled either as the special case * or ?, or as
3578 an UPTO, with the maximum given. */
3579
3580 if (repeat_min == 0)
3581 {
3582 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3583 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3584 else
3585 {
3586 *code++ = OP_UPTO + repeat_type;
3587 PUT2INC(code, 0, repeat_max);
3588 }
3589 }
3590
3591 /* A repeat minimum of 1 is optimized into some special cases. If the
3592 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3593 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3594 one less than the maximum. */
3595
3596 else if (repeat_min == 1)
3597 {
3598 if (repeat_max == -1)
3599 *code++ = OP_PLUS + repeat_type;
3600 else
3601 {
3602 code = oldcode; /* leave previous item in place */
3603 if (repeat_max == 1) goto END_REPEAT;
3604 *code++ = OP_UPTO + repeat_type;
3605 PUT2INC(code, 0, repeat_max - 1);
3606 }
3607 }
3608
3609 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3610 handled as an EXACT followed by an UPTO. */
3611
3612 else
3613 {
3614 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3615 PUT2INC(code, 0, repeat_min);
3616
3617 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3618 we have to insert the character for the previous code. For a repeated
3619 Unicode property match, there are two extra bytes that define the
3620 required property. In UTF-8 mode, long characters have their length in
3621 c, with the 0x80 bit as a flag. */
3622
3623 if (repeat_max < 0)
3624 {
3625 #ifdef SUPPORT_UTF8
3626 if (utf8 && c >= 128)
3627 {
3628 memcpy(code, utf8_char, c & 7);
3629 code += c & 7;
3630 }
3631 else
3632 #endif
3633 {
3634 *code++ = c;
3635 if (prop_type >= 0)
3636 {
3637 *code++ = prop_type;
3638 *code++ = prop_value;
3639 }
3640 }
3641 *code++ = OP_STAR + repeat_type;
3642 }
3643
3644 /* Else insert an UPTO if the max is greater than the min, again
3645 preceded by the character, for the previously inserted code. If the
3646 UPTO is just for 1 instance, we can use QUERY instead. */
3647
3648 else if (repeat_max != repeat_min)
3649 {
3650 #ifdef SUPPORT_UTF8
3651 if (utf8 && c >= 128)
3652 {
3653 memcpy(code, utf8_char, c & 7);
3654 code += c & 7;
3655 }
3656 else
3657 #endif
3658 *code++ = c;
3659 if (prop_type >= 0)
3660 {
3661 *code++ = prop_type;
3662 *code++ = prop_value;
3663 }
3664 repeat_max -= repeat_min;
3665
3666 if (repeat_max == 1)
3667 {
3668 *code++ = OP_QUERY + repeat_type;
3669 }
3670 else
3671 {
3672 *code++ = OP_UPTO + repeat_type;
3673 PUT2INC(code, 0, repeat_max);
3674 }
3675 }
3676 }
3677
3678 /* The character or character type itself comes last in all cases. */
3679
3680 #ifdef SUPPORT_UTF8
3681 if (utf8 && c >= 128)
3682 {
3683 memcpy(code, utf8_char, c & 7);
3684 code += c & 7;
3685 }
3686 else
3687 #endif
3688 *code++ = c;
3689
3690 /* For a repeated Unicode property match, there are two extra bytes that
3691 define the required property. */
3692
3693 #ifdef SUPPORT_UCP
3694 if (prop_type >= 0)
3695 {
3696 *code++ = prop_type;
3697 *code++ = prop_value;
3698 }
3699 #endif
3700 }
3701
3702 /* If previous was a character class or a back reference, we put the repeat
3703 stuff after it, but just skip the item if the repeat was {0,0}. */
3704
3705 else if (*previous == OP_CLASS ||
3706 *previous == OP_NCLASS ||
3707 #ifdef SUPPORT_UTF8
3708 *previous == OP_XCLASS ||
3709 #endif
3710 *previous == OP_REF)
3711 {
3712 if (repeat_max == 0)
3713 {
3714 code = previous;
3715 goto END_REPEAT;
3716 }
3717
3718 /* All real repeats make it impossible to handle partial matching (maybe
3719 one day we will be able to remove this restriction). */
3720
3721 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3722
3723 if (repeat_min == 0 && repeat_max == -1)
3724 *code++ = OP_CRSTAR + repeat_type;
3725 else if (repeat_min == 1 && repeat_max == -1)
3726 *code++ = OP_CRPLUS + repeat_type;
3727 else if (repeat_min == 0 && repeat_max == 1)
3728 *code++ = OP_CRQUERY + repeat_type;
3729 else
3730 {
3731 *code++ = OP_CRRANGE + repeat_type;
3732 PUT2INC(code, 0, repeat_min);
3733 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3734 PUT2INC(code, 0, repeat_max);
3735 }
3736 }
3737
3738 /* If previous was a bracket group, we may have to replicate it in certain
3739 cases. */
3740
3741 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3742 *previous == OP_ONCE || *previous == OP_COND)
3743 {
3744 register int i;
3745 int ketoffset = 0;
3746 int len = code - previous;
3747 uschar *bralink = NULL;
3748
3749 /* Repeating a DEFINE group is pointless */
3750
3751 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3752 {
3753 *errorcodeptr = ERR55;
3754 goto FAILED;
3755 }
3756
3757 /* If the maximum repeat count is unlimited, find the end of the bracket
3758 by scanning through from the start, and compute the offset back to it
3759 from the current code pointer. There may be an OP_OPT setting following
3760 the final KET, so we can't find the end just by going back from the code
3761 pointer. */
3762
3763 if (repeat_max == -1)
3764 {
3765 register uschar *ket = previous;
3766 do ket += GET(ket, 1); while (*ket != OP_KET);
3767 ketoffset = code - ket;
3768 }
3769
3770 /* The case of a zero minimum is special because of the need to stick
3771 OP_BRAZERO in front of it, and because the group appears once in the
3772 data, whereas in other cases it appears the minimum number of times. For
3773 this reason, it is simplest to treat this case separately, as otherwise
3774 the code gets far too messy. There are several special subcases when the
3775 minimum is zero. */
3776
3777 if (repeat_min == 0)
3778 {
3779 /* If the maximum is also zero, we just omit the group from the output
3780 altogether. */
3781
3782 if (repeat_max == 0)
3783 {
3784 code = previous;
3785 goto END_REPEAT;
3786 }
3787
3788 /* If the maximum is 1 or unlimited, we just have to stick in the
3789 BRAZERO and do no more at this point. However, we do need to adjust
3790 any OP_RECURSE calls inside the group that refer to the group itself or
3791 any internal or forward referenced group, because the offset is from
3792 the start of the whole regex. Temporarily terminate the pattern while
3793 doing this. */
3794
3795 if (repeat_max <= 1)
3796 {
3797 *code = OP_END;
3798 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3799 memmove(previous+1, previous, len);
3800 code++;
3801 *previous++ = OP_BRAZERO + repeat_type;
3802 }
3803
3804 /* If the maximum is greater than 1 and limited, we have to replicate
3805 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3806 The first one has to be handled carefully because it's the original
3807 copy, which has to be moved up. The remainder can be handled by code
3808 that is common with the non-zero minimum case below. We have to
3809 adjust the value of repeat_max, since one less copy is required. Once
3810 again, we may have to adjust any OP_RECURSE calls inside the group. */
3811
3812 else
3813 {
3814 int offset;
3815 *code = OP_END;
3816 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3817 memmove(previous + 2 + LINK_SIZE, previous, len);
3818 code += 2 + LINK_SIZE;
3819 *previous++ = OP_BRAZERO + repeat_type;
3820 *previous++ = OP_BRA;
3821
3822 /* We chain together the bracket offset fields that have to be
3823 filled in later when the ends of the brackets are reached. */
3824
3825 offset = (bralink == NULL)? 0 : previous - bralink;
3826 bralink = previous;
3827 PUTINC(previous, 0, offset);
3828 }
3829
3830 repeat_max--;
3831 }
3832
3833 /* If the minimum is greater than zero, replicate the group as many
3834 times as necessary, and adjust the maximum to the number of subsequent
3835 copies that we need. If we set a first char from the group, and didn't
3836 set a required char, copy the latter from the former. If there are any
3837 forward reference subroutine calls in the group, there will be entries on
3838 the workspace list; replicate these with an appropriate increment. */
3839
3840 else
3841 {
3842 if (repeat_min > 1)
3843 {
3844 /* In the pre-compile phase, we don't actually do the replication. We
3845 just adjust the length as if we had. Do some paranoid checks for
3846 potential integer overflow. */
3847
3848 if (lengthptr != NULL)
3849 {
3850 int delta = (repeat_min - 1)*length_prevgroup;
3851 if ((double)(repeat_min - 1)*(double)length_prevgroup >
3852 (double)INT_MAX ||
3853 OFLOW_MAX - *lengthptr < delta)
3854 {
3855 *errorcodeptr = ERR20;
3856 goto FAILED;
3857 }
3858 *lengthptr += delta;
3859 }
3860
3861 /* This is compiling for real */
3862
3863 else
3864 {
3865 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3866 for (i = 1; i < repeat_min; i++)
3867 {
3868 uschar *hc;
3869 uschar *this_hwm = cd->hwm;
3870 memcpy(code, previous, len);
3871 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3872 {
3873 PUT(cd->hwm, 0, GET(hc, 0) + len);
3874 cd->hwm += LINK_SIZE;
3875 }
3876 save_hwm = this_hwm;
3877 code += len;
3878 }
3879 }
3880 }
3881
3882 if (repeat_max > 0) repeat_max -= repeat_min;
3883 }
3884
3885 /* This code is common to both the zero and non-zero minimum cases. If
3886 the maximum is limited, it replicates the group in a nested fashion,
3887 remembering the bracket starts on a stack. In the case of a zero minimum,
3888 the first one was set up above. In all cases the repeat_max now specifies
3889 the number of additional copies needed. Again, we must remember to
3890 replicate entries on the forward reference list. */
3891
3892 if (repeat_max >= 0)
3893 {
3894 /* In the pre-compile phase, we don't actually do the replication. We
3895 just adjust the length as if we had. For each repetition we must add 1
3896 to the length for BRAZERO and for all but the last repetition we must
3897 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3898 paranoid checks to avoid integer overflow. */
3899
3900 if (lengthptr != NULL && repeat_max > 0)
3901 {
3902 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3903 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3904 if ((double)repeat_max *
3905 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3906 > (double)INT_MAX ||
3907 OFLOW_MAX - *lengthptr < delta)
3908 {
3909 *errorcodeptr = ERR20;
3910 goto FAILED;
3911 }
3912 *lengthptr += delta;
3913 }
3914
3915 /* This is compiling for real */
3916
3917 else for (i = repeat_max - 1; i >= 0; i--)
3918 {
3919 uschar *hc;
3920 uschar *this_hwm = cd->hwm;
3921
3922 *code++ = OP_BRAZERO + repeat_type;
3923
3924 /* All but the final copy start a new nesting, maintaining the
3925 chain of brackets outstanding. */
3926
3927 if (i != 0)
3928 {
3929 int offset;
3930 *code++ = OP_BRA;
3931 offset = (bralink == NULL)? 0 : code - bralink;
3932 bralink = code;
3933 PUTINC(code, 0, offset);
3934 }
3935
3936 memcpy(code, previous, len);
3937 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3938 {
3939 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3940 cd->hwm += LINK_SIZE;
3941 }
3942 save_hwm = this_hwm;
3943 code += len;
3944 }
3945
3946 /* Now chain through the pending brackets, and fill in their length
3947 fields (which are holding the chain links pro tem). */
3948
3949 while (bralink != NULL)
3950 {
3951 int oldlinkoffset;
3952 int offset = code - bralink + 1;
3953 uschar *bra = code - offset;
3954 oldlinkoffset = GET(bra, 1);
3955 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3956 *code++ = OP_KET;
3957 PUTINC(code, 0, offset);
3958 PUT(bra, 1, offset);
3959 }
3960 }
3961
3962 /* If the maximum is unlimited, set a repeater in the final copy. We
3963 can't just offset backwards from the current code point, because we
3964 don't know if there's been an options resetting after the ket. The
3965 correct offset was computed above.
3966
3967 Then, when we are doing the actual compile phase, check to see whether
3968 this group is a non-atomic one that could match an empty string. If so,
3969 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3970 that runtime checking can be done. [This check is also applied to
3971 atomic groups at runtime, but in a different way.] */
3972
3973 else
3974 {
3975 uschar *ketcode = code - ketoffset;
3976 uschar *bracode = ketcode - GET(ketcode, 1);
3977 *ketcode = OP_KETRMAX + repeat_type;
3978 if (lengthptr == NULL && *bracode != OP_ONCE)
3979 {
3980 uschar *scode = bracode;
3981 do
3982 {
3983 if (could_be_empty_branch(scode, ketcode, utf8))
3984 {
3985 *bracode += OP_SBRA - OP_BRA;
3986 break;
3987 }
3988 scode += GET(scode, 1);
3989 }
3990 while (*scode == OP_ALT);
3991 }
3992 }
3993 }
3994
3995 /* Else there's some kind of shambles */
3996
3997 else
3998 {
3999 *errorcodeptr = ERR11;
4000 goto FAILED;
4001 }
4002
4003 /* If the character following a repeat is '+', or if certain optimization
4004 tests above succeeded, possessive_quantifier is TRUE. For some of the
4005 simpler opcodes, there is a special alternative opcode for this. For
4006 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4007 The '+' notation is just syntactic sugar, taken from Sun's Java package,
4008 but the special opcodes can optimize it a bit. The repeated item starts at
4009 tempcode, not at previous, which might be the first part of a string whose
4010 (former) last char we repeated.
4011
4012 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4013 an 'upto' may follow. We skip over an 'exact' item, and then test the
4014 length of what remains before proceeding. */
4015
4016 if (possessive_quantifier)
4017 {
4018 int len;
4019 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4020 *tempcode == OP_NOTEXACT)
4021 tempcode += _pcre_OP_lengths[*tempcode];
4022 len = code - tempcode;
4023 if (len > 0) switch (*tempcode)
4024 {
4025 case OP_STAR: *tempcode = OP_POSSTAR; break;
4026 case OP_PLUS: *tempcode = OP_POSPLUS; break;
4027 case OP_QUERY: *tempcode = OP_POSQUERY; break;
4028 case OP_UPTO: *tempcode = OP_POSUPTO; break;
4029
4030 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4031 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4032 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4033 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4034
4035 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4036 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4037 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4038 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4039
4040 default:
4041 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4042 code += 1 + LINK_SIZE;
4043 len += 1 + LINK_SIZE;
4044 tempcode[0] = OP_ONCE;
4045 *code++ = OP_KET;
4046 PUTINC(code, 0, len);
4047 PUT(tempcode, 1, len);
4048 break;
4049 }
4050 }
4051
4052 /* In all cases we no longer have a previous item. We also set the
4053 "follows varying string" flag for subsequently encountered reqbytes if
4054 it isn't already set and we have just passed a varying length item. */
4055
4056 END_REPEAT:
4057 previous = NULL;
4058 cd->req_varyopt |= reqvary;
4059 break;
4060
4061
4062 /* ===================================================================*/
4063 /* Start of nested parenthesized sub-expression, or comment or lookahead or
4064 lookbehind or option setting or condition or all the other extended
4065 parenthesis forms. */
4066
4067 case '(':
4068 newoptions = options;
4069 skipbytes = 0;
4070 bravalue = OP_CBRA;
4071 save_hwm = cd->hwm;
4072 reset_bracount = FALSE;
4073
4074 /* First deal with various "verbs" that can be introduced by '*'. */
4075
4076 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4077 {
4078 int i, namelen;
4079 const char *vn = verbnames;
4080 const uschar *name = ++ptr;
4081 previous = NULL;
4082 while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
4083 if (*ptr == ':')
4084 {
4085 *errorcodeptr = ERR59; /* Not supported */
4086 goto FAILED;
4087 }
4088 if (*ptr != ')')
4089 {
4090 *errorcodeptr = ERR60;
4091 goto FAILED;
4092 }
4093 namelen = ptr - name;
4094 for (i = 0; i < verbcount; i++)
4095 {
4096 if (namelen == verbs[i].len &&
4097 strncmp((char *)name, vn, namelen) == 0)
4098 {
4099 *code = verbs[i].op;
4100 if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4101 break;
4102 }
4103 vn += verbs[i].len + 1;
4104 }
4105 if (i < verbcount) continue;
4106 *errorcodeptr = ERR60;
4107 goto FAILED;
4108 }
4109
4110 /* Deal with the extended parentheses; all are introduced by '?', and the
4111 appearance of any of them means that this is not a capturing group. */
4112
4113 else if (*ptr == '?')
4114 {
4115 int i, set, unset, namelen;
4116 int *optset;
4117 const uschar *name;
4118 uschar *slot;
4119
4120 switch (*(++ptr))
4121 {
4122 case '#': /* Comment; skip to ket */
4123 ptr++;
4124 while (*ptr != 0 && *ptr != ')') ptr++;
4125 if (*ptr == 0)
4126 {
4127 *errorcodeptr = ERR18;
4128 goto FAILED;
4129 }
4130 continue;
4131
4132
4133 /* ------------------------------------------------------------ */
4134 case '|': /* Reset capture count for each branch */
4135 reset_bracount = TRUE;
4136 /* Fall through */
4137
4138 /* ------------------------------------------------------------ */
4139 case ':': /* Non-capturing bracket */
4140 bravalue = OP_BRA;
4141 ptr++;
4142 break;
4143
4144
4145 /* ------------------------------------------------------------ */
4146 case '(':
4147 bravalue = OP_COND; /* Conditional group */
4148
4149 /* A condition can be an assertion, a number (referring to a numbered
4150 group), a name (referring to a named group), or 'R', referring to
4151 recursion. R<digits> and R&name are also permitted for recursion tests.
4152
4153 There are several syntaxes for testing a named group: (?(name)) is used
4154 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4155
4156 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4157 be the recursive thing or the name 'R' (and similarly for 'R' followed
4158 by digits), and (b) a number could be a name that consists of digits.
4159 In both cases, we look for a name first; if not found, we try the other
4160 cases. */
4161
4162 /* For conditions that are assertions, check the syntax, and then exit
4163 the switch. This will take control down to where bracketed groups,
4164 including assertions, are processed. */
4165
4166 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4167 break;
4168
4169 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4170 below), and all need to skip 3 bytes at the start of the group. */
4171
4172 code[1+LINK_SIZE] = OP_CREF;
4173 skipbytes = 3;
4174 refsign = -1;
4175
4176 /* Check for a test for recursion in a named group. */
4177
4178 if (ptr[1] == 'R' && ptr[2] == '&')
4179 {
4180 terminator = -1;
4181 ptr += 2;
4182 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4183 }
4184
4185 /* Check for a test for a named group's having been set, using the Perl
4186 syntax (?(<name>) or (?('name') */
4187
4188 else if (ptr[1] == '<')
4189 {
4190 terminator = '>';
4191 ptr++;
4192 }
4193 else if (ptr[1] == '\'')
4194 {
4195 terminator = '\'';
4196 ptr++;
4197 }
4198 else
4199 {
4200 terminator = 0;
4201 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4202 }
4203
4204 /* We now expect to read a name; anything else is an error */
4205
4206 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4207 {
4208 ptr += 1; /* To get the right offset */
4209 *errorcodeptr = ERR28;
4210 goto FAILED;
4211 }
4212
4213 /* Read the name, but also get it as a number if it's all digits */
4214
4215 recno = 0;
4216 name = ++ptr;
4217 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4218 {
4219 if (recno >= 0)
4220 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4221 recno * 10 + *ptr - '0' : -1;
4222 ptr++;
4223 }
4224 namelen = ptr - name;
4225
4226 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4227 {
4228 ptr--; /* Error offset */
4229 *errorcodeptr = ERR26;
4230 goto FAILED;
4231 }
4232
4233 /* Do no further checking in the pre-compile phase. */
4234
4235 if (lengthptr != NULL) break;
4236
4237 /* In the real compile we do the work of looking for the actual
4238 reference. If the string started with "+" or "-" we require the rest to
4239 be digits, in which case recno will be set. */
4240
4241 if (refsign > 0)
4242 {
4243 if (recno <= 0)
4244 {
4245 *errorcodeptr = ERR58;
4246 goto FAILED;
4247 }
4248 recno = (refsign == '-')?
4249 cd->bracount - recno + 1 : recno +cd->bracount;
4250 if (recno <= 0 || recno > cd->final_bracount)
4251 {
4252 *errorcodeptr = ERR15;
4253 goto FAILED;
4254 }
4255 PUT2(code, 2+LINK_SIZE, recno);
4256 break;
4257 }
4258
4259 /* Otherwise (did not start with "+" or "-"), start by looking for the
4260 name. */
4261
4262 slot = cd->name_table;
4263 for (i = 0; i < cd->names_found; i++)
4264 {
4265 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4266 slot += cd->name_entry_size;
4267 }
4268
4269 /* Found a previous named subpattern */
4270
4271 if (i < cd->names_found)
4272 {
4273 recno = GET2(slot, 0);
4274 PUT2(code, 2+LINK_SIZE, recno);
4275 }
4276
4277 /* Search the pattern for a forward reference */
4278
4279 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4280 (options & PCRE_EXTENDED) != 0)) > 0)
4281 {
4282 PUT2(code, 2+LINK_SIZE, i);
4283 }
4284
4285 /* If terminator == 0 it means that the name followed directly after
4286 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4287 some further alternatives to try. For the cases where terminator != 0
4288 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4289 now checked all the possibilities, so give an error. */
4290
4291 else if (terminator != 0)
4292 {
4293 *errorcodeptr = ERR15;
4294 goto FAILED;
4295 }
4296
4297 /* Check for (?(R) for recursion. Allow digits after R to specify a
4298 specific group number. */
4299
4300 else if (*name == 'R')
4301 {
4302 recno = 0;
4303 for (i = 1; i < namelen; i++)
4304 {
4305 if ((digitab[name[i]] & ctype_digit) == 0)
4306 {
4307 *errorcodeptr = ERR15;
4308 goto FAILED;
4309 }
4310 recno = recno * 10 + name[i] - '0';
4311 }
4312 if (recno == 0) recno = RREF_ANY;
4313 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4314 PUT2(code, 2+LINK_SIZE, recno);
4315 }
4316
4317 /* Similarly, check for the (?(DEFINE) "condition", which is always
4318 false. */
4319
4320 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4321 {
4322 code[1+LINK_SIZE] = OP_DEF;
4323 skipbytes = 1;
4324 }
4325
4326 /* Check for the "name" actually being a subpattern number. We are
4327 in the second pass here, so final_bracount is set. */
4328
4329 else if (recno > 0 && recno <= cd->final_bracount)
4330 {
4331 PUT2(code, 2+LINK_SIZE, recno);
4332 }
4333
4334 /* Either an unidentified subpattern, or a reference to (?(0) */
4335
4336 else
4337 {
4338 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4339 goto FAILED;
4340 }
4341 break;
4342
4343
4344 /* ------------------------------------------------------------ */
4345 case '=': /* Positive lookahead */
4346 bravalue = OP_ASSERT;
4347 ptr++;
4348 break;
4349
4350
4351 /* ------------------------------------------------------------ */
4352 case '!': /* Negative lookahead */
4353 ptr++;
4354 if (*ptr == ')') /* Optimize (?!) */
4355 {
4356 *code++ = OP_FAIL;
4357 previous = NULL;
4358 continue;
4359 }
4360 bravalue = OP_ASSERT_NOT;
4361 break;
4362
4363
4364 /* ------------------------------------------------------------ */
4365 case '<': /* Lookbehind or named define */
4366 switch (ptr[1])
4367 {
4368 case '=': /* Positive lookbehind */
4369 bravalue = OP_ASSERTBACK;
4370 ptr += 2;
4371 break;
4372
4373 case '!': /* Negative lookbehind */
4374 bravalue = OP_ASSERTBACK_NOT;
4375 ptr += 2;
4376 break;
4377
4378 default: /* Could be name define, else bad */
4379 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4380 ptr++; /* Correct offset for error */
4381 *errorcodeptr = ERR24;
4382 goto FAILED;
4383 }
4384 break;
4385
4386
4387 /* ------------------------------------------------------------ */
4388 case '>': /* One-time brackets */
4389 bravalue = OP_ONCE;
4390 ptr++;
4391 break;
4392
4393
4394 /* ------------------------------------------------------------ */
4395 case 'C': /* Callout - may be followed by digits; */
4396 previous_callout = code; /* Save for later completion */
4397 after_manual_callout = 1; /* Skip one item before completing */
4398 *code++ = OP_CALLOUT;
4399 {
4400 int n = 0;
4401 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4402 n = n * 10 + *ptr - '0';
4403 if (*ptr != ')')
4404 {
4405 *errorcodeptr = ERR39;
4406 goto FAILED;
4407 }
4408 if (n > 255)
4409 {
4410 *errorcodeptr = ERR38;
4411 goto FAILED;
4412 }
4413 *code++ = n;
4414 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4415 PUT(code, LINK_SIZE, 0); /* Default length */
4416 code += 2 * LINK_SIZE;
4417 }
4418 previous = NULL;
4419 continue;
4420
4421
4422 /* ------------------------------------------------------------ */
4423 case 'P': /* Python-style named subpattern handling */
4424 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4425 {
4426 is_recurse = *ptr == '>';
4427 terminator = ')';
4428 goto NAMED_REF_OR_RECURSE;
4429 }
4430 else if (*ptr != '<') /* Test for Python-style definition */
4431 {
4432 *errorcodeptr = ERR41;
4433 goto FAILED;
4434 }
4435 /* Fall through to handle (?P< as (?< is handled */
4436
4437
4438 /* ------------------------------------------------------------ */
4439 DEFINE_NAME: /* Come here from (?< handling */
4440 case '\'':
4441 {
4442 terminator = (*ptr == '<')? '>' : '\'';
4443 name = ++ptr;
4444
4445 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4446 namelen = ptr - name;
4447
4448 /* In the pre-compile phase, just do a syntax check. */
4449
4450 if (lengthptr != NULL)
4451 {
4452 if (*ptr != terminator)
4453 {
4454 *errorcodeptr = ERR42;
4455 goto FAILED;
4456 }
4457 if (cd->names_found >= MAX_NAME_COUNT)
4458 {
4459 *errorcodeptr = ERR49;
4460 goto FAILED;
4461 }
4462 if (namelen + 3 > cd->name_entry_size)
4463 {
4464 cd->name_entry_size = namelen + 3;
4465 if (namelen > MAX_NAME_SIZE)
4466 {
4467 *errorcodeptr = ERR48;
4468 goto FAILED;
4469 }
4470 }
4471 }
4472
4473 /* In the real compile, create the entry in the table */
4474
4475 else
4476 {
4477 slot = cd->name_table;
4478 for (i = 0; i < cd->names_found; i++)
4479 {
4480 int crc = memcmp(name, slot+2, namelen);
4481 if (crc == 0)
4482 {
4483 if (slot[2+namelen] == 0)
4484 {
4485 if ((options & PCRE_DUPNAMES) == 0)
4486 {
4487 *errorcodeptr = ERR43;
4488 goto FAILED;
4489 }
4490 }
4491 else crc = -1; /* Current name is substring */
4492 }
4493 if (crc < 0)
4494 {
4495 memmove(slot + cd->name_entry_size, slot,
4496 (cd->names_found - i) * cd->name_entry_size);
4497 break;
4498 }
4499 slot += cd->name_entry_size;
4500 }
4501
4502 PUT2(slot, 0, cd->bracount + 1);
4503 memcpy(slot + 2, name, namelen);
4504 slot[2+namelen] = 0;
4505 }
4506 }
4507
4508 /* In both cases, count the number of names we've encountered. */
4509
4510 ptr++; /* Move past > or ' */
4511 cd->names_found++;
4512 goto NUMBERED_GROUP;
4513
4514
4515 /* ------------------------------------------------------------ */
4516 case '&': /* Perl recursion/subroutine syntax */
4517 terminator = ')';
4518 is_recurse = TRUE;
4519 /* Fall through */
4520
4521 /* We come here from the Python syntax above that handles both
4522 references (?P=name) and recursion (?P>name), as well as falling
4523 through from the Perl recursion syntax (?&name). We also come here from
4524 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4525 .NET syntax. */
4526
4527 NAMED_REF_OR_RECURSE:
4528 name = ++ptr;
4529 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4530 namelen = ptr - name;
4531
4532 /* In the pre-compile phase, do a syntax check and set a dummy
4533 reference number. */
4534
4535 if (lengthptr != NULL)
4536 {
4537 if (namelen == 0)
4538 {
4539 *errorcodeptr = ERR62;
4540 goto FAILED;
4541 }
4542 if (*ptr != terminator)
4543 {
4544 *errorcodeptr = ERR42;
4545 goto FAILED;
4546 }
4547 if (namelen > MAX_NAME_SIZE)
4548 {
4549 *errorcodeptr = ERR48;
4550 goto FAILED;
4551 }
4552 recno = 0;
4553 }
4554
4555 /* In the real compile, seek the name in the table. We check the name
4556 first, and then check that we have reached the end of the name in the
4557 table. That way, if the name is longer than any in the table,
4558 the comparison will fail without reading beyond the table entry. */
4559
4560 else
4561 {
4562 slot = cd->name_table;
4563 for (i = 0; i < cd->names_found; i++)
4564 {
4565 if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
4566 slot[2+namelen] == 0)
4567 break;
4568 slot += cd->name_entry_size;
4569 }
4570
4571 if (i < cd->names_found) /* Back reference */
4572 {
4573 recno = GET2(slot, 0);
4574 }
4575 else if ((recno = /* Forward back reference */
4576 find_parens(ptr, cd->bracount, name, namelen,
4577 (options & PCRE_EXTENDED) != 0)) <= 0)
4578 {
4579 *errorcodeptr = ERR15;
4580 goto FAILED;
4581 }
4582 }
4583
4584 /* In both phases, we can now go to the code that handles numerical
4585 recursion or backreferences. */
4586
4587 if (is_recurse) goto HANDLE_RECURSION;
4588 else goto HANDLE_REFERENCE;
4589
4590
4591 /* ------------------------------------------------------------ */
4592 case 'R': /* Recursion */
4593 ptr++; /* Same as (?0) */
4594 /* Fall through */
4595
4596
4597 /* ------------------------------------------------------------ */
4598 case '-': case '+':
4599 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4600 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4601 {
4602 const uschar *called;
4603
4604 if ((refsign = *ptr) == '+')
4605 {
4606 ptr++;
4607 if ((digitab[*ptr] & ctype_digit) == 0)
4608 {
4609 *errorcodeptr = ERR63;
4610 goto FAILED;
4611 }
4612 }
4613 else if (refsign == '-')
4614 {
4615 if ((digitab[ptr[1]] & ctype_digit) == 0)
4616 goto OTHER_CHAR_AFTER_QUERY;
4617 ptr++;
4618 }
4619
4620 recno = 0;
4621 while((digitab[*ptr] & ctype_digit) != 0)
4622 recno = recno * 10 + *ptr++ - '0';
4623
4624 if (*ptr != ')')
4625 {
4626 *errorcodeptr = ERR29;
4627 goto FAILED;
4628 }
4629
4630 if (refsign == '-')
4631 {
4632 if (recno == 0)
4633 {
4634 *errorcodeptr = ERR58;
4635 goto FAILED;
4636 }
4637 recno = cd->bracount - recno + 1;
4638 if (recno <= 0)
4639 {
4640 *errorcodeptr = ERR15;
4641 goto FAILED;
4642 }
4643 }
4644 else if (refsign == '+')
4645 {
4646 if (recno == 0)
4647 {
4648 *errorcodeptr = ERR58;
4649 goto FAILED;
4650 }
4651 recno += cd->bracount;
4652 }
4653
4654 /* Come here from code above that handles a named recursion */
4655
4656 HANDLE_RECURSION:
4657
4658 previous = code;
4659 called = cd->start_code;
4660
4661 /* When we are actually compiling, find the bracket that is being
4662 referenced. Temporarily end the regex in case it doesn't exist before
4663 this point. If we end up with a forward reference, first check that
4664 the bracket does occur later so we can give the error (and position)
4665 now. Then remember this forward reference in the workspace so it can
4666 be filled in at the end. */
4667
4668 if (lengthptr == NULL)
4669 {
4670 *code = OP_END;
4671 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4672
4673 /* Forward reference */
4674
4675 if (called == NULL)
4676 {
4677 if (find_parens(ptr, cd->bracount, NULL, recno,
4678 (options & PCRE_EXTENDED) != 0) < 0)
4679 {
4680 *errorcodeptr = ERR15;
4681 goto FAILED;
4682 }
4683 called = cd->start_code + recno;
4684 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4685 }
4686
4687 /* If not a forward reference, and the subpattern is still open,
4688 this is a recursive call. We check to see if this is a left
4689 recursion that could loop for ever, and diagnose that case. */
4690
4691 else if (GET(called, 1) == 0 &&
4692 could_be_empty(called, code, bcptr, utf8))
4693 {
4694 *errorcodeptr = ERR40;
4695 goto FAILED;
4696 }
4697 }
4698
4699 /* Insert the recursion/subroutine item, automatically wrapped inside
4700 "once" brackets. Set up a "previous group" length so that a
4701 subsequent quantifier will work. */
4702
4703 *code = OP_ONCE;
4704 PUT(code, 1, 2 + 2*LINK_SIZE);
4705 code += 1 + LINK_SIZE;
4706
4707 *code = OP_RECURSE;
4708 PUT(code, 1, called - cd->start_code);
4709 code += 1 + LINK_SIZE;
4710
4711 *code = OP_KET;
4712 PUT(code, 1, 2 + 2*LINK_SIZE);
4713 code += 1 + LINK_SIZE;
4714
4715 length_prevgroup = 3 + 3*LINK_SIZE;
4716 }
4717
4718 /* Can't determine a first byte now */
4719
4720 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4721 continue;
4722
4723
4724 /* ------------------------------------------------------------ */
4725 default: /* Other characters: check option setting */
4726 OTHER_CHAR_AFTER_QUERY:
4727 set = unset = 0;
4728 optset = &set;
4729
4730 while (*ptr != ')' && *ptr != ':')
4731 {
4732 switch (*ptr++)
4733 {
4734 case '-': optset = &unset; break;
4735
4736 case 'J': /* Record that it changed in the external options */
4737 *optset |= PCRE_DUPNAMES;
4738 cd->external_flags |= PCRE_JCHANGED;
4739 break;
4740
4741 case 'i': *optset |= PCRE_CASELESS; break;
4742 case 'm': *optset |= PCRE_MULTILINE; break;
4743 case 's': *optset |= PCRE_DOTALL; break;
4744 case 'x': *optset |= PCRE_EXTENDED; break;
4745 case 'U': *optset |= PCRE_UNGREEDY; break;
4746 case 'X': *optset |= PCRE_EXTRA; break;
4747
4748 default: *errorcodeptr = ERR12;
4749 ptr--; /* Correct the offset */
4750 goto FAILED;
4751 }
4752 }
4753
4754 /* Set up the changed option bits, but don't change anything yet. */
4755
4756 newoptions = (options | set) & (~unset);
4757
4758 /* If the options ended with ')' this is not the start of a nested
4759 group with option changes, so the options change at this level. If this
4760 item is right at the start of the pattern, the options can be
4761 abstracted and made external in the pre-compile phase, and ignored in
4762 the compile phase. This can be helpful when matching -- for instance in
4763 caseless checking of required bytes.
4764
4765 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4766 definitely *not* at the start of the pattern because something has been
4767 compiled. In the pre-compile phase, however, the code pointer can have
4768 that value after the start, because it gets reset as code is discarded
4769 during the pre-compile. However, this can happen only at top level - if
4770 we are within parentheses, the starting BRA will still be present. At
4771 any parenthesis level, the length value can be used to test if anything
4772 has been compiled at that level. Thus, a test for both these conditions
4773 is necessary to ensure we correctly detect the start of the pattern in
4774 both phases.
4775
4776 If we are not at the pattern start, compile code to change the ims
4777 options if this setting actually changes any of them. We also pass the
4778 new setting back so that it can be put at the start of any following
4779 branches, and when this group ends (if we are in a group), a resetting
4780 item can be compiled. */
4781
4782 if (*ptr == ')')
4783 {
4784 if (code == cd->start_code + 1 + LINK_SIZE &&
4785 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4786 {
4787 cd->external_options = newoptions;
4788 options = newoptions;
4789 }
4790 else
4791 {
4792 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4793 {
4794 *code++ = OP_OPT;
4795 *code++ = newoptions & PCRE_IMS;
4796 }
4797
4798 /* Change options at this level, and pass them back for use
4799 in subsequent branches. Reset the greedy defaults and the case
4800 value for firstbyte and reqbyte. */
4801
4802 *optionsptr = options = newoptions;
4803 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4804 greedy_non_default = greedy_default ^ 1;
4805 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4806 }
4807
4808 previous = NULL; /* This item can't be repeated */
4809 continue; /* It is complete */
4810 }
4811
4812 /* If the options ended with ':' we are heading into a nested group
4813 with possible change of options. Such groups are non-capturing and are
4814 not assertions of any kind. All we need to do is skip over the ':';
4815 the newoptions value is handled below. */
4816
4817 bravalue = OP_BRA;
4818 ptr++;
4819 } /* End of switch for character following (? */
4820 } /* End of (? handling */
4821
4822 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4823 all unadorned brackets become non-capturing and behave like (?:...)
4824 brackets. */
4825
4826 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4827 {
4828 bravalue = OP_BRA;
4829 }
4830
4831 /* Else we have a capturing group. */
4832
4833 else
4834 {
4835 NUMBERED_GROUP:
4836 cd->bracount += 1;
4837 PUT2(code, 1+LINK_SIZE, cd->bracount);
4838 skipbytes = 2;
4839 }
4840
4841 /* Process nested bracketed regex. Assertions may not be repeated, but
4842 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4843 non-register variable in order to be able to pass its address because some
4844 compilers complain otherwise. Pass in a new setting for the ims options if
4845 they have changed. */
4846
4847 previous = (bravalue >= OP_ONCE)? code : NULL;
4848 *code = bravalue;
4849 tempcode = code;
4850 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4851 length_prevgroup = 0; /* Initialize for pre-compile phase */
4852
4853 if (!compile_regex(
4854 newoptions, /* The complete new option state */
4855 options & PCRE_IMS, /* The previous ims option state */
4856 &tempcode, /* Where to put code (updated) */
4857 &ptr, /* Input pointer (updated) */
4858 errorcodeptr, /* Where to put an error message */
4859 (bravalue == OP_ASSERTBACK ||
4860 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4861 reset_bracount, /* True if (?| group */
4862 skipbytes, /* Skip over bracket number */
4863 &subfirstbyte, /* For possible first char */
4864 &subreqbyte, /* For possible last char */
4865 bcptr, /* Current branch chain */
4866 cd, /* Tables block */
4867 (lengthptr == NULL)? NULL : /* Actual compile phase */
4868 &length_prevgroup /* Pre-compile phase */
4869 ))
4870 goto FAILED;
4871
4872 /* At the end of compiling, code is still pointing to the start of the
4873 group, while tempcode has been updated to point past the end of the group
4874 and any option resetting that may follow it. The pattern pointer (ptr)
4875 is on the bracket. */
4876
4877 /* If this is a conditional bracket, check that there are no more than
4878 two branches in the group, or just one if it's a DEFINE group. We do this
4879 in the real compile phase, not in the pre-pass, where the whole group may
4880 not be available. */
4881
4882 if (bravalue == OP_COND && lengthptr == NULL)
4883 {
4884 uschar *tc = code;
4885 int condcount = 0;
4886
4887 do {
4888 condcount++;
4889 tc += GET(tc,1);
4890 }
4891 while (*tc != OP_KET);
4892
4893 /* A DEFINE group is never obeyed inline (the "condition" is always
4894 false). It must have only one branch. */
4895
4896 if (code[LINK_SIZE+1] == OP_DEF)
4897 {
4898 if (condcount > 1)
4899 {
4900 *errorcodeptr = ERR54;
4901 goto FAILED;
4902 }
4903 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4904 }
4905
4906 /* A "normal" conditional group. If there is just one branch, we must not
4907 make use of its firstbyte or reqbyte, because this is equivalent to an
4908 empty second branch. */
4909
4910 else
4911 {
4912 if (condcount > 2)
4913 {
4914 *errorcodeptr = ERR27;
4915 goto FAILED;
4916 }
4917 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4918 }
4919 }
4920
4921 /* Error if hit end of pattern */
4922
4923 if (*ptr != ')')
4924 {
4925 *errorcodeptr = ERR14;
4926 goto FAILED;
4927 }
4928
4929 /* In the pre-compile phase, update the length by the length of the group,
4930 less the brackets at either end. Then reduce the compiled code to just a
4931 set of non-capturing brackets so that it doesn't use much memory if it is
4932 duplicated by a quantifier.*/
4933
4934 if (lengthptr != NULL)
4935 {
4936 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4937 {
4938 *errorcodeptr = ERR20;
4939 goto FAILED;
4940 }
4941 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4942 *code++ = OP_BRA;
4943 PUTINC(code, 0, 1 + LINK_SIZE);
4944 *code++ = OP_KET;
4945 PUTINC(code, 0, 1 + LINK_SIZE);
4946 break; /* No need to waste time with special character handling */
4947 }
4948
4949 /* Otherwise update the main code pointer to the end of the group. */
4950
4951 code = tempcode;
4952
4953 /* For a DEFINE group, required and first character settings are not
4954 relevant. */
4955
4956 if (bravalue == OP_DEF) break;
4957
4958 /* Handle updating of the required and first characters for other types of
4959 group. Update for normal brackets of all kinds, and conditions with two
4960 branches (see code above). If the bracket is followed by a quantifier with
4961 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4962 zerofirstbyte outside the main loop so that they can be accessed for the
4963 back off. */
4964
4965 zeroreqbyte = reqbyte;
4966 zerofirstbyte = firstbyte;
4967 groupsetfirstbyte = FALSE;
4968
4969 if (bravalue >= OP_ONCE)
4970 {
4971 /* If we have not yet set a firstbyte in this branch, take it from the
4972 subpattern, remembering that it was set here so that a repeat of more
4973 than one can replicate it as reqbyte if necessary. If the subpattern has
4974 no firstbyte, set "none" for the whole branch. In both cases, a zero
4975 repeat forces firstbyte to "none". */
4976
4977 if (firstbyte == REQ_UNSET)
4978 {
4979 if (subfirstbyte >= 0)
4980 {
4981 firstbyte = subfirstbyte;
4982 groupsetfirstbyte = TRUE;
4983 }
4984 else firstbyte = REQ_NONE;
4985 zerofirstbyte = REQ_NONE;
4986 }
4987
4988 /* If firstbyte was previously set, convert the subpattern's firstbyte
4989 into reqbyte if there wasn't one, using the vary flag that was in
4990 existence beforehand. */
4991
4992 else if (subfirstbyte >= 0 && subreqbyte < 0)
4993 subreqbyte = subfirstbyte | tempreqvary;
4994
4995 /* If the subpattern set a required byte (or set a first byte that isn't
4996 really the first byte - see above), set it. */
4997
4998 if (subreqbyte >= 0) reqbyte = subreqbyte;
4999 }
5000
5001 /* For a forward assertion, we take the reqbyte, if set. This can be
5002 helpful if the pattern that follows the assertion doesn't set a different
5003 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5004 for an assertion, however, because it leads to an incorrect effect for patterns
5005 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5006 of a firstbyte. This is overcome by a scan at the end if there's no
5007 firstbyte, looking for an asserted first char. */
5008
5009 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5010 break; /* End of processing '(' */
5011
5012
5013 /* ===================================================================*/
5014 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5015 are arranged to be the negation of the corresponding OP_values. For the
5016 back references, the values are ESC_REF plus the reference number. Only
5017 back references and those types that consume a character may be repeated.
5018 We can test for values between ESC_b and ESC_Z for the latter; this may
5019 have to change if any new ones are ever created. */
5020
5021 case '\\':
5022 tempptr = ptr;
5023 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5024 if (*errorcodeptr != 0) goto FAILED;
5025
5026 if (c < 0)
5027 {
5028 if (-c == ESC_Q) /* Handle start of quoted string */
5029 {
5030 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
5031 else inescq = TRUE;
5032 continue;
5033 }
5034
5035 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
5036
5037 /* For metasequences that actually match a character, we disable the
5038 setting of a first character if it hasn't already been set. */
5039
5040 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5041 firstbyte = REQ_NONE;
5042
5043 /* Set values to reset to if this is followed by a zero repeat. */
5044
5045 zerofirstbyte = firstbyte;
5046 zeroreqbyte = reqbyte;
5047
5048 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5049 We also support \k{name} (.NET syntax) */
5050
5051 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
5052 {
5053 is_recurse = FALSE;
5054 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
5055 goto NAMED_REF_OR_RECURSE;
5056 }
5057
5058 /* Back references are handled specially; must disable firstbyte if
5059 not set to cope with cases like (?=(\w+))\1: which would otherwise set
5060 ':' later. */
5061
5062 if (-c >= ESC_REF)
5063 {
5064 recno = -c - ESC_REF;
5065
5066 HANDLE_REFERENCE: /* Come here from named backref handling */
5067 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5068 previous = code;
5069 *code++ = OP_REF;
5070 PUT2INC(code, 0, recno);
5071 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5072 if (recno > cd->top_backref) cd->top_backref = recno;
5073 }
5074
5075 /* So are Unicode property matches, if supported. */
5076
5077 #ifdef SUPPORT_UCP
5078 else if (-c == ESC_P || -c == ESC_p)
5079 {
5080 BOOL negated;
5081 int pdata;
5082 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5083 if (ptype < 0) goto FAILED;
5084 previous = code;
5085 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5086 *code++ = ptype;
5087 *code++ = pdata;
5088 }
5089 #else
5090
5091 /* If Unicode properties are not supported, \X, \P, and \p are not
5092 allowed. */
5093
5094 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5095 {
5096 *errorcodeptr = ERR45;
5097 goto FAILED;
5098 }
5099 #endif
5100
5101 /* For the rest (including \X when Unicode properties are supported), we
5102 can obtain the OP value by negating the escape value. */
5103
5104 else
5105 {
5106 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5107 *code++ = -c;
5108 }
5109 continue;
5110 }
5111
5112 /* We have a data character whose value is in c. In UTF-8 mode it may have
5113 a value > 127. We set its representation in the length/buffer, and then
5114 handle it as a data character. */
5115
5116 #ifdef SUPPORT_UTF8
5117 if (utf8 && c > 127)
5118 mclength = _pcre_ord2utf8(c, mcbuffer);
5119 else
5120 #endif
5121
5122 {
5123 mcbuffer[0] = c;
5124 mclength = 1;
5125 }
5126 goto ONE_CHAR;
5127
5128
5129 /* ===================================================================*/
5130 /* Handle a literal character. It is guaranteed not to be whitespace or #
5131 when the extended flag is set. If we are in UTF-8 mode, it may be a
5132 multi-byte literal character. */
5133
5134 default:
5135 NORMAL_CHAR:
5136 mclength = 1;
5137 mcbuffer[0] = c;
5138
5139 #ifdef SUPPORT_UTF8
5140 if (utf8 && c >= 0xc0)
5141 {
5142 while ((ptr[1] & 0xc0) == 0x80)
5143 mcbuffer[mclength++] = *(++ptr);
5144 }
5145 #endif
5146
5147 /* At this point we have the character's bytes in mcbuffer, and the length
5148 in mclength. When not in UTF-8 mode, the length is always 1. */
5149
5150 ONE_CHAR:
5151 previous = code;
5152 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5153 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5154
5155 /* Remember if \r or \n were seen */
5156
5157 if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')
5158 cd->external_flags |= PCRE_HASCRORLF;
5159
5160 /* Set the first and required bytes appropriately. If no previous first
5161 byte, set it from this character, but revert to none on a zero repeat.
5162 Otherwise, leave the firstbyte value alone, and don't change it on a zero
5163 repeat. */
5164
5165 if (firstbyte == REQ_UNSET)
5166 {
5167 zerofirstbyte = REQ_NONE;
5168 zeroreqbyte = reqbyte;
5169
5170 /* If the character is more than one byte long, we can set firstbyte
5171 only if it is not to be matched caselessly. */
5172
5173 if (mclength == 1 || req_caseopt == 0)
5174 {
5175 firstbyte = mcbuffer[0] | req_caseopt;
5176 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5177 }
5178 else firstbyte = reqbyte = REQ_NONE;
5179 }
5180
5181 /* firstbyte was previously set; we can set reqbyte only if the length is
5182 1 or the matching is caseful. */
5183
5184 else
5185 {
5186 zerofirstbyte = firstbyte;
5187 zeroreqbyte = reqbyte;
5188 if (mclength == 1 || req_caseopt == 0)
5189 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5190 }
5191
5192 break; /* End of literal character handling */
5193 }
5194 } /* end of big loop */
5195
5196
5197 /* Control never reaches here by falling through, only by a goto for all the
5198 error states. Pass back the position in the pattern so that it can be displayed
5199 to the user for diagnosing the error. */
5200
5201 FAILED:
5202 *ptrptr = ptr;
5203 return FALSE;
5204 }
5205
5206
5207
5208
5209 /*************************************************
5210 * Compile sequence of alternatives *
5211 *************************************************/
5212
5213 /* On entry, ptr is pointing past the bracket character, but on return it
5214 points to the closing bracket, or vertical bar, or end of string. The code
5215 variable is pointing at the byte into which the BRA operator has been stored.
5216 If the ims options are changed at the start (for a (?ims: group) or during any
5217 branch, we need to insert an OP_OPT item at the start of every following branch
5218 to ensure they get set correctly at run time, and also pass the new options
5219 into every subsequent branch compile.
5220
5221 This function is used during the pre-compile phase when we are trying to find
5222 out the amount of memory needed, as well as during the real compile phase. The
5223 value of lengthptr distinguishes the two phases.
5224
5225 Arguments:
5226 options option bits, including any changes for this subpattern
5227 oldims previous settings of ims option bits
5228 codeptr -> the address of the current code pointer
5229 ptrptr -> the address of the current pattern pointer
5230 errorcodeptr -> pointer to error code variable
5231 lookbehind TRUE if this is a lookbehind assertion
5232 reset_bracount TRUE to reset the count for each branch
5233 skipbytes skip this many bytes at start (for brackets and OP_COND)
5234 firstbyteptr place to put the first required character, or a negative number
5235 reqbyteptr place to put the last required character, or a negative number
5236 bcptr pointer to the chain of currently open branches
5237 cd points to the data block with tables pointers etc.
5238 lengthptr NULL during the real compile phase
5239 points to length accumulator during pre-compile phase
5240
5241 Returns: TRUE on success
5242 */
5243
5244 static BOOL
5245 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5246 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5247 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5248 int *lengthptr)
5249 {
5250 const uschar *ptr = *ptrptr;
5251 uschar *code = *codeptr;
5252 uschar *last_branch = code;
5253 uschar *start_bracket = code;
5254 uschar *reverse_count = NULL;
5255 int firstbyte, reqbyte;
5256 int branchfirstbyte, branchreqbyte;
5257 int length;
5258 int orig_bracount;
5259 int max_bracount;
5260 branch_chain bc;
5261
5262 bc.outer = bcptr;
5263 bc.current = code;
5264
5265 firstbyte = reqbyte = REQ_UNSET;
5266
5267 /* Accumulate the length for use in the pre-compile phase. Start with the
5268 length of the BRA and KET and any extra bytes that are required at the
5269 beginning. We accumulate in a local variable to save frequent testing of
5270 lenthptr for NULL. We cannot do this by looking at the value of code at the
5271 start and end of each alternative, because compiled items are discarded during
5272 the pre-compile phase so that the work space is not exceeded. */
5273
5274 length = 2 + 2*LINK_SIZE + skipbytes;
5275
5276 /* WARNING: If the above line is changed for any reason, you must also change
5277 the code that abstracts option settings at the start of the pattern and makes
5278 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5279 pre-compile phase to find out whether anything has yet been compiled or not. */
5280
5281 /* Offset is set zero to mark that this bracket is still open */
5282
5283 PUT(code, 1, 0);
5284 code += 1 + LINK_SIZE + skipbytes;
5285
5286 /* Loop for each alternative branch */
5287
5288 orig_bracount = max_bracount = cd->bracount;
5289 for (;;)
5290 {
5291 /* For a (?| group, reset the capturing bracket count so that each branch
5292 uses the same numbers. */
5293
5294 if (reset_bracount) cd->bracount = orig_bracount;
5295
5296 /* Handle a change of ims options at the start of the branch */
5297
5298 if ((options & PCRE_IMS) != oldims)
5299 {
5300 *code++ = OP_OPT;
5301 *code++ = options & PCRE_IMS;
5302 length += 2;
5303 }
5304
5305 /* Set up dummy OP_REVERSE if lookbehind assertion */
5306
5307 if (lookbehind)
5308 {
5309 *code++ = OP_REVERSE;
5310 reverse_count = code;
5311 PUTINC(code, 0, 0);
5312 length += 1 + LINK_SIZE;
5313 }
5314
5315 /* Now compile the branch; in the pre-compile phase its length gets added
5316 into the length. */
5317
5318 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5319 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5320 {
5321 *ptrptr = ptr;
5322 return FALSE;
5323 }
5324
5325 /* Keep the highest bracket count in case (?| was used and some branch
5326 has fewer than the rest. */
5327
5328 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5329
5330 /* In the real compile phase, there is some post-processing to be done. */
5331
5332 if (lengthptr == NULL)
5333 {
5334 /* If this is the first branch, the firstbyte and reqbyte values for the
5335 branch become the values for the regex. */
5336
5337 if (*last_branch != OP_ALT)
5338 {
5339 firstbyte = branchfirstbyte;
5340 reqbyte = branchreqbyte;
5341 }
5342
5343 /* If this is not the first branch, the first char and reqbyte have to
5344 match the values from all the previous branches, except that if the
5345 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5346 and we set REQ_VARY for the regex. */
5347
5348 else
5349 {
5350 /* If we previously had a firstbyte, but it doesn't match the new branch,
5351 we have to abandon the firstbyte for the regex, but if there was
5352 previously no reqbyte, it takes on the value of the old firstbyte. */
5353
5354 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5355 {
5356 if (reqbyte < 0) reqbyte = firstbyte;
5357 firstbyte = REQ_NONE;
5358 }
5359
5360 /* If we (now or from before) have no firstbyte, a firstbyte from the
5361 branch becomes a reqbyte if there isn't a branch reqbyte. */
5362
5363 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5364 branchreqbyte = branchfirstbyte;
5365
5366 /* Now ensure that the reqbytes match */
5367
5368 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5369 reqbyte = REQ_NONE;
5370 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5371 }
5372
5373 /* If lookbehind, check that this branch matches a fixed-length string, and
5374 put the length into the OP_REVERSE item. Temporarily mark the end of the
5375 branch with OP_END. */
5376
5377 if (lookbehind)
5378 {
5379 int fixed_length;
5380 *code = OP_END;
5381 fixed_length = find_fixedlength(last_branch, options);
5382 DPRINTF(("fixed length = %d\n", fixed_length));
5383 if (fixed_length < 0)
5384 {
5385 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5386 *ptrptr = ptr;
5387 return FALSE;
5388 }
5389 PUT(reverse_count, 0, fixed_length);
5390 }
5391 }
5392
5393 /* Reached end of expression, either ')' or end of pattern. In the real
5394 compile phase, go back through the alternative branches and reverse the chain
5395 of offsets, with the field in the BRA item now becoming an offset to the
5396 first alternative. If there are no alternatives, it points to the end of the
5397 group. The length in the terminating ket is always the length of the whole
5398 bracketed item. If any of the ims options were changed inside the group,
5399 compile a resetting op-code following, except at the very end of the pattern.
5400 Return leaving the pointer at the terminating char. */
5401
5402 if (*ptr != '|')
5403 {
5404 if (lengthptr == NULL)
5405 {
5406 int branch_length = code - last_branch;
5407 do
5408 {
5409 int prev_length = GET(last_branch, 1);
5410 PUT(last_branch, 1, branch_length);
5411 branch_length = prev_length;
5412 last_branch -= branch_length;
5413 }
5414 while (branch_length > 0);
5415 }
5416
5417 /* Fill in the ket */
5418
5419 *code = OP_KET;
5420 PUT(code, 1, code - start_bracket);
5421 code += 1 + LINK_SIZE;
5422
5423 /* Resetting option if needed */
5424
5425 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5426 {
5427 *code++ = OP_OPT;
5428 *code++ = oldims;
5429 length += 2;
5430 }
5431
5432 /* Retain the highest bracket number, in case resetting was used. */
5433
5434 cd->bracount = max_bracount;
5435
5436 /* Set values to pass back */
5437
5438 *codeptr = code;
5439 *ptrptr = ptr;
5440 *firstbyteptr = firstbyte;
5441 *reqbyteptr = reqbyte;
5442 if (lengthptr != NULL)
5443 {
5444 if (OFLOW_MAX - *lengthptr < length)
5445 {
5446 *errorcodeptr = ERR20;
5447 return FALSE;
5448 }
5449 *lengthptr += length;
5450 }
5451 return TRUE;
5452 }
5453
5454 /* Another branch follows. In the pre-compile phase, we can move the code
5455 pointer back to where it was for the start of the first branch. (That is,
5456 pretend that each branch is the only one.)
5457
5458 In the real compile phase, insert an ALT node. Its length field points back
5459 to the previous branch while the bracket remains open. At the end the chain
5460 is reversed. It's done like this so that the start of the bracket has a
5461 zero offset until it is closed, making it possible to detect recursion. */
5462
5463 if (lengthptr != NULL)
5464 {
5465 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5466 length += 1 + LINK_SIZE;
5467 }
5468 else
5469 {
5470 *code = OP_ALT;
5471 PUT(code, 1, code - last_branch);
5472 bc.current = last_branch = code;
5473 code += 1 + LINK_SIZE;
5474 }
5475
5476 ptr++;
5477 }
5478 /* Control never reaches here */
5479 }
5480
5481
5482
5483
5484 /*************************************************
5485 * Check for anchored expression *
5486 *************************************************/
5487
5488 /* Try to find out if this is an anchored regular expression. Consider each
5489 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5490 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5491 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5492 counts, since OP_CIRC can match in the middle.
5493
5494 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5495 This is the code for \G, which means "match at start of match position, taking
5496 into account the match offset".
5497
5498 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5499 because that will try the rest of the pattern at all possible matching points,
5500 so there is no point trying again.... er ....
5501
5502 .... except when the .* appears inside capturing parentheses, and there is a
5503 subsequent back reference to those parentheses. We haven't enough information
5504 to catch that case precisely.
5505
5506 At first, the best we could do was to detect when .* was in capturing brackets
5507 and the highest back reference was greater than or equal to that level.
5508 However, by keeping a bitmap of the first 31 back references, we can catch some
5509 of the more common cases more precisely.
5510
5511 Arguments:
5512 code points to start of expression (the bracket)
5513 options points to the options setting
5514 bracket_map a bitmap of which brackets we are inside while testing; this
5515 handles up to substring 31; after that we just have to take
5516 the less precise approach
5517 backref_map the back reference bitmap
5518
5519 Returns: TRUE or FALSE
5520 */
5521
5522 static BOOL
5523 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5524 unsigned int backref_map)
5525 {
5526 do {
5527 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5528 options, PCRE_MULTILINE, FALSE);
5529 register int op = *scode;
5530
5531 /* Non-capturing brackets */
5532
5533 if (op == OP_BRA)
5534 {
5535 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5536 }
5537
5538 /* Capturing brackets */
5539
5540 else if (op == OP_CBRA)
5541 {
5542 int n = GET2(scode, 1+LINK_SIZE);
5543 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5544 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5545 }
5546
5547 /* Other brackets */
5548
5549 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5550 {
5551 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5552 }
5553
5554 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5555 are or may be referenced. */
5556
5557 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5558 op == OP_TYPEPOSSTAR) &&
5559 (*options & PCRE_DOTALL) != 0)
5560 {
5561 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5562 }
5563
5564 /* Check for explicit anchoring */
5565
5566 else if (op != OP_SOD && op != OP_SOM &&
5567 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5568 return FALSE;
5569 code += GET(code, 1);
5570 }
5571 while (*code == OP_ALT); /* Loop for each alternative */
5572 return TRUE;
5573 }
5574
5575
5576
5577 /*************************************************
5578 * Check for starting with ^ or .* *
5579 *************************************************/
5580
5581 /* This is called to find out if every branch starts with ^ or .* so that
5582 "first char" processing can be done to speed things up in multiline
5583 matching and for non-DOTALL patterns that start with .* (which must start at
5584 the beginning or after \n). As in the case of is_anchored() (see above), we
5585 have to take account of back references to capturing brackets that contain .*
5586 because in that case we can't make the assumption.
5587
5588 Arguments:
5589 code points to start of expression (the bracket)
5590 bracket_map a bitmap of which brackets we are inside while testing; this
5591 handles up to substring 31; after that we just have to take
5592 the less precise approach
5593 backref_map the back reference bitmap
5594
5595 Returns: TRUE or FALSE
5596 */
5597
5598 static BOOL
5599 is_startline(const uschar *code, unsigned int bracket_map,
5600 unsigned int backref_map)
5601 {
5602 do {
5603 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5604 NULL, 0, FALSE);
5605 register int op = *scode;
5606
5607 /* Non-capturing brackets */
5608
5609 if (op == OP_BRA)
5610 {
5611 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5612 }
5613
5614 /* Capturing brackets */
5615
5616 else if (op == OP_CBRA)
5617 {
5618 int n = GET2(scode, 1+LINK_SIZE);
5619 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5620 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5621 }
5622
5623 /* Other brackets */
5624
5625 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5626 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5627
5628 /* .* means "start at start or after \n" if it isn't in brackets that
5629 may be referenced. */
5630
5631 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5632 {
5633 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5634 }
5635
5636 /* Check for explicit circumflex */
5637
5638 else if (op != OP_CIRC) return FALSE;
5639
5640 /* Move on to the next alternative */
5641
5642 code += GET(code, 1);
5643 }
5644 while (*code == OP_ALT); /* Loop for each alternative */
5645 return TRUE;
5646 }
5647
5648
5649
5650 /*************************************************
5651 * Check for asserted fixed first char *
5652 *************************************************/
5653
5654 /* During compilation, the "first char" settings from forward assertions are
5655 discarded, because they can cause conflicts with actual literals that follow.
5656 However, if we end up without a first char setting for an unanchored pattern,
5657 it is worth scanning the regex to see if there is an initial asserted first
5658 char. If all branches start with the same asserted char, or with a bracket all
5659 of whose alternatives start with the same asserted char (recurse ad lib), then
5660 we return that char, otherwise -1.
5661
5662 Arguments:
5663 code points to start of expression (the bracket)
5664 options pointer to the options (used to check casing changes)
5665 inassert TRUE if in an assertion
5666
5667 Returns: -1 or the fixed first char
5668 */
5669
5670 static int
5671 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5672 {
5673 register int c = -1;
5674 do {
5675 int d;
5676 const uschar *scode =
5677 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5678 register int op = *scode;
5679
5680 switch(op)
5681 {
5682 default:
5683 return -1;
5684
5685 case OP_BRA:
5686 case OP_CBRA:
5687 case OP_ASSERT:
5688 case OP_ONCE:
5689 case OP_COND:
5690 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5691 return -1;
5692 if (c < 0) c = d; else if (c != d) return -1;
5693 break;
5694
5695 case OP_EXACT: /* Fall through */
5696 scode += 2;
5697
5698 case OP_CHAR:
5699 case OP_CHARNC:
5700 case OP_PLUS:
5701 case OP_MINPLUS:
5702 case OP_POSPLUS:
5703 if (!inassert) return -1;
5704 if (c < 0)
5705 {
5706 c = scode[1];
5707 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5708 }
5709 else if (c != scode[1]) return -1;
5710 break;
5711 }
5712
5713 code += GET(code, 1);
5714 }
5715 while (*code == OP_ALT);
5716 return c;
5717 }
5718
5719
5720
5721 /*************************************************
5722 * Compile a Regular Expression *
5723 *************************************************/
5724
5725 /* This function takes a string and returns a pointer to a block of store
5726 holding a compiled version of the expression. The original API for this
5727 function had no error code return variable; it is retained for backwards
5728 compatibility. The new function is given a new name.
5729
5730 Arguments:
5731 pattern the regular expression
5732 options various option bits
5733 errorcodeptr pointer to error code variable (pcre_compile2() only)
5734 can be NULL if you don't want a code value
5735 errorptr pointer to pointer to error text
5736 erroroffset ptr offset in pattern where error was detected
5737 tables pointer to character tables or NULL
5738
5739 Returns: pointer to compiled data block, or NULL on error,
5740 with errorptr and erroroffset set
5741 */
5742
5743 PCRE_EXP_DEFN pcre *
5744 pcre_compile(const char *pattern, int options, const char **errorptr,
5745 int *erroroffset, const unsigned char *tables)
5746 {
5747 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5748 }
5749
5750
5751 PCRE_EXP_DEFN pcre *
5752 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5753 const char **errorptr, int *erroroffset, const unsigned char *tables)
5754 {
5755 real_pcre *re;
5756 int length = 1; /* For final END opcode */
5757 int firstbyte, reqbyte, newline;
5758 int errorcode = 0;
5759 int skipatstart = 0;
5760 #ifdef SUPPORT_UTF8
5761 BOOL utf8;
5762 #endif
5763 size_t size;
5764 uschar *code;
5765 const uschar *codestart;
5766 const uschar *ptr;
5767 compile_data compile_block;
5768 compile_data *cd = &compile_block;
5769
5770 /* This space is used for "compiling" into during the first phase, when we are
5771 computing the amount of memory that is needed. Compiled items are thrown away
5772 as soon as possible, so that a fairly large buffer should be sufficient for
5773 this purpose. The same space is used in the second phase for remembering where
5774 to fill in forward references to subpatterns. */
5775
5776 uschar cworkspace[COMPILE_WORK_SIZE];
5777
5778
5779 /* Set this early so that early errors get offset 0. */
5780
5781 ptr = (const uschar *)pattern;
5782
5783 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5784 can do is just return NULL, but we can set a code value if there is a code
5785 pointer. */
5786
5787 if (errorptr == NULL)
5788 {
5789 if (errorcodeptr != NULL) *errorcodeptr = 99;
5790 return NULL;
5791 }
5792
5793 *errorptr = NULL;
5794 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5795
5796 /* However, we can give a message for this error */
5797
5798 if (erroroffset == NULL)
5799 {
5800 errorcode = ERR16;
5801 goto PCRE_EARLY_ERROR_RETURN2;
5802 }
5803
5804 *erroroffset = 0;
5805
5806 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5807
5808 #ifdef SUPPORT_UTF8
5809 utf8 = (options & PCRE_UTF8) != 0;
5810 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5811 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5812 {
5813 errorcode = ERR44;
5814 goto PCRE_EARLY_ERROR_RETURN2;
5815 }
5816 #else
5817 if ((options & PCRE_UTF8) != 0)
5818 {
5819 errorcode = ERR32;
5820 goto PCRE_EARLY_ERROR_RETURN;
5821 }
5822 #endif
5823
5824 if ((options & ~PUBLIC_OPTIONS) != 0)
5825 {
5826 errorcode = ERR17;
5827 goto PCRE_EARLY_ERROR_RETURN;
5828 }
5829
5830 /* Set up pointers to the individual character tables */
5831
5832 if (tables == NULL) tables = _pcre_default_tables;
5833 cd->lcc = tables + lcc_offset;
5834 cd->fcc = tables + fcc_offset;
5835 cd->cbits = tables + cbits_offset;
5836 cd->ctypes = tables + ctypes_offset;
5837
5838 /* Check for global one-time settings at the start of the pattern, and remember
5839 the offset for later. */
5840
5841 while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')
5842 {
5843 int newnl = 0;
5844 int newbsr = 0;
5845
5846 if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0)
5847 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
5848 else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3) == 0)
5849 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
5850 else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5) == 0)
5851 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
5852 else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0)
5853 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
5854 else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8) == 0)
5855 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
5856
5857 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0)
5858 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
5859 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0)
5860 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
5861
5862 if (newnl != 0)
5863 options = (options & ~PCRE_NEWLINE_BITS) | newnl;
5864 else if (newbsr != 0)
5865 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
5866 else break;
5867 }
5868
5869 /* Check validity of \R options. */
5870
5871 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5872 {
5873 case 0:
5874 case PCRE_BSR_ANYCRLF:
5875 case PCRE_BSR_UNICODE:
5876 break;
5877 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5878 }
5879
5880 /* Handle different types of newline. The three bits give seven cases. The
5881 current code allows for fixed one- or two-byte sequences, plus "any" and
5882 "anycrlf". */
5883
5884 switch (options & PCRE_NEWLINE_BITS)
5885 {
5886 case 0: newline = NEWLINE; break; /* Build-time default */
5887 case PCRE_NEWLINE_CR: newline = '\r'; break;
5888 case PCRE_NEWLINE_LF: newline = '\n'; break;
5889 case PCRE_NEWLINE_CR+
5890 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5891 case PCRE_NEWLINE_ANY: newline = -1; break;
5892 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5893 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5894 }
5895
5896 if (newline == -2)
5897 {
5898 cd->nltype = NLTYPE_ANYCRLF;
5899 }
5900 else if (newline < 0)
5901 {
5902 cd->nltype = NLTYPE_ANY;
5903 }
5904 else
5905 {
5906 cd->nltype = NLTYPE_FIXED;
5907 if (newline > 255)
5908 {
5909 cd->nllen = 2;
5910 cd->nl[0] = (newline >> 8) & 255;
5911 cd->nl[1] = newline & 255;
5912 }
5913 else
5914 {
5915 cd->nllen = 1;
5916 cd->nl[0] = newline;
5917 }
5918 }
5919
5920 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5921 references to help in deciding whether (.*) can be treated as anchored or not.
5922 */
5923
5924 cd->top_backref = 0;
5925 cd->backref_map = 0;
5926
5927 /* Reflect pattern for debugging output */
5928
5929 DPRINTF(("------------------------------------------------------------------\n"));
5930 DPRINTF(("%s\n", pattern));
5931
5932 /* Pretend to compile the pattern while actually just accumulating the length
5933 of memory required. This behaviour is triggered by passing a non-NULL final
5934 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5935 to compile parts of the pattern into; the compiled code is discarded when it is
5936 no longer needed, so hopefully this workspace will never overflow, though there
5937 is a test for its doing so. */
5938
5939 cd->bracount = cd->final_bracount = 0;
5940 cd->names_found = 0;
5941 cd->name_entry_size = 0;
5942 cd->name_table = NULL;
5943 cd->start_workspace = cworkspace;
5944 cd->start_code = cworkspace;
5945 cd->hwm = cworkspace;
5946 cd->start_pattern = (const uschar *)pattern;
5947 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5948 cd->req_varyopt = 0;
5949 cd->external_options = options;
5950 cd->external_flags = 0;
5951
5952 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5953 don't need to look at the result of the function here. The initial options have
5954 been put into the cd block so that they can be changed if an option setting is
5955 found within the regex right at the beginning. Bringing initial option settings
5956 outside can help speed up starting point checks. */
5957
5958 ptr += skipatstart;
5959 code = cworkspace;
5960 *code = OP_BRA;
5961 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5962 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5963 &length);
5964 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5965
5966 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5967 cd->hwm - cworkspace));
5968
5969 if (length > MAX_PATTERN_SIZE)
5970 {
5971 errorcode = ERR20;
5972 goto PCRE_EARLY_ERROR_RETURN;
5973 }
5974
5975 /* Compute the size of data block needed and get it, either from malloc or
5976 externally provided function. Integer overflow should no longer be possible
5977 because nowadays we limit the maximum value of cd->names_found and
5978 cd->name_entry_size. */
5979
5980 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5981 re = (real_pcre *)(pcre_malloc)(size);
5982
5983 if (re == NULL)
5984 {
5985 errorcode = ERR21;
5986 goto PCRE_EARLY_ERROR_RETURN;
5987 }
5988
5989 /* Put in the magic number, and save the sizes, initial options, internal
5990 flags, and character table pointer. NULL is used for the default character
5991 tables. The nullpad field is at the end; it's there to help in the case when a
5992 regex compiled on a system with 4-byte pointers is run on another with 8-byte
5993 pointers. */
5994
5995 re->magic_number = MAGIC_NUMBER;
5996 re->size = size;
5997 re->options = cd->external_options;
5998 re->flags = cd->external_flags;
5999 re->dummy1 = 0;
6000 re->first_byte = 0;
6001 re->req_byte = 0;
6002 re->name_table_offset = sizeof(real_pcre);
6003 re->name_entry_size = cd->name_entry_size;
6004 re->name_count = cd->names_found;
6005 re->ref_count = 0;
6006 re->tables = (tables == _pcre_default_tables)? NULL : tables;
6007 re->nullpad = NULL;
6008
6009 /* The starting points of the name/number translation table and of the code are
6010 passed around in the compile data block. The start/end pattern and initial
6011 options are already set from the pre-compile phase, as is the name_entry_size
6012 field. Reset the bracket count and the names_found field. Also reset the hwm
6013 field; this time it's used for remembering forward references to subpatterns.
6014 */
6015
6016 cd->final_bracount = cd->bracount; /* Save for checking forward references */
6017 cd->bracount = 0;
6018 cd->names_found = 0;
6019 cd->name_table = (uschar *)re + re->name_table_offset;
6020 codestart = cd->name_table + re->name_entry_size * re->name_count;
6021 cd->start_code = codestart;
6022 cd->hwm = cworkspace;
6023 cd->req_varyopt = 0;
6024 cd->had_accept = FALSE;
6025
6026 /* Set up a starting, non-extracting bracket, then compile the expression. On
6027 error, errorcode will be set non-zero, so we don't need to look at the result
6028 of the function here. */
6029
6030 ptr = (const uschar *)pattern + skipatstart;
6031 code = (uschar *)codestart;
6032 *code = OP_BRA;
6033 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6034 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6035 re->top_bracket = cd->bracount;
6036 re->top_backref = cd->top_backref;
6037 re->flags = cd->external_flags;
6038
6039 if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
6040
6041 /* If not reached end of pattern on success, there's an excess bracket. */
6042
6043 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6044
6045 /* Fill in the terminating state and check for disastrous overflow, but
6046 if debugging, leave the test till after things are printed out. */
6047
6048 *code++ = OP_END;
6049
6050 #ifndef DEBUG
6051 if (code - codestart > length) errorcode = ERR23;
6052 #endif
6053
6054 /* Fill in any forward references that are required. */
6055
6056 while (errorcode == 0 && cd->hwm > cworkspace)
6057 {
6058 int offset, recno;
6059 const uschar *groupptr;
6060 cd->hwm -= LINK_SIZE;
6061 offset = GET(cd->hwm, 0);
6062 recno = GET(codestart, offset);
6063 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
6064 if (groupptr == NULL) errorcode = ERR53;
6065 else PUT(((uschar *)codestart), offset, groupptr - codestart);
6066 }
6067
6068 /* Give an error if there's back reference to a non-existent capturing
6069 subpattern. */
6070
6071 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6072
6073 /* Failed to compile, or error while post-processing */
6074
6075 if (errorcode != 0)
6076 {
6077 (pcre_free)(re);
6078 PCRE_EARLY_ERROR_RETURN:
6079 *erroroffset = ptr - (const uschar *)pattern;
6080 PCRE_EARLY_ERROR_RETURN2:
6081 *errorptr = find_error_text(errorcode);
6082 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6083 return NULL;
6084 }
6085
6086 /* If the anchored option was not passed, set the flag if we can determine that
6087 the pattern is anchored by virtue of ^ characters or \A or anything else (such
6088 as starting with .* when DOTALL is set).
6089
6090 Otherwise, if we know what the first byte has to be, save it, because that
6091 speeds up unanchored matches no end. If not, see if we can set the
6092 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6093 start with ^. and also when all branches start with .* for non-DOTALL matches.
6094 */
6095
6096 if ((re->options & PCRE_ANCHORED) == 0)
6097 {
6098 int temp_options = re->options; /* May get changed during these scans */
6099 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
6100 re->options |= PCRE_ANCHORED;
6101 else
6102 {
6103 if (firstbyte < 0)
6104 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
6105 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
6106 {
6107 int ch = firstbyte & 255;
6108 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6109 cd->fcc[ch] == ch)? ch : firstbyte;
6110 re->flags |= PCRE_FIRSTSET;
6111 }
6112 else if (is_startline(codestart, 0, cd->backref_map))
6113 re->flags |= PCRE_STARTLINE;
6114 }
6115 }
6116
6117 /* For an anchored pattern, we use the "required byte" only if it follows a
6118 variable length item in the regex. Remove the caseless flag for non-caseable
6119 bytes. */
6120
6121 if (reqbyte >= 0 &&
6122 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6123 {
6124 int ch = reqbyte & 255;
6125 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6126 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6127 re->flags |= PCRE_REQCHSET;
6128 }
6129
6130 /* Print out the compiled data if debugging is enabled. This is never the
6131 case when building a production library. */
6132
6133 #ifdef DEBUG
6134
6135 printf("Length = %d top_bracket = %d top_backref = %d\n",
6136 length, re->top_bracket, re->top_backref);
6137
6138 printf("Options=%08x\n", re->options);
6139
6140 if ((re->flags & PCRE_FIRSTSET) != 0)
6141 {
6142 int ch = re->first_byte & 255;
6143 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6144 "" : " (caseless)";
6145 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
6146 else printf("First char = \\x%02x%s\n", ch, caseless);
6147 }
6148
6149 if ((re->flags & PCRE_REQCHSET) != 0)
6150 {
6151 int ch = re->req_byte & 255;
6152 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6153 "" : " (caseless)";
6154 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
6155 else printf("Req char = \\x%02x%s\n", ch, caseless);
6156 }
6157
6158 pcre_printint(re, stdout, TRUE);
6159
6160 /* This check is done here in the debugging case so that the code that
6161 was compiled can be seen. */
6162
6163 if (code - codestart > length)
6164 {
6165 (pcre_free)(re);
6166 *errorptr = find_error_text(ERR23);
6167 *erroroffset = ptr - (uschar *)pattern;
6168 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6169 return NULL;
6170 }
6171 #endif /* DEBUG */
6172
6173 return (pcre *)re;
6174 }
6175
6176 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5