/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1361 - (show annotations)
Fri Sep 6 17:47:32 2013 UTC (6 years ago) by ph10
File MIME type: text/plain
File size: 274546 byte(s)
Make back references to duplicated named subpatterns more like Perl.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2013 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
57 is also used by pcretest. PCRE_DEBUG is not defined when building a production
58 library. We do not need to select pcre16_printint.c specially, because the
59 COMPILE_PCREx macro will already be appropriately set. */
60
61 #ifdef PCRE_DEBUG
62 /* pcre_printint.c should not include any headers */
63 #define PCRE_INCLUDED
64 #include "pcre_printint.c"
65 #undef PCRE_INCLUDED
66 #endif
67
68
/* Macro for setting individual bits in class bitmaps. It sets bit number b in
the bitmap a, which is addressed eight bits per element: element (b)/8, bit
(b)&7. Both the bitmap argument and the complete expansion are parenthesized,
so that an expression such as "map + offset" can be passed as the bitmap and
the macro can be used safely inside a larger expression. The bit number b
appears twice in the expansion, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)&7)))
72
73 /* Maximum length value to check against when making sure that the integer that
74 holds the compiled pattern length does not overflow. We make it a bit less than
75 INT_MAX to allow for adding in group terminating bytes, so that we don't have
76 to check them every time. */
77
78 #define OFLOW_MAX (INT_MAX - 20)
79
80 /* Definitions to allow mutual recursion */
81
82 static int
83 add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
84 const pcre_uint32 *, unsigned int);
85
86 static BOOL
87 compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
88 pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
89 compile_data *, int *);
90
91
92
93 /*************************************************
94 * Code parameters and static tables *
95 *************************************************/
96
97 /* This value specifies the size of stack workspace that is used during the
98 first pre-compile phase that determines how much memory is required. The regex
99 is partly compiled into this space, but the compiled parts are discarded as
100 soon as they can be, so that hopefully there will never be an overrun. The code
101 does, however, check for an overrun. The largest amount I've seen used is 218,
102 so this number is very generous.
103
104 The same workspace is used during the second, actual compile phase for
105 remembering forward references to groups so that they can be filled in at the
106 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
107 is 4 there is plenty of room for most patterns. However, the memory can get
108 filled up by repetitions of forward references, for example patterns like
109 /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
110 that the workspace is expanded using malloc() in this situation. The value
111 below is therefore a minimum, and we put a maximum on it for safety. The
112 minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
113 kicks in at the same number of forward references in all cases. */
114
115 #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
116 #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
117
118 /* This value determines the size of the initial vector that is used for
119 remembering named groups during the pre-compile. It is allocated on the stack,
120 but if it is too small, it is expanded using malloc(), in a similar way to the
121 workspace. The value is the number of slots in the list. */
122
123 #define NAMED_GROUP_LIST_SIZE 20
124
125 /* The overrun tests check for a slightly smaller size so that they detect the
126 overrun before it actually does run off the end of the data block. */
127
128 #define WORK_SIZE_SAFETY_MARGIN (100)
129
130 /* Private flags added to firstchar and reqchar. */
131
132 #define REQ_CASELESS (1 << 0) /* Indicates caselessness */
133 #define REQ_VARY (1 << 1) /* Reqchar followed non-literal item */
134 /* Negative values for the firstchar and reqchar flags */
135 #define REQ_UNSET (-2)
136 #define REQ_NONE (-1)
137
/* Repeated character flags. The constant has type long; an uppercase L suffix
is used because a lowercase one is easily misread as the digit 1. */

#define UTF_LENGTH 0x10000000L /* The char contains its length. */
141
142 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
143 are simple data values; negative values are for special things like \d and so
144 on. Zero means further processing is needed (for things like \x), or the escape
145 is invalid. */
146
147 #ifndef EBCDIC
148
149 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
150 in UTF-8 mode. */
151
152 static const short int escapes[] = { /* Indexed by (character - CHAR_0); two entries per row */
153 0, 0, /* 0 1 */
154 0, 0, /* 2 3 */
155 0, 0, /* 4 5 */
156 0, 0, /* 6 7 */
157 0, 0, /* 8 9 */
158 CHAR_COLON, CHAR_SEMICOLON, /* : ; */
159 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, /* < = */
160 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK, /* > ? */
161 CHAR_COMMERCIAL_AT, -ESC_A, /* @ A */
162 -ESC_B, -ESC_C, /* B C */
163 -ESC_D, -ESC_E, /* D E */
164 0, -ESC_G, /* F G */
165 -ESC_H, 0, /* H I */
166 0, -ESC_K, /* J K */
167 0, 0, /* L M */
168 -ESC_N, 0, /* N O */
169 -ESC_P, -ESC_Q, /* P Q */
170 -ESC_R, -ESC_S, /* R S */
171 0, 0, /* T U */
172 -ESC_V, -ESC_W, /* V W */
173 -ESC_X, 0, /* X Y */
174 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, /* Z [ */
175 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, /* backslash ] */
176 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, /* ^ _ */
177 CHAR_GRAVE_ACCENT, 7, /* ` a (data value 7) */
178 -ESC_b, 0, /* b c */
179 -ESC_d, ESC_e, /* d e */
180 ESC_f, 0, /* f g */
181 -ESC_h, 0, /* h i */
182 0, -ESC_k, /* j k */
183 0, 0, /* l m */
184 ESC_n, 0, /* n o */
185 -ESC_p, 0, /* p q */
186 ESC_r, -ESC_s, /* r s */
187 ESC_tee, 0, /* t u */
188 -ESC_v, -ESC_w, /* v w */
189 0, 0, /* x y */
190 -ESC_z /* z */
191 };
192
193 #else
194
195 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
196
197 static const short int escapes[] = {
198 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
199 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
200 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
201 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
202 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
203 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
204 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
205 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
206 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
207 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
208 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
209 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
210 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
211 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
212 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
213 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
214 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
215 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
216 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
217 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
218 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
219 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
220 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
221 };
222 #endif
223
224
225 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
226 searched linearly. Put all the names into a single string, in order to reduce
227 the number of relocations when a shared library is dynamically linked. The
228 string is built from string macros so that it works in UTF-8 mode on EBCDIC
229 platforms. */
230
231 typedef struct verbitem { /* Describes one verb; see the verbs[] table below */
232 int len; /* Length of verb name */
233 int op; /* Op when no arg, or -1 if arg mandatory */
234 int op_arg; /* Op when arg present, or -1 if not allowed */
235 } verbitem;
236
237 static const char verbnames[] = /* All verb names, concatenated into one string */
238 "\0" /* Empty name is a shorthand for MARK */
239 STRING_MARK0
240 STRING_ACCEPT0
241 STRING_COMMIT0
242 STRING_F0
243 STRING_FAIL0
244 STRING_PRUNE0
245 STRING_SKIP0
246 STRING_THEN;
247
248 static const verbitem verbs[] = { /* One row per name in verbnames, in the same order */
249 { 0, -1, OP_MARK }, /* (empty name): argument mandatory */
250 { 4, -1, OP_MARK }, /* MARK: argument mandatory */
251 { 6, OP_ACCEPT, -1 }, /* ACCEPT: no argument allowed */
252 { 6, OP_COMMIT, -1 }, /* COMMIT: no argument allowed */
253 { 1, OP_FAIL, -1 }, /* F: no argument allowed */
254 { 4, OP_FAIL, -1 }, /* FAIL: no argument allowed */
255 { 5, OP_PRUNE, OP_PRUNE_ARG }, /* PRUNE: argument optional */
256 { 4, OP_SKIP, OP_SKIP_ARG }, /* SKIP: argument optional */
257 { 4, OP_THEN, OP_THEN_ARG } /* THEN: argument optional */
258 };
259
260 static const int verbcount = sizeof(verbs)/sizeof(verbitem); /* Number of rows in verbs[] */
261
262
263 /* Tables of names of POSIX character classes and their lengths. The names are
264 now all in a single string, to reduce the number of relocations when a shared
265 library is dynamically loaded. The list of lengths is terminated by a zero
266 length entry. The first three must be alpha, lower, upper, as this is assumed
267 for handling case independence. */
268
269 static const char posix_names[] =
270 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
271 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
272 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
273 STRING_word0 STRING_xdigit;
274
275 static const pcre_uint8 posix_name_lengths[] = {
276 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
277
278 /* Table of class bit maps for each POSIX class. Each class is formed from a
279 base map, with an optional addition or removal of another map. Then, for some
280 classes, there is some additional tweaking: for [:blank:] the vertical space
281 characters are removed, and for [:alpha:] and [:alnum:] the underscore
282 character is removed. The triples in the table consist of the base map offset,
283 second map offset or -1 if no second map, and a non-negative value for map
284 addition or a negative value for map subtraction (if there are two maps). The
285 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
286 remove vertical space characters, 2 => remove underscore. */
287
288 static const int posix_class_maps[] = {
289 cbit_word, cbit_digit, -2, /* alpha */
290 cbit_lower, -1, 0, /* lower */
291 cbit_upper, -1, 0, /* upper */
292 cbit_word, -1, 2, /* alnum - word without underscore */
293 cbit_print, cbit_cntrl, 0, /* ascii */
294 cbit_space, -1, 1, /* blank - a GNU extension */
295 cbit_cntrl, -1, 0, /* cntrl */
296 cbit_digit, -1, 0, /* digit */
297 cbit_graph, -1, 0, /* graph */
298 cbit_print, -1, 0, /* print */
299 cbit_punct, -1, 0, /* punct */
300 cbit_space, -1, 0, /* space */
301 cbit_word, -1, 0, /* word - a Perl extension */
302 cbit_xdigit,-1, 0 /* xdigit */
303 };
304
305 /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
306 substitutes must be in the order of the names, defined above, and there are
307 both positive and negative cases. NULL means no substitute. */
308
309 #ifdef SUPPORT_UCP
310 static const pcre_uchar string_PNd[] = {
311 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
312 CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
313 static const pcre_uchar string_pNd[] = {
314 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
315 CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
316 static const pcre_uchar string_PXsp[] = {
317 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
318 CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
319 static const pcre_uchar string_pXsp[] = {
320 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
321 CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
322 static const pcre_uchar string_PXwd[] = {
323 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
324 CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
325 static const pcre_uchar string_pXwd[] = {
326 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
327 CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
328
329 static const pcre_uchar *substitutes[] = {
330 string_PNd, /* \D */
331 string_pNd, /* \d */
332 string_PXsp, /* \S */ /* NOTE: Xsp is Perl space */
333 string_pXsp, /* \s */
334 string_PXwd, /* \W */
335 string_pXwd /* \w */
336 };
337
338 static const pcre_uchar string_pL[] = {
339 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
340 CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
341 static const pcre_uchar string_pLl[] = {
342 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
343 CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
344 static const pcre_uchar string_pLu[] = {
345 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
346 CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
347 static const pcre_uchar string_pXan[] = {
348 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
349 CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
350 static const pcre_uchar string_h[] = {
351 CHAR_BACKSLASH, CHAR_h, '\0' };
352 static const pcre_uchar string_pXps[] = {
353 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
354 CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
355 static const pcre_uchar string_PL[] = {
356 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
357 CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
358 static const pcre_uchar string_PLl[] = {
359 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
360 CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
361 static const pcre_uchar string_PLu[] = {
362 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
363 CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
364 static const pcre_uchar string_PXan[] = {
365 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
366 CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
367 static const pcre_uchar string_H[] = {
368 CHAR_BACKSLASH, CHAR_H, '\0' };
369 static const pcre_uchar string_PXps[] = {
370 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
371 CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
372
373 static const pcre_uchar *posix_substitutes[] = {
374 string_pL, /* alpha */
375 string_pLl, /* lower */
376 string_pLu, /* upper */
377 string_pXan, /* alnum */
378 NULL, /* ascii */
379 string_h, /* blank */
380 NULL, /* cntrl */
381 string_pNd, /* digit */
382 NULL, /* graph */
383 NULL, /* print */
384 NULL, /* punct */
385 string_pXps, /* space */ /* NOTE: Xps is POSIX space */
386 string_pXwd, /* word */
387 NULL, /* xdigit */
388 /* Negated cases */
389 string_PL, /* ^alpha */
390 string_PLl, /* ^lower */
391 string_PLu, /* ^upper */
392 string_PXan, /* ^alnum */
393 NULL, /* ^ascii */
394 string_H, /* ^blank */
395 NULL, /* ^cntrl */
396 string_PNd, /* ^digit */
397 NULL, /* ^graph */
398 NULL, /* ^print */
399 NULL, /* ^punct */
400 string_PXps, /* ^space */ /* NOTE: Xps is POSIX space */
401 string_PXwd, /* ^word */
402 NULL /* ^xdigit */
403 };
404 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
405 #endif
406
407 #define STRING(a) # a
408 #define XSTRING(s) STRING(s)
409
410 /* The texts of compile-time error messages. These are "char *" because they
411 are passed to the outside world. Do not ever re-use any error number, because
412 they are documented. Always add a new error instead. Messages marked DEAD below
413 are no longer used. This used to be a table of strings, but in order to reduce
414 the number of relocations needed when a shared library is loaded dynamically,
415 it is now one long string. We cannot use a table of offsets, because the
416 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
417 simply count through to the one we want - this isn't a performance issue
418 because these strings are used only when there is a compilation error.
419
420 Each substring ends with \0 to insert a null character. This includes the final
421 substring, so that the whole string ends with \0\0, which can be detected when
422 counting through. */
423
424 static const char error_texts[] =
425 "no error\0"
426 "\\ at end of pattern\0"
427 "\\c at end of pattern\0"
428 "unrecognized character follows \\\0"
429 "numbers out of order in {} quantifier\0"
430 /* 5 */
431 "number too big in {} quantifier\0"
432 "missing terminating ] for character class\0"
433 "invalid escape sequence in character class\0"
434 "range out of order in character class\0"
435 "nothing to repeat\0"
436 /* 10 */
437 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
438 "internal error: unexpected repeat\0"
439 "unrecognized character after (? or (?-\0"
440 "POSIX named classes are supported only within a class\0"
441 "missing )\0"
442 /* 15 */
443 "reference to non-existent subpattern\0"
444 "erroffset passed as NULL\0"
445 "unknown option bit(s) set\0"
446 "missing ) after comment\0"
447 "parentheses nested too deeply\0" /** DEAD **/
448 /* 20 */
449 "regular expression is too large\0"
450 "failed to get memory\0"
451 "unmatched parentheses\0"
452 "internal error: code overflow\0"
453 "unrecognized character after (?<\0"
454 /* 25 */
455 "lookbehind assertion is not fixed length\0"
456 "malformed number or name after (?(\0"
457 "conditional group contains more than two branches\0"
458 "assertion expected after (?(\0"
459 "(?R or (?[+-]digits must be followed by )\0"
460 /* 30 */
461 "unknown POSIX class name\0"
462 "POSIX collating elements are not supported\0"
463 "this version of PCRE is compiled without UTF support\0"
464 "spare error\0" /** DEAD **/
465 "character value in \\x{...} sequence is too large\0"
466 /* 35 */
467 "invalid condition (?(0)\0"
468 "\\C not allowed in lookbehind assertion\0"
469 "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
470 "number after (?C is > 255\0"
471 "closing ) for (?C expected\0"
472 /* 40 */
473 "recursive call could loop indefinitely\0"
474 "unrecognized character after (?P\0"
475 "syntax error in subpattern name (missing terminator)\0"
476 "two named subpatterns have the same name\0"
477 "invalid UTF-8 string\0"
478 /* 45 */
479 "support for \\P, \\p, and \\X has not been compiled\0"
480 "malformed \\P or \\p sequence\0"
481 "unknown property name after \\P or \\p\0"
482 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
483 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
484 /* 50 */
485 "repeated subpattern is too long\0" /** DEAD **/
486 "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
487 "internal error: overran compiling workspace\0"
488 "internal error: previously-checked referenced subpattern not found\0"
489 "DEFINE group contains more than one branch\0"
490 /* 55 */
491 "repeating a DEFINE group is not allowed\0" /** DEAD **/
492 "inconsistent NEWLINE options\0"
493 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
494 "a numbered reference must not be zero\0"
495 "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
496 /* 60 */
497 "(*VERB) not recognized or malformed\0"
498 "number is too big\0"
499 "subpattern name expected\0"
500 "digit expected after (?+\0"
501 "] is an invalid data character in JavaScript compatibility mode\0"
502 /* 65 */
503 "different names for subpatterns of the same number are not allowed\0"
504 "(*MARK) must have an argument\0"
505 "this version of PCRE is not compiled with Unicode property support\0"
506 "\\c must be followed by an ASCII character\0"
507 "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
508 /* 70 */
509 "internal error: unknown opcode in find_fixedlength()\0"
510 "\\N is not supported in a class\0"
511 "too many forward references\0"
512 "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
513 "invalid UTF-16 string\0"
514 /* 75 */
515 "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
516 "character value in \\u.... sequence is too large\0"
517 "invalid UTF-32 string\0"
518 "setting UTF is disabled by the application\0"
519 ;
520
521 /* Table to identify digits and hex digits. This is used when compiling
522 patterns. Note that the tables in chartables are dependent on the locale, and
523 may mark arbitrary characters as digits - but the PCRE compiling code expects
524 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
525 a private table here. It costs 256 bytes, but it is a lot faster than doing
526 character value tests (at least in some simple cases I timed), and in some
527 applications one wants PCRE to compile efficiently as well as match
528 efficiently.
529
530 For convenience, we use the same bit definitions as in chartables:
531
532 0x04 decimal digit
533 0x08 hexadecimal digit
534
535 Then we can use ctype_digit and ctype_xdigit in the code. */
536
537 /* Using a simple comparison for decimal numbers rather than a memory read
538 is much faster, and the resulting code is simpler (the compiler turns it
539 into a subtraction and unsigned comparison). */
540
541 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9) /* x appears twice: the argument must not have side effects */
542
543 #ifndef EBCDIC
544
545 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
546 UTF-8 mode. */
547
548 static const pcre_uint8 digitab[] =
549 {
550 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
551 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
552 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
553 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
554 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
555 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
556 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
557 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
558 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
559 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
560 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
561 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
562 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
563 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
564 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
565 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
566 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
567 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
568 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
569 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
570 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
571 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
572 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
573 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
574 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
575 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
576 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
577 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
578 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
579 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
580 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
581 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
582
583 #else
584
585 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
586
587 static const pcre_uint8 digitab[] =
588 {
589 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
590 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
591 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
592 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
593 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
594 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
595 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
596 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
597 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
598 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
599 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
600 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
603 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
604 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
605 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
606 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
607 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
608 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
609 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
610 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
611 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
612 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
613 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
614 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
615 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
616 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
617 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
618 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
619 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
620 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
621
622 static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
623 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
624 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
625 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
626 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
627 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
628 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
629 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
630 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
631 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
632 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
633 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
634 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
635 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
636 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
637 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
638 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
639 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
640 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
641 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
642 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
643 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
644 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
645 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
646 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
647 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
648 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
649 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
650 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
651 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
652 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
653 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
654 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
655 #endif
656
657
658
659 /*************************************************
660 * Find an error text *
661 *************************************************/
662
663 /* The error texts are now all in one long string, to save on relocations. As
664 some of the text is of unknown length, we can't use a table of offsets.
665 Instead, just count through the strings. This is not a performance issue
666 because it happens only when there has been a compilation error.
667
668 Argument: the error number
669 Returns: pointer to the error string
670 */
671
672 static const char *
673 find_error_text(int n)
674 {
675 const char *s = error_texts;
676 for (; n > 0; n--)
677 {
678 while (*s++ != CHAR_NULL) {};
679 if (*s == CHAR_NULL) return "Error text not found (please report)";
680 }
681 return s;
682 }
683
684
685 /*************************************************
686 * Expand the workspace *
687 *************************************************/
688
689 /* This function is called during the second compiling phase, if the number of
690 forward references fills the existing workspace, which is originally a block on
691 the stack. A larger block is obtained from malloc() unless the ultimate limit
692 has been reached or the increase will be rather small.
693
694 Argument: pointer to the compile data block
695 Returns: 0 if all went well, else an error number
696 */
697
698 static int
699 expand_workspace(compile_data *cd)
700 {
701 pcre_uchar *newspace;
702 int newsize = cd->workspace_size * 2;
703
704 if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
705 if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
706 newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
707 return ERR72;
708
709 newspace = (PUBL(malloc))(IN_UCHARS(newsize));
710 if (newspace == NULL) return ERR21;
711 memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
712 cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
713 if (cd->workspace_size > COMPILE_WORK_SIZE)
714 (PUBL(free))((void *)cd->start_workspace);
715 cd->start_workspace = newspace;
716 cd->workspace_size = newsize;
717 return 0;
718 }
719
720
721
722 /*************************************************
723 * Check for counted repeat *
724 *************************************************/
725
726 /* This function is called when a '{' is encountered in a place where it might
727 start a quantifier. It looks ahead to see if it really is a quantifier or not.
728 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
729 where the ddds are digits.
730
731 Arguments:
732 p pointer to the first char after '{'
733
734 Returns: TRUE or FALSE
735 */
736
737 static BOOL
738 is_counted_repeat(const pcre_uchar *p)
739 {
740 if (!IS_DIGIT(*p)) return FALSE;
741 p++;
742 while (IS_DIGIT(*p)) p++;
743 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
744
745 if (*p++ != CHAR_COMMA) return FALSE;
746 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
747
748 if (!IS_DIGIT(*p)) return FALSE;
749 p++;
750 while (IS_DIGIT(*p)) p++;
751
752 return (*p == CHAR_RIGHT_CURLY_BRACKET);
753 }
754
755
756
757 /*************************************************
758 * Handle escapes *
759 *************************************************/
760
761 /* This function is called when a \ has been encountered. It either returns a
762 positive value for a simple escape such as \n, or 0 for a data character
763 which will be placed in chptr. A backreference to group n is returned as
764 negative n. When UTF-8 is enabled, a positive value greater than 255 may
765 be returned in chptr.
766 On entry,ptr is pointing at the \. On exit, it is on the final character of the
767 escape sequence.
768
769 Arguments:
770 ptrptr points to the pattern position pointer
771 chptr points to the data character
772 errorcodeptr points to the errorcode variable
773 bracount number of previous extracting brackets
774 options the options bits
775 isclass TRUE if inside a character class
776
777 Returns: zero => a data character
778 positive => a special escape sequence
779 negative => a back reference
780 on error, errorcodeptr is set
781 */
782
783 static int
784 check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
785 int bracount, int options, BOOL isclass)
786 {
787 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
788 BOOL utf = (options & PCRE_UTF8) != 0;
789 const pcre_uchar *ptr = *ptrptr + 1;
790 pcre_uint32 c;
791 int escape = 0;
792 int i;
793
794 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
795 ptr--; /* Set pointer back to the last byte */
796
797 /* If backslash is at the end of the pattern, it's an error. */
798
799 if (c == CHAR_NULL) *errorcodeptr = ERR1;
800
801 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
802 in a table. A non-zero result is something that can be returned immediately.
803 Otherwise further processing may be required. */
804
805 #ifndef EBCDIC /* ASCII/UTF-8 coding */
806 /* Not alphanumeric */
807 else if (c < CHAR_0 || c > CHAR_z) {}
808 else if ((i = escapes[c - CHAR_0]) != 0)
809 { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
810
811 #else /* EBCDIC coding */
812 /* Not alphanumeric */
813 else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
814 else if ((i = escapes[c - 0x48]) != 0) { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
815 #endif
816
817 /* Escapes that need further processing, or are illegal. */
818
819 else
820 {
821 const pcre_uchar *oldptr;
822 BOOL braced, negated, overflow;
823 int s;
824
825 switch (c)
826 {
827 /* A number of Perl escapes are not handled by PCRE. We give an explicit
828 error. */
829
830 case CHAR_l:
831 case CHAR_L:
832 *errorcodeptr = ERR37;
833 break;
834
835 case CHAR_u:
836 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
837 {
838 /* In JavaScript, \u must be followed by four hexadecimal numbers.
839 Otherwise it is a lowercase u letter. */
840 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
841 && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
842 && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
843 && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
844 {
845 c = 0;
846 for (i = 0; i < 4; ++i)
847 {
848 register pcre_uint32 cc = *(++ptr);
849 #ifndef EBCDIC /* ASCII/UTF-8 coding */
850 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
851 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
852 #else /* EBCDIC coding */
853 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
854 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
855 #endif
856 }
857
858 #if defined COMPILE_PCRE8
859 if (c > (utf ? 0x10ffffU : 0xffU))
860 #elif defined COMPILE_PCRE16
861 if (c > (utf ? 0x10ffffU : 0xffffU))
862 #elif defined COMPILE_PCRE32
863 if (utf && c > 0x10ffffU)
864 #endif
865 {
866 *errorcodeptr = ERR76;
867 }
868 else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
869 }
870 }
871 else
872 *errorcodeptr = ERR37;
873 break;
874
875 case CHAR_U:
876 /* In JavaScript, \U is an uppercase U letter. */
877 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
878 break;
879
880 /* In a character class, \g is just a literal "g". Outside a character
881 class, \g must be followed by one of a number of specific things:
882
883 (1) A number, either plain or braced. If positive, it is an absolute
884 backreference. If negative, it is a relative backreference. This is a Perl
885 5.10 feature.
886
887 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
888 is part of Perl's movement towards a unified syntax for back references. As
889 this is synonymous with \k{name}, we fudge it up by pretending it really
890 was \k.
891
892 (3) For Oniguruma compatibility we also support \g followed by a name or a
893 number either in angle brackets or in single quotes. However, these are
894 (possibly recursive) subroutine calls, _not_ backreferences. Just return
895 the ESC_g code (cf \k). */
896
897 case CHAR_g:
898 if (isclass) break;
899 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
900 {
901 escape = ESC_g;
902 break;
903 }
904
905 /* Handle the Perl-compatible cases */
906
907 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
908 {
909 const pcre_uchar *p;
910 for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
911 if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
912 if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
913 {
914 escape = ESC_k;
915 break;
916 }
917 braced = TRUE;
918 ptr++;
919 }
920 else braced = FALSE;
921
922 if (ptr[1] == CHAR_MINUS)
923 {
924 negated = TRUE;
925 ptr++;
926 }
927 else negated = FALSE;
928
929 /* The integer range is limited by the machine's int representation. */
930 s = 0;
931 overflow = FALSE;
932 while (IS_DIGIT(ptr[1]))
933 {
934 if (s > INT_MAX / 10 - 1) /* Integer overflow */
935 {
936 overflow = TRUE;
937 break;
938 }
939 s = s * 10 + (int)(*(++ptr) - CHAR_0);
940 }
941 if (overflow) /* Integer overflow */
942 {
943 while (IS_DIGIT(ptr[1]))
944 ptr++;
945 *errorcodeptr = ERR61;
946 break;
947 }
948
949 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
950 {
951 *errorcodeptr = ERR57;
952 break;
953 }
954
955 if (s == 0)
956 {
957 *errorcodeptr = ERR58;
958 break;
959 }
960
961 if (negated)
962 {
963 if (s > bracount)
964 {
965 *errorcodeptr = ERR15;
966 break;
967 }
968 s = bracount - (s - 1);
969 }
970
971 escape = -s;
972 break;
973
974 /* The handling of escape sequences consisting of a string of digits
975 starting with one that is not zero is not straightforward. By experiment,
976 the way Perl works seems to be as follows:
977
978 Outside a character class, the digits are read as a decimal number. If the
979 number is less than 10, or if there are that many previous extracting
980 left brackets, then it is a back reference. Otherwise, up to three octal
981 digits are read to form an escaped byte. Thus \123 is likely to be octal
982 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
983 value is greater than 377, the least significant 8 bits are taken. Inside a
984 character class, \ followed by a digit is always an octal number. */
985
986 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
987 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
988
989 if (!isclass)
990 {
991 oldptr = ptr;
992 /* The integer range is limited by the machine's int representation. */
993 s = (int)(c -CHAR_0);
994 overflow = FALSE;
995 while (IS_DIGIT(ptr[1]))
996 {
997 if (s > INT_MAX / 10 - 1) /* Integer overflow */
998 {
999 overflow = TRUE;
1000 break;
1001 }
1002 s = s * 10 + (int)(*(++ptr) - CHAR_0);
1003 }
1004 if (overflow) /* Integer overflow */
1005 {
1006 while (IS_DIGIT(ptr[1]))
1007 ptr++;
1008 *errorcodeptr = ERR61;
1009 break;
1010 }
1011 if (s < 10 || s <= bracount)
1012 {
1013 escape = -s;
1014 break;
1015 }
1016 ptr = oldptr; /* Put the pointer back and fall through */
1017 }
1018
1019 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
1020 generates a binary zero byte and treats the digit as a following literal.
1021 Thus we have to pull back the pointer by one. */
1022
1023 if ((c = *ptr) >= CHAR_8)
1024 {
1025 ptr--;
1026 c = 0;
1027 break;
1028 }
1029
1030 /* \0 always starts an octal number, but we may drop through to here with a
1031 larger first octal digit. The original code used just to take the least
1032 significant 8 bits of octal numbers (I think this is what early Perls used
1033 to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1034 but no more than 3 octal digits. */
1035
1036 case CHAR_0:
1037 c -= CHAR_0;
1038 while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1039 c = c * 8 + *(++ptr) - CHAR_0;
1040 #ifdef COMPILE_PCRE8
1041 if (!utf && c > 0xff) *errorcodeptr = ERR51;
1042 #endif
1043 break;
1044
1045 /* \x is complicated. \x{ddd} is a character number which can be greater
1046 than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.
1047 If not, { is treated as a data character. */
1048
1049 case CHAR_x:
1050 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1051 {
1052 /* In JavaScript, \x must be followed by two hexadecimal numbers.
1053 Otherwise it is a lowercase x letter. */
1054 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1055 && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1056 {
1057 c = 0;
1058 for (i = 0; i < 2; ++i)
1059 {
1060 register pcre_uint32 cc = *(++ptr);
1061 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1062 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1063 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1064 #else /* EBCDIC coding */
1065 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1066 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1067 #endif
1068 }
1069 }
1070 break;
1071 }
1072
1073 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1074 {
1075 const pcre_uchar *pt = ptr + 2;
1076
1077 c = 0;
1078 overflow = FALSE;
1079 while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)
1080 {
1081 register pcre_uint32 cc = *pt++;
1082 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1083
1084 #ifdef COMPILE_PCRE32
1085 if (c >= 0x10000000l) { overflow = TRUE; break; }
1086 #endif
1087
1088 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1089 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1090 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1091 #else /* EBCDIC coding */
1092 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1093 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1094 #endif
1095
1096 #if defined COMPILE_PCRE8
1097 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1098 #elif defined COMPILE_PCRE16
1099 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1100 #elif defined COMPILE_PCRE32
1101 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1102 #endif
1103 }
1104
1105 if (overflow)
1106 {
1107 while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++;
1108 *errorcodeptr = ERR34;
1109 }
1110
1111 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
1112 {
1113 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1114 ptr = pt;
1115 break;
1116 }
1117
1118 /* If the sequence of hex digits does not end with '}', then we don't
1119 recognize this construct; fall through to the normal \x handling. */
1120 }
1121
1122 /* Read just a single-byte hex-defined char */
1123
1124 c = 0;
1125 while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1126 {
1127 pcre_uint32 cc; /* Some compilers don't like */
1128 cc = *(++ptr); /* ++ in initializers */
1129 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1130 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1131 c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1132 #else /* EBCDIC coding */
1133 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
1134 c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1135 #endif
1136 }
1137 break;
1138
1139 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1140 An error is given if the byte following \c is not an ASCII character. This
1141 coding is ASCII-specific, but then the whole concept of \cx is
1142 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1143
1144 case CHAR_c:
1145 c = *(++ptr);
1146 if (c == CHAR_NULL)
1147 {
1148 *errorcodeptr = ERR2;
1149 break;
1150 }
1151 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1152 if (c > 127) /* Excludes all non-ASCII in either mode */
1153 {
1154 *errorcodeptr = ERR68;
1155 break;
1156 }
1157 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1158 c ^= 0x40;
1159 #else /* EBCDIC coding */
1160 if (c >= CHAR_a && c <= CHAR_z) c += 64;
1161 c ^= 0xC0;
1162 #endif
1163 break;
1164
1165 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1166 other alphanumeric following \ is an error if PCRE_EXTRA was set;
1167 otherwise, for Perl compatibility, it is a literal. This code looks a bit
1168 odd, but there used to be some cases other than the default, and there may
1169 be again in future, so I haven't "optimized" it. */
1170
1171 default:
1172 if ((options & PCRE_EXTRA) != 0) switch(c)
1173 {
1174 default:
1175 *errorcodeptr = ERR3;
1176 break;
1177 }
1178 break;
1179 }
1180 }
1181
1182 /* Perl supports \N{name} for character names, as well as plain \N for "not
1183 newline". PCRE does not support \N{name}. However, it does support
1184 quantification such as \N{2,3}. */
1185
1186 if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1187 !is_counted_repeat(ptr+2))
1188 *errorcodeptr = ERR37;
1189
1190 /* If PCRE_UCP is set, we change the values for \d etc. */
1191
1192 if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
1193 escape += (ESC_DU - ESC_D);
1194
1195 /* Set the pointer to the final character before returning. */
1196
1197 *ptrptr = ptr;
1198 *chptr = c;
1199 return escape;
1200 }
1201
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Parse the property name that follows \P or \p and look it up in the table
of known Unicode property names. This function exists only when PCRE is built
with support for Unicode properties. On entry, *ptrptr points at the P or p.
On exit, it points at the final character of the escape sequence.

Argument:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  ptypeptr       points to an unsigned int that is set to the type value
  pdataptr       points to an unsigned int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         TRUE if the type value was found, or FALSE for an invalid type
*/

static BOOL
get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
  unsigned int *pdataptr, int *errorcodeptr)
{
const pcre_uchar *ptr = *ptrptr;
pcre_uchar name[32];
pcre_uchar c;
int namelen, lo, hi;

c = *(++ptr);
if (c == CHAR_NULL) goto ERROR_RETURN;

*negptr = FALSE;

/* Braced form: {name} or {^name}, the circumflex requesting negation. The
name must fit the buffer with room for a terminating zero, and the closing
brace must be reached before the pattern ends. */

if (c == CHAR_LEFT_CURLY_BRACKET)
  {
  if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
    {
    *negptr = TRUE;
    ptr++;
    }
  for (namelen = 0;
       namelen < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1;
       namelen++)
    {
    c = *(++ptr);
    if (c == CHAR_NULL) goto ERROR_RETURN;
    if (c == CHAR_RIGHT_CURLY_BRACKET) break;
    name[namelen] = c;
    }
  if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
  name[namelen] = 0;
  }

/* Unbraced form: the name is the single character just read. */

else
  {
  name[0] = c;
  name[1] = 0;
  }

*ptrptr = ptr;

/* Binary chop through the property name table. */

lo = 0;
hi = PRIV(utt_size);

while (lo < hi)
  {
  int mid = (lo + hi) / 2;
  int r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[mid].name_offset);
  if (r == 0)
    {
    *ptypeptr = PRIV(utt)[mid].type;
    *pdataptr = PRIV(utt)[mid].value;
    return TRUE;
    }
  if (r > 0) lo = mid + 1;
  else hi = mid;
  }

/* Well-formed but unknown name; *ptrptr has already been updated above. */

*errorcodeptr = ERR47;
return FALSE;

/* Malformed sequence */

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return FALSE;
}
#endif
1296
1297
1298
1299
1300 /*************************************************
1301 * Read repeat counts *
1302 *************************************************/
1303
1304 /* Read an item of the form {n,m} and return the values. This is called only
1305 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1306 so the syntax is guaranteed to be correct, but we need to check the values.
1307
1308 Arguments:
1309 p pointer to first char after '{'
1310 minp pointer to int for min
1311 maxp pointer to int for max
1312 returned as -1 if no max
1313 errorcodeptr points to error code variable
1314
1315 Returns: pointer to '}' on success;
1316 current ptr on error, with errorcodeptr set non-zero
1317 */
1318
1319 static const pcre_uchar *
1320 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1321 {
1322 int min = 0;
1323 int max = -1;
1324
1325 /* Read the minimum value and do a paranoid check: a negative value indicates
1326 an integer overflow. */
1327
1328 while (IS_DIGIT(*p)) min = min * 10 + (int)(*p++ - CHAR_0);
1329 if (min < 0 || min > 65535)
1330 {
1331 *errorcodeptr = ERR5;
1332 return p;
1333 }
1334
1335 /* Read the maximum value if there is one, and again do a paranoid on its size.
1336 Also, max must not be less than min. */
1337
1338 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1339 {
1340 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1341 {
1342 max = 0;
1343 while(IS_DIGIT(*p)) max = max * 10 + (int)(*p++ - CHAR_0);
1344 if (max < 0 || max > 65535)
1345 {
1346 *errorcodeptr = ERR5;
1347 return p;
1348 }
1349 if (max < min)
1350 {
1351 *errorcodeptr = ERR4;
1352 return p;
1353 }
1354 }
1355 }
1356
1357 /* Fill in the required variables, and pass back the pointer to the terminating
1358 '}'. */
1359
1360 *minp = min;
1361 *maxp = max;
1362 return p;
1363 }
1364
1365
1366
1367 /*************************************************
1368 * Find first significant op code *
1369 *************************************************/
1370
1371 /* This is called by several functions that scan a compiled expression looking
1372 for a fixed first character, or an anchoring op code etc. It skips over things
1373 that do not influence this. For some calls, it makes sense to skip negative
1374 forward and all backward assertions, and also the \b assertion; for others it
1375 does not.
1376
1377 Arguments:
1378 code pointer to the start of the group
1379 skipassert TRUE if certain assertions are to be skipped
1380
1381 Returns: pointer to the first significant opcode
1382 */
1383
1384 static const pcre_uchar*
1385 first_significant_code(const pcre_uchar *code, BOOL skipassert)
1386 {
1387 for (;;)
1388 {
1389 switch ((int)*code)
1390 {
1391 case OP_ASSERT_NOT:
1392 case OP_ASSERTBACK:
1393 case OP_ASSERTBACK_NOT:
1394 if (!skipassert) return code;
1395 do code += GET(code, 1); while (*code == OP_ALT);
1396 code += PRIV(OP_lengths)[*code];
1397 break;
1398
1399 case OP_WORD_BOUNDARY:
1400 case OP_NOT_WORD_BOUNDARY:
1401 if (!skipassert) return code;
1402 /* Fall through */
1403
1404 case OP_CALLOUT:
1405 case OP_CREF:
1406 case OP_NCREF:
1407 case OP_RREF:
1408 case OP_NRREF:
1409 case OP_DEF:
1410 code += PRIV(OP_lengths)[*code];
1411 break;
1412
1413 default:
1414 return code;
1415 }
1416 }
1417 /* Control never reaches here */
1418 }
1419
1420
1421
1422
1423 /*************************************************
1424 * Find the fixed length of a branch *
1425 *************************************************/
1426
1427 /* Scan a branch and compute the fixed length of subject that will match it,
1428 if the length is fixed. This is needed for dealing with backward assertions.
1429 In UTF8 mode, the result is in characters rather than bytes. The branch is
1430 temporarily terminated with OP_END when this function is called.
1431
1432 This function is called when a backward assertion is encountered, so that if it
1433 fails, the error message can point to the correct place in the pattern.
1434 However, we cannot do this when the assertion contains subroutine calls,
1435 because they can be forward references. We solve this by remembering this case
1436 and doing the check at the end; a flag specifies which mode we are running in.
1437
1438 Arguments:
1439 code points to the start of the pattern (the bracket)
1440 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
1441 atend TRUE if called when the pattern is complete
1442 cd the "compile data" structure
1443
1444 Returns: the fixed length,
1445 or -1 if there is no fixed length,
1446 or -2 if \C was encountered (in UTF-8 mode only)
1447 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1448 or -4 if an unknown opcode was encountered (internal error)
1449 */
1450
1451 static int
1452 find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
1453 {
1454 int length = -1;
1455
1456 register int branchlength = 0;
1457 register pcre_uchar *cc = code + 1 + LINK_SIZE;
1458
1459 /* Scan along the opcodes for this branch. If we get to the end of the
1460 branch, check the length against that of the other branches. */
1461
1462 for (;;)
1463 {
1464 int d;
1465 pcre_uchar *ce, *cs;
1466 register pcre_uchar op = *cc;
1467
1468 switch (op)
1469 {
1470 /* We only need to continue for OP_CBRA (normal capturing bracket) and
1471 OP_BRA (normal non-capturing bracket) because the other variants of these
1472 opcodes are all concerned with unlimited repeated groups, which of course
1473 are not of fixed length. */
1474
1475 case OP_CBRA:
1476 case OP_BRA:
1477 case OP_ONCE:
1478 case OP_ONCE_NC:
1479 case OP_COND:
1480 d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
1481 if (d < 0) return d;
1482 branchlength += d;
1483 do cc += GET(cc, 1); while (*cc == OP_ALT);
1484 cc += 1 + LINK_SIZE;
1485 break;
1486
1487 /* Reached end of a branch; if it's a ket it is the end of a nested call.
1488 If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1489 an ALT. If it is END it's the end of the outer call. All can be handled by
1490 the same code. Note that we must not include the OP_KETRxxx opcodes here,
1491 because they all imply an unlimited repeat. */
1492
1493 case OP_ALT:
1494 case OP_KET:
1495 case OP_END:
1496 case OP_ACCEPT:
1497 case OP_ASSERT_ACCEPT:
1498 if (length < 0) length = branchlength;
1499 else if (length != branchlength) return -1;
1500 if (*cc != OP_ALT) return length;
1501 cc += 1 + LINK_SIZE;
1502 branchlength = 0;
1503 break;
1504
1505 /* A true recursion implies not fixed length, but a subroutine call may
1506 be OK. If the subroutine is a forward reference, we can't deal with
1507 it until the end of the pattern, so return -3. */
1508
1509 case OP_RECURSE:
1510 if (!atend) return -3;
1511 cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1512 do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1513 if (cc > cs && cc < ce) return -1; /* Recursion */
1514 d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
1515 if (d < 0) return d;
1516 branchlength += d;
1517 cc += 1 + LINK_SIZE;
1518 break;
1519
1520 /* Skip over assertive subpatterns */
1521
1522 case OP_ASSERT:
1523 case OP_ASSERT_NOT:
1524 case OP_ASSERTBACK:
1525 case OP_ASSERTBACK_NOT:
1526 do cc += GET(cc, 1); while (*cc == OP_ALT);
1527 cc += PRIV(OP_lengths)[*cc];
1528 break;
1529
1530 /* Skip over things that don't match chars */
1531
1532 case OP_MARK:
1533 case OP_PRUNE_ARG:
1534 case OP_SKIP_ARG:
1535 case OP_THEN_ARG:
1536 cc += cc[1] + PRIV(OP_lengths)[*cc];
1537 break;
1538
1539 case OP_CALLOUT:
1540 case OP_CIRC:
1541 case OP_CIRCM:
1542 case OP_CLOSE:
1543 case OP_COMMIT:
1544 case OP_CREF:
1545 case OP_DEF:
1546 case OP_DOLL:
1547 case OP_DOLLM:
1548 case OP_EOD:
1549 case OP_EODN:
1550 case OP_FAIL:
1551 case OP_NCREF:
1552 case OP_NRREF:
1553 case OP_NOT_WORD_BOUNDARY:
1554 case OP_PRUNE:
1555 case OP_REVERSE:
1556 case OP_RREF:
1557 case OP_SET_SOM:
1558 case OP_SKIP:
1559 case OP_SOD:
1560 case OP_SOM:
1561 case OP_THEN:
1562 case OP_WORD_BOUNDARY:
1563 cc += PRIV(OP_lengths)[*cc];
1564 break;
1565
1566 /* Handle literal characters */
1567
1568 case OP_CHAR:
1569 case OP_CHARI:
1570 case OP_NOT:
1571 case OP_NOTI:
1572 branchlength++;
1573 cc += 2;
1574 #ifdef SUPPORT_UTF
1575 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1576 #endif
1577 break;
1578
1579 /* Handle exact repetitions. The count is already in characters, but we
1580 need to skip over a multibyte character in UTF8 mode. */
1581
1582 case OP_EXACT:
1583 case OP_EXACTI:
1584 case OP_NOTEXACT:
1585 case OP_NOTEXACTI:
1586 branchlength += (int)GET2(cc,1);
1587 cc += 2 + IMM2_SIZE;
1588 #ifdef SUPPORT_UTF
1589 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1590 #endif
1591 break;
1592
1593 case OP_TYPEEXACT:
1594 branchlength += GET2(cc,1);
1595 if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1596 cc += 2;
1597 cc += 1 + IMM2_SIZE + 1;
1598 break;
1599
1600 /* Handle single-char matchers */
1601
1602 case OP_PROP:
1603 case OP_NOTPROP:
1604 cc += 2;
1605 /* Fall through */
1606
1607 case OP_HSPACE:
1608 case OP_VSPACE:
1609 case OP_NOT_HSPACE:
1610 case OP_NOT_VSPACE:
1611 case OP_NOT_DIGIT:
1612 case OP_DIGIT:
1613 case OP_NOT_WHITESPACE:
1614 case OP_WHITESPACE:
1615 case OP_NOT_WORDCHAR:
1616 case OP_WORDCHAR:
1617 case OP_ANY:
1618 case OP_ALLANY:
1619 branchlength++;
1620 cc++;
1621 break;
1622
1623 /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1624 otherwise \C is coded as OP_ALLANY. */
1625
1626 case OP_ANYBYTE:
1627 return -2;
1628
1629 /* Check a class for variable quantification */
1630
1631 case OP_CLASS:
1632 case OP_NCLASS:
1633 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1634 case OP_XCLASS:
1635 /* The original code caused an unsigned overflow in 64 bit systems,
1636 so now we use a conditional statement. */
1637 if (op == OP_XCLASS)
1638 cc += GET(cc, 1);
1639 else
1640 cc += PRIV(OP_lengths)[OP_CLASS];
1641 #else
1642 cc += PRIV(OP_lengths)[OP_CLASS];
1643 #endif
1644
1645 switch (*cc)
1646 {
1647 case OP_CRPLUS:
1648 case OP_CRMINPLUS:
1649 case OP_CRSTAR:
1650 case OP_CRMINSTAR:
1651 case OP_CRQUERY:
1652 case OP_CRMINQUERY:
1653 return -1;
1654
1655 case OP_CRRANGE:
1656 case OP_CRMINRANGE:
1657 if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1658 branchlength += (int)GET2(cc,1);
1659 cc += 1 + 2 * IMM2_SIZE;
1660 break;
1661
1662 default:
1663 branchlength++;
1664 }
1665 break;
1666
1667 /* Anything else is variable length */
1668
1669 case OP_ANYNL:
1670 case OP_BRAMINZERO:
1671 case OP_BRAPOS:
1672 case OP_BRAPOSZERO:
1673 case OP_BRAZERO:
1674 case OP_CBRAPOS:
1675 case OP_EXTUNI:
1676 case OP_KETRMAX:
1677 case OP_KETRMIN:
1678 case OP_KETRPOS:
1679 case OP_MINPLUS:
1680 case OP_MINPLUSI:
1681 case OP_MINQUERY:
1682 case OP_MINQUERYI:
1683 case OP_MINSTAR:
1684 case OP_MINSTARI:
1685 case OP_MINUPTO:
1686 case OP_MINUPTOI:
1687 case OP_NOTMINPLUS:
1688 case OP_NOTMINPLUSI:
1689 case OP_NOTMINQUERY:
1690 case OP_NOTMINQUERYI:
1691 case OP_NOTMINSTAR:
1692 case OP_NOTMINSTARI:
1693 case OP_NOTMINUPTO:
1694 case OP_NOTMINUPTOI:
1695 case OP_NOTPLUS:
1696 case OP_NOTPLUSI:
1697 case OP_NOTPOSPLUS:
1698 case OP_NOTPOSPLUSI:
1699 case OP_NOTPOSQUERY:
1700 case OP_NOTPOSQUERYI:
1701 case OP_NOTPOSSTAR:
1702 case OP_NOTPOSSTARI:
1703 case OP_NOTPOSUPTO:
1704 case OP_NOTPOSUPTOI:
1705 case OP_NOTQUERY:
1706 case OP_NOTQUERYI:
1707 case OP_NOTSTAR:
1708 case OP_NOTSTARI:
1709 case OP_NOTUPTO:
1710 case OP_NOTUPTOI:
1711 case OP_PLUS:
1712 case OP_PLUSI:
1713 case OP_POSPLUS:
1714 case OP_POSPLUSI:
1715 case OP_POSQUERY:
1716 case OP_POSQUERYI:
1717 case OP_POSSTAR:
1718 case OP_POSSTARI:
1719 case OP_POSUPTO:
1720 case OP_POSUPTOI:
1721 case OP_QUERY:
1722 case OP_QUERYI:
1723 case OP_REF:
1724 case OP_REFI:
1725 case OP_DNREF:
1726 case OP_DNREFI:
1727 case OP_SBRA:
1728 case OP_SBRAPOS:
1729 case OP_SCBRA:
1730 case OP_SCBRAPOS:
1731 case OP_SCOND:
1732 case OP_SKIPZERO:
1733 case OP_STAR:
1734 case OP_STARI:
1735 case OP_TYPEMINPLUS:
1736 case OP_TYPEMINQUERY:
1737 case OP_TYPEMINSTAR:
1738 case OP_TYPEMINUPTO:
1739 case OP_TYPEPLUS:
1740 case OP_TYPEPOSPLUS:
1741 case OP_TYPEPOSQUERY:
1742 case OP_TYPEPOSSTAR:
1743 case OP_TYPEPOSUPTO:
1744 case OP_TYPEQUERY:
1745 case OP_TYPESTAR:
1746 case OP_TYPEUPTO:
1747 case OP_UPTO:
1748 case OP_UPTOI:
1749 return -1;
1750
1751 /* Catch unrecognized opcodes so that when new ones are added they
1752 are not forgotten, as has happened in the past. */
1753
1754 default:
1755 return -4;
1756 }
1757 }
1758 /* Control never gets here */
1759 }
1760
1761
1762
1763
1764 /*************************************************
1765 * Scan compiled regex for specific bracket *
1766 *************************************************/
1767
1768 /* This little function scans through a compiled pattern until it finds a
1769 capturing bracket with the given number, or, if the number is negative, an
1770 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1771 so that it can be called from pcre_study() when finding the minimum matching
1772 length.
1773
1774 Arguments:
1775 code points to start of expression
1776 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
1777 number the required bracket number or negative to find a lookbehind
1778
1779 Returns: pointer to the opcode for the bracket, or NULL if not found
1780 */
1781
1782 const pcre_uchar *
1783 PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
1784 {
1785 for (;;)
1786 {
1787 register pcre_uchar c = *code;
1788
1789 if (c == OP_END) return NULL;
1790
1791 /* XCLASS is used for classes that cannot be represented just by a bit
1792 map. This includes negated single high-valued characters. The length in
1793 the table is zero; the actual length is stored in the compiled code. */
1794
1795 if (c == OP_XCLASS) code += GET(code, 1);
1796
1797 /* Handle recursion */
1798
1799 else if (c == OP_REVERSE)
1800 {
1801 if (number < 0) return (pcre_uchar *)code;
1802 code += PRIV(OP_lengths)[c];
1803 }
1804
1805 /* Handle capturing bracket */
1806
1807 else if (c == OP_CBRA || c == OP_SCBRA ||
1808 c == OP_CBRAPOS || c == OP_SCBRAPOS)
1809 {
1810 int n = (int)GET2(code, 1+LINK_SIZE);
1811 if (n == number) return (pcre_uchar *)code;
1812 code += PRIV(OP_lengths)[c];
1813 }
1814
1815 /* Otherwise, we can get the item's length from the table, except that for
1816 repeated character types, we have to test for \p and \P, which have an extra
1817 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1818 must add in its length. */
1819
1820 else
1821 {
1822 switch(c)
1823 {
1824 case OP_TYPESTAR:
1825 case OP_TYPEMINSTAR:
1826 case OP_TYPEPLUS:
1827 case OP_TYPEMINPLUS:
1828 case OP_TYPEQUERY:
1829 case OP_TYPEMINQUERY:
1830 case OP_TYPEPOSSTAR:
1831 case OP_TYPEPOSPLUS:
1832 case OP_TYPEPOSQUERY:
1833 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1834 break;
1835
1836 case OP_TYPEUPTO:
1837 case OP_TYPEMINUPTO:
1838 case OP_TYPEEXACT:
1839 case OP_TYPEPOSUPTO:
1840 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
1841 code += 2;
1842 break;
1843
1844 case OP_MARK:
1845 case OP_PRUNE_ARG:
1846 case OP_SKIP_ARG:
1847 case OP_THEN_ARG:
1848 code += code[1];
1849 break;
1850 }
1851
1852 /* Add in the fixed length from the table */
1853
1854 code += PRIV(OP_lengths)[c];
1855
1856 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1857 a multi-byte character. The length in the table is a minimum, so we have to
1858 arrange to skip the extra bytes. */
1859
1860 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
1861 if (utf) switch(c)
1862 {
1863 case OP_CHAR:
1864 case OP_CHARI:
1865 case OP_EXACT:
1866 case OP_EXACTI:
1867 case OP_UPTO:
1868 case OP_UPTOI:
1869 case OP_MINUPTO:
1870 case OP_MINUPTOI:
1871 case OP_POSUPTO:
1872 case OP_POSUPTOI:
1873 case OP_STAR:
1874 case OP_STARI:
1875 case OP_MINSTAR:
1876 case OP_MINSTARI:
1877 case OP_POSSTAR:
1878 case OP_POSSTARI:
1879 case OP_PLUS:
1880 case OP_PLUSI:
1881 case OP_MINPLUS:
1882 case OP_MINPLUSI:
1883 case OP_POSPLUS:
1884 case OP_POSPLUSI:
1885 case OP_QUERY:
1886 case OP_QUERYI:
1887 case OP_MINQUERY:
1888 case OP_MINQUERYI:
1889 case OP_POSQUERY:
1890 case OP_POSQUERYI:
1891 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
1892 break;
1893 }
1894 #else
1895 (void)(utf); /* Keep compiler happy by referencing function argument */
1896 #endif
1897 }
1898 }
1899 }
1900
1901
1902
1903 /*************************************************
1904 * Scan compiled regex for recursion reference *
1905 *************************************************/
1906
1907 /* This little function scans through a compiled pattern until it finds an
1908 instance of OP_RECURSE.
1909
1910 Arguments:
1911 code points to start of expression
1912 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
1913
1914 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1915 */
1916
1917 static const pcre_uchar *
1918 find_recurse(const pcre_uchar *code, BOOL utf)
1919 {
1920 for (;;)
1921 {
1922 register pcre_uchar c = *code;
1923 if (c == OP_END) return NULL;
1924 if (c == OP_RECURSE) return code;
1925
1926 /* XCLASS is used for classes that cannot be represented just by a bit
1927 map. This includes negated single high-valued characters. The length in
1928 the table is zero; the actual length is stored in the compiled code. */
1929
1930 if (c == OP_XCLASS) code += GET(code, 1);
1931
1932 /* Otherwise, we can get the item's length from the table, except that for
1933 repeated character types, we have to test for \p and \P, which have an extra
1934 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1935 must add in its length. */
1936
1937 else
1938 {
1939 switch(c)
1940 {
1941 case OP_TYPESTAR:
1942 case OP_TYPEMINSTAR:
1943 case OP_TYPEPLUS:
1944 case OP_TYPEMINPLUS:
1945 case OP_TYPEQUERY:
1946 case OP_TYPEMINQUERY:
1947 case OP_TYPEPOSSTAR:
1948 case OP_TYPEPOSPLUS:
1949 case OP_TYPEPOSQUERY:
1950 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1951 break;
1952
1953 case OP_TYPEPOSUPTO:
1954 case OP_TYPEUPTO:
1955 case OP_TYPEMINUPTO:
1956 case OP_TYPEEXACT:
1957 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
1958 code += 2;
1959 break;
1960
1961 case OP_MARK:
1962 case OP_PRUNE_ARG:
1963 case OP_SKIP_ARG:
1964 case OP_THEN_ARG:
1965 code += code[1];
1966 break;
1967 }
1968
1969 /* Add in the fixed length from the table */
1970
1971 code += PRIV(OP_lengths)[c];
1972
1973 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1974 by a multi-byte character. The length in the table is a minimum, so we have
1975 to arrange to skip the extra bytes. */
1976
1977 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
1978 if (utf) switch(c)
1979 {
1980 case OP_CHAR:
1981 case OP_CHARI:
1982 case OP_NOT:
1983 case OP_NOTI:
1984 case OP_EXACT:
1985 case OP_EXACTI:
1986 case OP_NOTEXACT:
1987 case OP_NOTEXACTI:
1988 case OP_UPTO:
1989 case OP_UPTOI:
1990 case OP_NOTUPTO:
1991 case OP_NOTUPTOI:
1992 case OP_MINUPTO:
1993 case OP_MINUPTOI:
1994 case OP_NOTMINUPTO:
1995 case OP_NOTMINUPTOI:
1996 case OP_POSUPTO:
1997 case OP_POSUPTOI:
1998 case OP_NOTPOSUPTO:
1999 case OP_NOTPOSUPTOI:
2000 case OP_STAR:
2001 case OP_STARI:
2002 case OP_NOTSTAR:
2003 case OP_NOTSTARI:
2004 case OP_MINSTAR:
2005 case OP_MINSTARI:
2006 case OP_NOTMINSTAR:
2007 case OP_NOTMINSTARI:
2008 case OP_POSSTAR:
2009 case OP_POSSTARI:
2010 case OP_NOTPOSSTAR:
2011 case OP_NOTPOSSTARI:
2012 case OP_PLUS:
2013 case OP_PLUSI:
2014 case OP_NOTPLUS:
2015 case OP_NOTPLUSI:
2016 case OP_MINPLUS:
2017 case OP_MINPLUSI:
2018 case OP_NOTMINPLUS:
2019 case OP_NOTMINPLUSI:
2020 case OP_POSPLUS:
2021 case OP_POSPLUSI:
2022 case OP_NOTPOSPLUS:
2023 case OP_NOTPOSPLUSI:
2024 case OP_QUERY:
2025 case OP_QUERYI:
2026 case OP_NOTQUERY:
2027 case OP_NOTQUERYI:
2028 case OP_MINQUERY:
2029 case OP_MINQUERYI:
2030 case OP_NOTMINQUERY:
2031 case OP_NOTMINQUERYI:
2032 case OP_POSQUERY:
2033 case OP_POSQUERYI:
2034 case OP_NOTPOSQUERY:
2035 case OP_NOTPOSQUERYI:
2036 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2037 break;
2038 }
2039 #else
2040 (void)(utf); /* Keep compiler happy by referencing function argument */
2041 #endif
2042 }
2043 }
2044 }
2045
2046
2047
2048 /*************************************************
2049 * Scan compiled branch for non-emptiness *
2050 *************************************************/
2051
2052 /* This function scans through a branch of a compiled pattern to see whether it
2053 can match the empty string or not. It is called from could_be_empty()
2054 below and from compile_branch() when checking for an unlimited repeat of a
2055 group that can match nothing. Note that first_significant_code() skips over
2056 backward and negative forward assertions when its final argument is TRUE. If we
2057 hit an unclosed bracket, we return "empty" - this means we've struck an inner
2058 bracket whose current branch will already have been scanned.
2059
2060 Arguments:
2061 code points to start of search
2062 endcode points to where to stop
2063 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2064 cd contains pointers to tables etc.
2065 recurses chain of recurse_check to catch mutual recursion
2066
2067 Returns: TRUE if what is matched could be empty
2068 */
2069
2070 typedef struct recurse_check {
2071 struct recurse_check *prev;
2072 const pcre_uchar *group;
2073 } recurse_check;
2074
2075 static BOOL
2076 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2077 BOOL utf, compile_data *cd, recurse_check *recurses)
2078 {
2079 register pcre_uchar c;
2080 recurse_check this_recurse;
2081
2082 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2083 code < endcode;
2084 code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2085 {
2086 const pcre_uchar *ccode;
2087
2088 c = *code;
2089
2090 /* Skip over forward assertions; the other assertions are skipped by
2091 first_significant_code() with a TRUE final argument. */
2092
2093 if (c == OP_ASSERT)
2094 {
2095 do code += GET(code, 1); while (*code == OP_ALT);
2096 c = *code;
2097 continue;
2098 }
2099
2100 /* For a recursion/subroutine call, if its end has been reached, which
2101 implies a backward reference subroutine call, we can scan it. If it's a
2102 forward reference subroutine call, we can't. To detect forward reference
2103 we have to scan up the list that is kept in the workspace. This function is
2104 called only when doing the real compile, not during the pre-compile that
2105 measures the size of the compiled pattern. */
2106
2107 if (c == OP_RECURSE)
2108 {
2109 const pcre_uchar *scode = cd->start_code + GET(code, 1);
2110 BOOL empty_branch;
2111
2112 /* Test for forward reference or uncompleted reference. This is disabled
2113 when called to scan a completed pattern by setting cd->start_workspace to
2114 NULL. */
2115
2116 if (cd->start_workspace != NULL)
2117 {
2118 const pcre_uchar *tcode;
2119 for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2120 if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2121 if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
2122 }
2123
2124 /* If we are scanning a completed pattern, there are no forward references
2125 and all groups are complete. We need to detect whether this is a recursive
2126 call, as otherwise there will be an infinite loop. If it is a recursion,
2127 just skip over it. Simple recursions are easily detected. For mutual
2128 recursions we keep a chain on the stack. */
2129
2130 else
2131 {
2132 recurse_check *r = recurses;
2133 const pcre_uchar *endgroup = scode;
2134
2135 do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2136 if (code >= scode && code <= endgroup) continue; /* Simple recursion */
2137
2138 for (r = recurses; r != NULL; r = r->prev)
2139 if (r->group == scode) break;
2140 if (r != NULL) continue; /* Mutual recursion */
2141 }
2142
2143 /* Completed reference; scan the referenced group, remembering it on the
2144 stack chain to detect mutual recursions. */
2145
2146 empty_branch = FALSE;
2147 this_recurse.prev = recurses;
2148 this_recurse.group = scode;
2149
2150 do
2151 {
2152 if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
2153 {
2154 empty_branch = TRUE;
2155 break;
2156 }
2157 scode += GET(scode, 1);
2158 }
2159 while (*scode == OP_ALT);
2160
2161 if (!empty_branch) return FALSE; /* All branches are non-empty */
2162 continue;
2163 }
2164
2165 /* Groups with zero repeats can of course be empty; skip them. */
2166
2167 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2168 c == OP_BRAPOSZERO)
2169 {
2170 code += PRIV(OP_lengths)[c];
2171 do code += GET(code, 1); while (*code == OP_ALT);
2172 c = *code;
2173 continue;
2174 }
2175
2176 /* A nested group that is already marked as "could be empty" can just be
2177 skipped. */
2178
2179 if (c == OP_SBRA || c == OP_SBRAPOS ||
2180 c == OP_SCBRA || c == OP_SCBRAPOS)
2181 {
2182 do code += GET(code, 1); while (*code == OP_ALT);
2183 c = *code;
2184 continue;
2185 }
2186
2187 /* For other groups, scan the branches. */
2188
2189 if (c == OP_BRA || c == OP_BRAPOS ||
2190 c == OP_CBRA || c == OP_CBRAPOS ||
2191 c == OP_ONCE || c == OP_ONCE_NC ||
2192 c == OP_COND)
2193 {
2194 BOOL empty_branch;
2195 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
2196
2197 /* If a conditional group has only one branch, there is a second, implied,
2198 empty branch, so just skip over the conditional, because it could be empty.
2199 Otherwise, scan the individual branches of the group. */
2200
2201 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2202 code += GET(code, 1);
2203 else
2204 {
2205 empty_branch = FALSE;
2206 do
2207 {
2208 if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd, NULL))
2209 empty_branch = TRUE;
2210 code += GET(code, 1);
2211 }
2212 while (*code == OP_ALT);
2213 if (!empty_branch) return FALSE; /* All branches are non-empty */
2214 }
2215
2216 c = *code;
2217 continue;
2218 }
2219
2220 /* Handle the other opcodes */
2221
2222 switch (c)
2223 {
2224 /* Check for quantifiers after a class. XCLASS is used for classes that
2225 cannot be represented just by a bit map. This includes negated single
2226 high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2227 actual length is stored in the compiled code, so we must update "code"
2228 here. */
2229
2230 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2231 case OP_XCLASS:
2232 ccode = code += GET(code, 1);
2233 goto CHECK_CLASS_REPEAT;
2234 #endif
2235
2236 case OP_CLASS:
2237 case OP_NCLASS:
2238 ccode = code + PRIV(OP_lengths)[OP_CLASS];
2239
2240 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2241 CHECK_CLASS_REPEAT:
2242 #endif
2243
2244 switch (*ccode)
2245 {
2246 case OP_CRSTAR: /* These could be empty; continue */
2247 case OP_CRMINSTAR:
2248 case OP_CRQUERY:
2249 case OP_CRMINQUERY:
2250 break;
2251
2252 default: /* Non-repeat => class must match */
2253 case OP_CRPLUS: /* These repeats aren't empty */
2254 case OP_CRMINPLUS:
2255 return FALSE;
2256
2257 case OP_CRRANGE:
2258 case OP_CRMINRANGE:
2259 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2260 break;
2261 }
2262 break;
2263
2264 /* Opcodes that must match a character */
2265
2266 case OP_ANY:
2267 case OP_ALLANY:
2268 case OP_ANYBYTE:
2269
2270 case OP_PROP:
2271 case OP_NOTPROP:
2272 case OP_ANYNL:
2273
2274 case OP_NOT_HSPACE:
2275 case OP_HSPACE:
2276 case OP_NOT_VSPACE:
2277 case OP_VSPACE:
2278 case OP_EXTUNI:
2279
2280 case OP_NOT_DIGIT:
2281 case OP_DIGIT:
2282 case OP_NOT_WHITESPACE:
2283 case OP_WHITESPACE:
2284 case OP_NOT_WORDCHAR:
2285 case OP_WORDCHAR:
2286
2287 case OP_CHAR:
2288 case OP_CHARI:
2289 case OP_NOT:
2290 case OP_NOTI:
2291
2292 case OP_PLUS:
2293 case OP_PLUSI:
2294 case OP_MINPLUS:
2295 case OP_MINPLUSI:
2296
2297 case OP_NOTPLUS:
2298 case OP_NOTPLUSI:
2299 case OP_NOTMINPLUS:
2300 case OP_NOTMINPLUSI:
2301
2302 case OP_POSPLUS:
2303 case OP_POSPLUSI:
2304 case OP_NOTPOSPLUS:
2305 case OP_NOTPOSPLUSI:
2306
2307 case OP_EXACT:
2308 case OP_EXACTI:
2309 case OP_NOTEXACT:
2310 case OP_NOTEXACTI:
2311
2312 case OP_TYPEPLUS:
2313 case OP_TYPEMINPLUS:
2314 case OP_TYPEPOSPLUS:
2315 case OP_TYPEEXACT:
2316
2317 return FALSE;
2318
2319 /* These are going to continue, as they may be empty, but we have to
2320 fudge the length for the \p and \P cases. */
2321
2322 case OP_TYPESTAR:
2323 case OP_TYPEMINSTAR:
2324 case OP_TYPEPOSSTAR:
2325 case OP_TYPEQUERY:
2326 case OP_TYPEMINQUERY:
2327 case OP_TYPEPOSQUERY:
2328 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2329 break;
2330
2331 /* Same for these */
2332
2333 case OP_TYPEUPTO:
2334 case OP_TYPEMINUPTO:
2335 case OP_TYPEPOSUPTO:
2336 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2337 code += 2;
2338 break;
2339
2340 /* End of branch */
2341
2342 case OP_KET:
2343 case OP_KETRMAX:
2344 case OP_KETRMIN:
2345 case OP_KETRPOS:
2346 case OP_ALT:
2347 return TRUE;
2348
2349 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2350 MINUPTO, and POSUPTO and their caseless and negative versions may be
2351 followed by a multibyte character. */
2352
2353 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2354 case OP_STAR:
2355 case OP_STARI:
2356 case OP_NOTSTAR:
2357 case OP_NOTSTARI:
2358
2359 case OP_MINSTAR:
2360 case OP_MINSTARI:
2361 case OP_NOTMINSTAR:
2362 case OP_NOTMINSTARI:
2363
2364 case OP_POSSTAR:
2365 case OP_POSSTARI:
2366 case OP_NOTPOSSTAR:
2367 case OP_NOTPOSSTARI:
2368
2369 case OP_QUERY:
2370 case OP_QUERYI:
2371 case OP_NOTQUERY:
2372 case OP_NOTQUERYI:
2373
2374 case OP_MINQUERY:
2375 case OP_MINQUERYI:
2376 case OP_NOTMINQUERY:
2377 case OP_NOTMINQUERYI:
2378
2379 case OP_POSQUERY:
2380 case OP_POSQUERYI:
2381 case OP_NOTPOSQUERY:
2382 case OP_NOTPOSQUERYI:
2383
2384 if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2385 break;
2386
2387 case OP_UPTO:
2388 case OP_UPTOI:
2389 case OP_NOTUPTO:
2390 case OP_NOTUPTOI:
2391
2392 case OP_MINUPTO:
2393 case OP_MINUPTOI:
2394 case OP_NOTMINUPTO:
2395 case OP_NOTMINUPTOI:
2396
2397 case OP_POSUPTO:
2398 case OP_POSUPTOI:
2399 case OP_NOTPOSUPTO:
2400 case OP_NOTPOSUPTOI:
2401
2402 if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2403 break;
2404 #endif
2405
2406 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2407 string. */
2408
2409 case OP_MARK:
2410 case OP_PRUNE_ARG:
2411 case OP_SKIP_ARG:
2412 case OP_THEN_ARG:
2413 code += code[1];
2414 break;
2415
2416 /* None of the remaining opcodes are required to match a character. */
2417
2418 default:
2419 break;
2420 }
2421 }
2422
2423 return TRUE;
2424 }
2425
2426
2427
2428 /*************************************************
2429 * Scan compiled regex for non-emptiness *
2430 *************************************************/
2431
2432 /* This function is called to check for left recursive calls. We want to check
2433 the current branch of the current pattern to see if it could match the empty
2434 string. If it could, we must look outwards for branches at other levels,
2435 stopping when we pass beyond the bracket which is the subject of the recursion.
2436 This function is called only during the real compile, not during the
2437 pre-compile.
2438
2439 Arguments:
2440 code points to start of the recursion
2441 endcode points to where to stop (current RECURSE item)
2442 bcptr points to the chain of current (unclosed) branch starts
2443 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2444 cd pointers to tables etc
2445
2446 Returns: TRUE if what is matched could be empty
2447 */
2448
2449 static BOOL
2450 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2451 branch_chain *bcptr, BOOL utf, compile_data *cd)
2452 {
2453 while (bcptr != NULL && bcptr->current_branch >= code)
2454 {
2455 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL))
2456 return FALSE;
2457 bcptr = bcptr->outer;
2458 }
2459 return TRUE;
2460 }
2461
2462
2463
2464 /*************************************************
2465 * Check for POSIX class syntax *
2466 *************************************************/
2467
2468 /* This function is called when the sequence "[:" or "[." or "[=" is
2469 encountered in a character class. It checks whether this is followed by a
2470 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2471 reach an unescaped ']' without the special preceding character, return FALSE.
2472
2473 Originally, this function only recognized a sequence of letters between the
2474 terminators, but it seems that Perl recognizes any sequence of characters,
2475 though of course unknown POSIX names are subsequently rejected. Perl gives an
2476 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2477 didn't consider this to be a POSIX class. Likewise for [:1234:].
2478
2479 The problem in trying to be exactly like Perl is in the handling of escapes. We
2480 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2481 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2482 below handles the special case of \], but does not try to do any other escape
2483 processing. This makes it different from Perl for cases such as [:l\ower:]
2484 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2485 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
2486 I think.
2487
2488 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2489 It seems that the appearance of a nested POSIX class supersedes an apparent
2490 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2491 a digit.
2492
2493 In Perl, unescaped square brackets may also appear as part of class names. For
2494 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2495 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2496 seem right at all. PCRE does not allow closing square brackets in POSIX class
2497 names.
2498
2499 Arguments:
2500 ptr pointer to the initial [
2501 endptr where to return the end pointer
2502
2503 Returns: TRUE or FALSE
2504 */
2505
2506 static BOOL
2507 check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
2508 {
2509 pcre_uchar terminator; /* Don't combine these lines; the Solaris cc */
2510 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2511 for (++ptr; *ptr != CHAR_NULL; ptr++)
2512 {
2513 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2514 ptr++;
2515 else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2516 else
2517 {
2518 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2519 {
2520 *endptr = ptr;
2521 return TRUE;
2522 }
2523 if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
2524 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2525 ptr[1] == CHAR_EQUALS_SIGN) &&
2526 check_posix_syntax(ptr, endptr))
2527 return FALSE;
2528 }
2529 }
2530 return FALSE;
2531 }
2532
2533
2534
2535
2536 /*************************************************
2537 * Check POSIX class name *
2538 *************************************************/
2539
2540 /* This function is called to check the name given in a POSIX-style class entry
2541 such as [:alnum:].
2542
2543 Arguments:
2544 ptr points to the first letter
2545 len the length of the name
2546
2547 Returns: a value representing the name, or -1 if unknown
2548 */
2549
2550 static int
2551 check_posix_name(const pcre_uchar *ptr, int len)
2552 {
2553 const char *pn = posix_names;
2554 register int yield = 0;
2555 while (posix_name_lengths[yield] != 0)
2556 {
2557 if (len == posix_name_lengths[yield] &&
2558 STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
2559 pn += posix_name_lengths[yield] + 1;
2560 yield++;
2561 }
2562 return -1;
2563 }
2564
2565
2566 /*************************************************
2567 * Adjust OP_RECURSE items in repeated group *
2568 *************************************************/
2569
2570 /* OP_RECURSE items contain an offset from the start of the regex to the group
2571 that is referenced. This means that groups can be replicated for fixed
2572 repetition simply by copying (because the recursion is allowed to refer to
2573 earlier groups that are outside the current group). However, when a group is
2574 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2575 inserted before it, after it has been compiled. This means that any OP_RECURSE
2576 items within it that refer to the group itself or any contained groups have to
2577 have their offsets adjusted. That one of the jobs of this function. Before it
2578 is called, the partially compiled regex must be temporarily terminated with
2579 OP_END.
2580
2581 This function has been extended with the possibility of forward references for
2582 recursions and subroutine calls. It must also check the list of such references
2583 for the group we are dealing with. If it finds that one of the recursions in
2584 the current group is on this list, it adjusts the offset in the list, not the
2585 value in the reference (which is a group number).
2586
2587 Arguments:
2588 group points to the start of the group
2589 adjust the amount by which the group is to be moved
2590 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2591 cd contains pointers to tables etc.
2592 save_hwm the hwm forward reference pointer at the start of the group
2593
2594 Returns: nothing
2595 */
2596
2597 static void
2598 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
2599 pcre_uchar *save_hwm)
2600 {
2601 pcre_uchar *ptr = group;
2602
2603 while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
2604 {
2605 int offset;
2606 pcre_uchar *hc;
2607
2608 /* See if this recursion is on the forward reference list. If so, adjust the
2609 reference. */
2610
2611 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2612 {
2613 offset = (int)GET(hc, 0);
2614 if (cd->start_code + offset == ptr + 1)
2615 {
2616 PUT(hc, 0, offset + adjust);
2617 break;
2618 }
2619 }
2620
2621 /* Otherwise, adjust the recursion offset if it's after the start of this
2622 group. */
2623
2624 if (hc >= cd->hwm)
2625 {
2626 offset = (int)GET(ptr, 1);
2627 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2628 }
2629
2630 ptr += 1 + LINK_SIZE;
2631 }
2632 }
2633
2634
2635
2636 /*************************************************
2637 * Insert an automatic callout point *
2638 *************************************************/
2639
2640 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2641 callout points before each pattern item.
2642
2643 Arguments:
2644 code current code pointer
2645 ptr current pattern pointer
2646 cd pointers to tables etc
2647
2648 Returns: new code pointer
2649 */
2650
2651 static pcre_uchar *
2652 auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
2653 {
2654 *code++ = OP_CALLOUT;
2655 *code++ = 255;
2656 PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
2657 PUT(code, LINK_SIZE, 0); /* Default length */
2658 return code + 2 * LINK_SIZE;
2659 }
2660
2661
2662
2663 /*************************************************
2664 * Complete a callout item *
2665 *************************************************/
2666
2667 /* A callout item contains the length of the next item in the pattern, which
2668 we can't fill in till after we have reached the relevant point. This is used
2669 for both automatic and manual callouts.
2670
2671 Arguments:
2672 previous_callout points to previous callout item
2673 ptr current pattern pointer
2674 cd pointers to tables etc
2675
2676 Returns: nothing
2677 */
2678
2679 static void
2680 complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
2681 {
2682 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2683 PUT(previous_callout, 2 + LINK_SIZE, length);
2684 }
2685
2686
2687
2688 #ifdef SUPPORT_UCP
2689 /*************************************************
2690 * Get othercase range *
2691 *************************************************/
2692
2693 /* This function is passed the start and end of a class range, in UTF-8 mode
2694 with UCP support. It searches up the characters, looking for ranges of
2695 characters in the "other" case. Each call returns the next one, updating the
2696 start address. A character with multiple other cases is returned on its own
2697 with a special return value.
2698
2699 Arguments:
2700 cptr points to starting character value; updated
2701 d end value
2702 ocptr where to put start of othercase range
2703 odptr where to put end of othercase range
2704
2705 Yield: -1 when no more
2706 0 when a range is returned
2707 >0 the CASESET offset for char with multiple other cases
2708 in this case, ocptr contains the original
2709 */
2710
2711 static int
2712 get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
2713 pcre_uint32 *odptr)
2714 {
2715 pcre_uint32 c, othercase, next;
2716 unsigned int co;
2717
2718 /* Find the first character that has an other case. If it has multiple other
2719 cases, return its case offset value. */
2720
2721 for (c = *cptr; c <= d; c++)
2722 {
2723 if ((co = UCD_CASESET(c)) != 0)
2724 {
2725 *ocptr = c++; /* Character that has the set */
2726 *cptr = c; /* Rest of input range */
2727 return (int)co;
2728 }
2729 if ((othercase = UCD_OTHERCASE(c)) != c) break;
2730 }
2731
2732 if (c > d) return -1; /* Reached end of range */
2733
2734 *ocptr = othercase;
2735 next = othercase + 1;
2736
2737 for (++c; c <= d; c++)
2738 {
2739 if (UCD_OTHERCASE(c) != next) break;
2740 next++;
2741 }
2742
2743 *odptr = next - 1; /* End of othercase range */
2744 *cptr = c; /* Rest of input range */
2745 return 0;
2746 }
2747
2748
2749
2750 /*************************************************
2751 * Check a character and a property *
2752 *************************************************/
2753
2754 /* This function is called by check_auto_possessive() when a property item
2755 is adjacent to a fixed character.
2756
2757 Arguments:
2758 c the character
2759 ptype the property type
2760 pdata the data for the type
2761 negated TRUE if it's a negated property (\P or \p{^)
2762
2763 Returns: TRUE if auto-possessifying is OK
2764 */
2765
2766 static BOOL
2767 check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata, BOOL negated)
2768 {
2769 #ifdef SUPPORT_UCP
2770 const pcre_uint32 *p;
2771 #endif
2772
2773 const ucd_record *prop = GET_UCD(c);
2774
2775 switch(ptype)
2776 {
2777 case PT_LAMP:
2778 return (prop->chartype == ucp_Lu ||
2779 prop->chartype == ucp_Ll ||
2780 prop->chartype == ucp_Lt) == negated;
2781
2782 case PT_GC:
2783 return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2784
2785 case PT_PC:
2786 return (pdata == prop->chartype) == negated;
2787
2788 case PT_SC:
2789 return (pdata == prop->script) == negated;
2790
2791 /* These are specials */
2792
2793 case PT_ALNUM:
2794 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2795 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2796
2797 case PT_SPACE: /* Perl space */
2798 return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2799 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2800 == negated;
2801
2802 case PT_PXSPACE: /* POSIX space */
2803 return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2804 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2805 c == CHAR_FF || c == CHAR_CR)
2806 == negated;
2807
2808 case PT_WORD:
2809 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2810 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2811 c == CHAR_UNDERSCORE) == negated;
2812
2813 #ifdef SUPPORT_UCP
2814 case PT_CLIST:
2815 p = PRIV(ucd_caseless_sets) + prop->caseset;
2816 for (;;)
2817 {
2818 if (c < *p) return !negated;
2819 if (c == *p++) return negated;
2820 }
2821 break; /* Control never reaches here */
2822 #endif
2823 }
2824
2825 return FALSE;
2826 }
2827 #endif /* SUPPORT_UCP */
2828
2829
2830
2831 /*************************************************
2832 * Check if auto-possessifying is possible *
2833 *************************************************/
2834
2835 /* This function is called for unlimited repeats of certain items, to see
2836 whether the next thing could possibly match the repeated item. If not, it makes
2837 sense to automatically possessify the repeated item.
2838
2839 Arguments:
2840 previous pointer to the repeated opcode
2841 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2842 ptr next character in pattern
2843 options options bits
2844 cd contains pointers to tables etc.
2845
2846 Returns: TRUE if possessifying is wanted
2847 */
2848
2849 static BOOL
2850 check_auto_possessive(const pcre_uchar *previous, BOOL utf,
2851 const pcre_uchar *ptr, int options, compile_data *cd)
2852 {
2853 pcre_uint32 c = NOTACHAR;
2854 pcre_uint32 next;
2855 int escape;
2856 pcre_uchar op_code = *previous++;
2857
2858 /* Skip whitespace and comments in extended mode */
2859
2860 if ((options & PCRE_EXTENDED) != 0)
2861 {
2862 for (;;)
2863 {
2864 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2865 if (*ptr == CHAR_NUMBER_SIGN)
2866 {
2867 ptr++;
2868 while (*ptr != CHAR_NULL)
2869 {
2870 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2871 ptr++;
2872 #ifdef SUPPORT_UTF
2873 if (utf) FORWARDCHAR(ptr);
2874 #endif
2875 }
2876 }
2877 else break;
2878 }
2879 }
2880
2881 /* If the next item is one that we can handle, get its value. A non-negative
2882 value is a character, a negative value is an escape value. */
2883
2884 if (*ptr == CHAR_BACKSLASH)
2885 {
2886 int temperrorcode = 0;
2887 escape = check_escape(&ptr, &next, &temperrorcode, cd->bracount, options,
2888 FALSE);
2889 if (temperrorcode != 0) return FALSE;
2890 ptr++; /* Point after the escape sequence */
2891 }
2892 else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0)
2893 {
2894 escape = 0;
2895 #ifdef SUPPORT_UTF
2896 if (utf) { GETCHARINC(next, ptr); } else
2897 #endif
2898 next = *ptr++;
2899 }
2900 else return FALSE;
2901
2902 /* Skip whitespace and comments in extended mode */
2903
2904 if ((options & PCRE_EXTENDED) != 0)
2905 {
2906 for (;;)
2907 {
2908 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2909 if (*ptr == CHAR_NUMBER_SIGN)
2910 {
2911 ptr++;
2912 while (*ptr != CHAR_NULL)
2913 {
2914 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2915 ptr++;
2916 #ifdef SUPPORT_UTF
2917 if (utf) FORWARDCHAR(ptr);
2918 #endif
2919 }
2920 }
2921 else break;
2922 }
2923 }
2924
2925 /* If the next thing is itself optional, we have to give up. */
2926
2927 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2928 STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2929 return FALSE;
2930
2931 /* If the previous item is a character, get its value. */
2932
2933 if (op_code == OP_CHAR || op_code == OP_CHARI ||
2934 op_code == OP_NOT || op_code == OP_NOTI)
2935 {
2936 #ifdef SUPPORT_UTF
2937 GETCHARTEST(c, previous);
2938 #else
2939 c = *previous;
2940 #endif
2941 }
2942
2943 /* Now compare the next item with the previous opcode. First, handle cases when
2944 the next item is a character. */
2945
2946 if (escape == 0)
2947 {
2948 /* For a caseless UTF match, the next character may have more than one other
2949 case, which maps to the special PT_CLIST property. Check this first. */
2950
2951 #ifdef SUPPORT_UCP
2952 if (utf && c != NOTACHAR && (options & PCRE_CASELESS) != 0)
2953 {
2954 unsigned int ocs = UCD_CASESET(next);
2955 if (ocs > 0) return check_char_prop(c, PT_CLIST, ocs, op_code >= OP_NOT);
2956 }
2957 #endif
2958
2959 switch(op_code)
2960 {
2961 case OP_CHAR:
2962 return c != next;
2963
2964 /* For CHARI (caseless character) we must check the other case. If we have
2965 Unicode property support, we can use it to test the other case of
2966 high-valued characters. We know that next can have only one other case,
2967 because multi-other-case characters are dealt with above. */
2968
2969 case OP_CHARI:
2970 if (c == next) return FALSE;
2971 #ifdef SUPPORT_UTF
2972 if (utf)
2973 {
2974 pcre_uint32 othercase;
2975 if (next < 128) othercase = cd->fcc[next]; else
2976 #ifdef SUPPORT_UCP
2977 othercase = UCD_OTHERCASE(next);
2978 #else
2979 othercase = NOTACHAR;
2980 #endif
2981 return c != othercase;
2982 }
2983 else
2984 #endif /* SUPPORT_UTF */
2985 return (c != TABLE_GET(next, cd->fcc, next)); /* Not UTF */
2986
2987 case OP_NOT:
2988 return c == next;
2989
2990 case OP_NOTI:
2991 if (c == next) return TRUE;
2992 #ifdef SUPPORT_UTF
2993 if (utf)
2994 {
2995 pcre_uint32 othercase;
2996 if (next < 128) othercase = cd->fcc[next]; else
2997 #ifdef SUPPORT_UCP
2998 othercase = UCD_OTHERCASE(next);
2999 #else
3000 othercase = NOTACHAR;
3001 #endif
3002 return c == othercase;
3003 }
3004 else
3005 #endif /* SUPPORT_UTF */
3006 return (c == TABLE_GET(next, cd->fcc, next)); /* Not UTF */
3007
3008 /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
3009 When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3010
3011 case OP_DIGIT:
3012 return next > 255 || (cd->ctypes[next] & ctype_digit) == 0;
3013
3014 case OP_NOT_DIGIT:
3015 return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0;
3016
3017 case OP_WHITESPACE:
3018 return next > 255 || (cd->ctypes[next] & ctype_space) == 0;
3019
3020 case OP_NOT_WHITESPACE:
3021 return next <= 255 && (cd->ctypes[next] & ctype_space) != 0;
3022
3023 case OP_WORDCHAR:
3024 return next > 255 || (cd->ctypes[next] & ctype_word) == 0;
3025
3026 case OP_NOT_WORDCHAR:
3027 return next <= 255 && (cd->ctypes[next] & ctype_word) != 0;
3028
3029 case OP_HSPACE:
3030 case OP_NOT_HSPACE:
3031 switch(next)
3032 {
3033 HSPACE_CASES:
3034 return op_code == OP_NOT_HSPACE;
3035
3036 default:
3037 return op_code != OP_NOT_HSPACE;
3038 }
3039
3040 case OP_ANYNL:
3041 case OP_VSPACE:
3042 case OP_NOT_VSPACE:
3043 switch(next)
3044 {
3045 VSPACE_CASES:
3046 return op_code == OP_NOT_VSPACE;
3047
3048 default:
3049 return op_code != OP_NOT_VSPACE;
3050 }
3051
3052 #ifdef SUPPORT_UCP
3053 case OP_PROP:
3054 return check_char_prop(next, previous[0], previous[1], FALSE);
3055
3056 case OP_NOTPROP:
3057 return check_char_prop(next, previous[0], previous[1], TRUE);
3058 #endif
3059
3060 default:
3061 return FALSE;
3062 }
3063 }
3064
3065 /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
3066 is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
3067 generated only when PCRE_UCP is *not* set, that is, when only ASCII
3068 characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
3069 replaced by OP_PROP codes when PCRE_UCP is set. */
3070
3071 switch(op_code)
3072 {
3073 case OP_CHAR:
3074 case OP_CHARI:
3075 switch(escape)
3076 {
3077 case ESC_d:
3078 return c > 255 || (cd->ctypes[c] & ctype_digit) == 0;
3079
3080 case ESC_D:
3081 return c <= 255 && (cd->ctypes[c] & ctype_digit) != 0;
3082
3083 case ESC_s:
3084 return c > 255 || (cd->ctypes[c] & ctype_space) == 0;
3085
3086 case ESC_S:
3087 return c <= 255 && (cd->ctypes[c] & ctype_space) != 0;
3088
3089 case ESC_w:
3090 return c > 255 || (cd->ctypes[c] & ctype_word) == 0;
3091
3092 case ESC_W:
3093 return c <= 255 && (cd->ctypes[c] & ctype_word) != 0;
3094
3095 case ESC_h:
3096 case ESC_H:
3097 switch(c)
3098 {
3099 HSPACE_CASES:
3100 return escape != ESC_h;
3101
3102 default:
3103 return escape == ESC_h;
3104 }
3105
3106 case ESC_v:
3107 case ESC_V:
3108 switch(c)
3109 {
3110 VSPACE_CASES:
3111 return escape != ESC_v;
3112
3113 default:
3114 return escape == ESC_v;
3115 }
3116
3117 /* When PCRE_UCP is set, these values get generated for \d etc. Find
3118 their substitutions and process them. The result will always be either
3119 ESC_p or ESC_P. Then fall through to process those values. */
3120
3121 #ifdef SUPPORT_UCP
3122 case ESC_du:
3123 case ESC_DU:
3124 case ESC_wu:
3125 case ESC_WU:
3126 case ESC_su:
3127 case ESC_SU:
3128 {
3129 int temperrorcode = 0;
3130 ptr = substitutes[escape - ESC_DU];
3131 escape = check_escape(&ptr, &next, &temperrorcode, 0, options, FALSE);
3132 if (temperrorcode != 0) return FALSE;
3133 ptr++; /* For compatibility */
3134 }
3135 /* Fall through */
3136
3137 case ESC_p:
3138 case ESC_P:
3139 {
3140 unsigned int ptype = 0, pdata = 0;
3141 int errorcodeptr;
3142 BOOL negated;
3143
3144 ptr--; /* Make ptr point at the p or P */
3145 if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcodeptr))
3146 return FALSE;
3147 ptr++; /* Point past the final curly ket */
3148
3149 /* If the property item is optional, we have to give up. (When generated
3150 from \d etc by PCRE_UCP, this test will have been applied much earlier,
3151 to the original \d etc. At this point, ptr will point to a zero byte.) */
3152
3153 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3154 STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3155 return FALSE;
3156
3157 /* Do the property check. */
3158
3159 return check_char_prop(c, ptype, pdata, (escape == ESC_P) != negated);
3160 }
3161 #endif
3162
3163 default:
3164 return FALSE;
3165 }
3166
3167 /* In principle, support for Unicode properties should be integrated here as
3168 well. It means re-organizing the above code so as to get hold of the property
3169 values before switching on the op-code. However, I wonder how many patterns
3170 combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
3171 these op-codes are never generated.) */
3172
3173 case OP_DIGIT:
3174 return escape == ESC_D || escape == ESC_s || escape == ESC_W ||
3175 escape == ESC_h || escape == ESC_v || escape == ESC_R;
3176
3177 case OP_NOT_DIGIT:
3178 return escape == ESC_d;
3179
3180 case OP_WHITESPACE:
3181 return escape == ESC_S || escape == ESC_d || escape == ESC_w;
3182
3183 case OP_NOT_WHITESPACE:
3184 return escape == ESC_s || escape == ESC_h || escape == ESC_v || escape == ESC_R;
3185
3186 case OP_HSPACE:
3187 return escape == ESC_S || escape == ESC_H || escape == ESC_d ||
3188 escape == ESC_w || escape == ESC_v || escape == ESC_R;
3189
3190 case OP_NOT_HSPACE:
3191 return escape == ESC_h;
3192
3193 /* Can't have \S in here because VT matches \S (Perl anomaly) */
3194 case OP_ANYNL:
3195 case OP_VSPACE:
3196 return escape == ESC_V || escape == ESC_d || escape == ESC_w;
3197
3198 case OP_NOT_VSPACE:
3199 return escape == ESC_v || escape == ESC_R;
3200
3201 case OP_WORDCHAR:
3202 return escape == ESC_W || escape == ESC_s || escape == ESC_h ||
3203 escape == ESC_v || escape == ESC_R;
3204
3205 case OP_NOT_WORDCHAR:
3206 return escape == ESC_w || escape == ESC_d;
3207
3208 default:
3209 return FALSE;
3210 }
3211
3212 /* Control does not reach here */
3213 }
3214
3215
3216
3217 /*************************************************
3218 * Add a character or range to a class *
3219 *************************************************/
3220
3221 /* This function packages up the logic of adding a character or range of
3222 characters to a class. The character values in the arguments will be within the
3223 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
3224 mutually recursive with the function immediately below.
3225
3226 Arguments:
3227 classbits the bit map for characters < 256
3228 uchardptr points to the pointer for extra data
3229 options the options word
3230 cd contains pointers to tables etc.
3231 start start of range character
3232 end end of range character
3233
3234 Returns: the number of < 256 characters added
3235 the pointer to extra data is updated
3236 */
3237
3238 static int
3239 add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
3240 compile_data *cd, pcre_uint32 start, pcre_uint32 end)
3241 {
3242 pcre_uint32 c;
3243 int n8 = 0;
3244
3245 /* If caseless matching is required, scan the range and process alternate
3246 cases. In Unicode, there are 8-bit characters that have alternate cases that
3247 are greater than 255 and vice-versa. Sometimes we can just extend the original
3248 range. */
3249
3250 if ((options & PCRE_CASELESS) != 0)
3251 {
3252 #ifdef SUPPORT_UCP
3253 if ((options & PCRE_UTF8) != 0)
3254 {
3255 int rc;
3256 pcre_uint32 oc, od;
3257
3258 options &= ~PCRE_CASELESS; /* Remove for recursive calls */
3259 c = start;
3260
3261 while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
3262 {
3263 /* Handle a single character that has more than one other case. */
3264
3265 if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
3266 PRIV(ucd_caseless_sets) + rc, oc);
3267
3268 /* Do nothing if the other case range is within the original range. */
3269
3270 else if (oc >= start && od <= end) continue;
3271
3272 /* Extend the original range if there is overlap, noting that if oc < c, we
3273 can't have od > end because a subrange is always shorter than the basic
3274 range. Otherwise, use a recursive call to add the additional range. */
3275
3276 else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
3277 else if (od > end && oc <= end + 1) end = od; /* Extend upwards */
3278 else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
3279 }
3280 }
3281 else
3282 #endif /* SUPPORT_UCP */
3283
3284 /* Not UTF-mode, or no UCP */
3285
3286 for (c = start; c <= end && c < 256; c++)
3287 {
3288 SETBIT(classbits, cd->fcc[c]);
3289 n8++;
3290 }
3291 }
3292
3293 /* Now handle the original range. Adjust the final value according to the bit
3294 length - this means that the same lists of (e.g.) horizontal spaces can be used
3295 in all cases. */
3296
3297 #if defined COMPILE_PCRE8
3298 #ifdef SUPPORT_UTF
3299 if ((options & PCRE_UTF8) == 0)
3300 #endif
3301 if (end > 0xff) end = 0xff;
3302
3303 #elif defined COMPILE_PCRE16
3304 #ifdef SUPPORT_UTF
3305 if ((options & PCRE_UTF16) == 0)
3306 #endif
3307 if (end > 0xffff) end = 0xffff;
3308
3309 #endif /* COMPILE_PCRE[8|16] */
3310
3311 /* If all characters are less than 256, use the bit map. Otherwise use extra
3312 data. */
3313
3314 if (end < 0x100)
3315 {
3316 for (c = start; c <= end; c++)
3317 {
3318 n8++;
3319 SETBIT(classbits, c);
3320 }
3321 }
3322
3323 else
3324 {
3325 pcre_uchar *uchardata = *uchardptr;
3326
3327 #ifdef SUPPORT_UTF
3328 if ((options & PCRE_UTF8) != 0) /* All UTFs use the same flag bit */
3329 {
3330 if (start < end)
3331 {
3332 *uchardata++ = XCL_RANGE;
3333 uchardata += PRIV(ord2utf)(start, uchardata);
3334 uchardata += PRIV(ord2utf)(end, uchardata);
3335 }
3336 else if (start == end)
3337 {
3338 *uchardata++ = XCL_SINGLE;
3339 uchardata += PRIV(ord2utf)(start, uchardata);
3340 }
3341 }
3342 else
3343 #endif /* SUPPORT_UTF */
3344
3345 /* Without UTF support, character values are constrained by the bit length,
3346 and can only be > 256 for 16-bit and 32-bit libraries. */
3347
3348 #ifdef COMPILE_PCRE8
3349 {}
3350 #else
3351 if (start < end)
3352 {
3353 *uchardata++ = XCL_RANGE;
3354 *uchardata++ = start;
3355 *uchardata++ = end;
3356 }
3357 else if (start == end)
3358 {
3359 *uchardata++ = XCL_SINGLE;
3360 *uchardata++ = start;
3361 }
3362 #endif
3363
3364 *uchardptr = uchardata; /* Updata extra data pointer */
3365 }
3366
3367 return n8; /* Number of 8-bit characters */
3368 }
3369
3370
3371
3372
3373 /*************************************************
3374 * Add a list of characters to a class *
3375 *************************************************/
3376
3377 /* This function is used for adding a list of case-equivalent characters to a
3378 class, and also for adding a list of horizontal or vertical whitespace. If the
3379 list is in order (which it should be), ranges of characters are detected and
3380 handled appropriately. This function is mutually recursive with the function
3381 above.
3382
3383 Arguments:
3384 classbits the bit map for characters < 256
3385 uchardptr points to the pointer for extra data
3386 options the options word
3387 cd contains pointers to tables etc.
3388 p points to row of 32-bit values, terminated by NOTACHAR
3389 except character to omit; this is used when adding lists of
3390 case-equivalent characters to avoid including the one we
3391 already know about
3392
3393 Returns: the number of < 256 characters added
3394 the pointer to extra data is updated
3395 */
3396
3397 static int
3398 add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
3399 compile_data *cd, const pcre_uint32 *p, unsigned int except)
3400 {
3401 int n8 = 0;
3402 while (p[0] < NOTACHAR)
3403 {
3404 int n = 0;
3405 if (p[0] != except)
3406 {
3407 while(p[n+1] == p[0] + n + 1) n++;
3408 n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
3409 }
3410 p += n + 1;
3411 }
3412 return n8;
3413 }
3414
3415
3416
3417 /*************************************************
3418 * Add characters not in a list to a class *
3419 *************************************************/
3420
3421 /* This function is used for adding the complement of a list of horizontal or
3422 vertical whitespace to a class. The list must be in order.
3423
3424 Arguments:
3425 classbits the bit map for characters < 256
3426 uchardptr points to the pointer for extra data
3427 options the options word
3428 cd contains pointers to tables etc.
3429 p points to row of 32-bit values, terminated by NOTACHAR
3430
3431 Returns: the number of < 256 characters added
3432 the pointer to extra data is updated
3433 */
3434
3435 static int
3436 add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
3437 int options, compile_data *cd, const pcre_uint32 *p)
3438 {
3439 BOOL utf = (options & PCRE_UTF8) != 0;
3440 int n8 = 0;
3441 if (p[0] > 0)
3442 n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
3443 while (p[0] < NOTACHAR)
3444 {
3445 while (p[1] == p[0] + 1) p++;
3446 n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
3447 (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
3448 p++;
3449 }
3450 return n8;
3451 }
3452
3453
3454
3455 /*************************************************
3456 * Compile one branch *
3457 *************************************************/
3458
3459 /* Scan the pattern, compiling it into a vector. If the options are
3460 changed during the branch, the pointer is used to change the external options
3461 bits. This function is used during the pre-compile phase when we are trying
3462 to find out the amount of memory needed, as well as during the real compile
3463 phase. The value of lengthptr distinguishes the two phases.
3464
3465 Arguments:
3466 optionsptr pointer to the option bits
3467 codeptr points to the pointer to the current code point
3468 ptrptr points to the current pattern pointer
3469 errorcodeptr points to error code variable
3470 firstcharptr place to put the first required character
3471 firstcharflagsptr place to put the first character flags, or a negative number
3472 reqcharptr place to put the last required character
3473 reqcharflagsptr place to put the last required character flags, or a negative number
3474 bcptr points to current branch chain
3475 cond_depth conditional nesting depth
3476 cd contains pointers to tables etc.
3477 lengthptr NULL during the real compile phase
3478 points to length accumulator during pre-compile phase
3479
3480 Returns: TRUE on success
3481 FALSE, with *errorcodeptr set non-zero on error
3482 */
3483
3484 static BOOL
3485 compile_branch(int *optionsptr, pcre_uchar **codeptr,
3486 const pcre_uchar **ptrptr, int *errorcodeptr,
3487 pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
3488 pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
3489 branch_chain *bcptr, int cond_depth,
3490 compile_data *cd, int *lengthptr)
3491 {
3492 int repeat_type, op_type;
3493 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
3494 int bravalue = 0;
3495 int greedy_default, greedy_non_default;
3496 pcre_uint32 firstchar, reqchar;
3497 pcre_int32 firstcharflags, reqcharflags;
3498 pcre_uint32 zeroreqchar, zerofirstchar;
3499 pcre_int32 zeroreqcharflags, zerofirstcharflags;
3500 pcre_int32 req_caseopt, reqvary, tempreqvary;
3501 int options = *optionsptr; /* May change dynamically */
3502 int after_manual_callout = 0;
3503 int length_prevgroup = 0;
3504 register pcre_uint32 c;
3505 int escape;
3506 register pcre_uchar *code = *codeptr;
3507 pcre_uchar *last_code = code;
3508 pcre_uchar *orig_code = code;
3509 pcre_uchar *tempcode;
3510 BOOL inescq = FALSE;
3511 BOOL groupsetfirstchar = FALSE;
3512 const pcre_uchar *ptr = *ptrptr;
3513 const pcre_uchar *tempptr;
3514 const pcre_uchar *nestptr = NULL;
3515 pcre_uchar *previous = NULL;
3516 pcre_uchar *previous_callout = NULL;
3517 pcre_uchar *save_hwm = NULL;
3518 pcre_uint8 classbits[32];
3519
3520 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3521 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3522 dynamically as we process the pattern. */
3523
3524 #ifdef SUPPORT_UTF
3525 /* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
3526 BOOL utf = (options & PCRE_UTF8) != 0;
3527 #ifndef COMPILE_PCRE32
3528 pcre_uchar utf_chars[6];
3529 #endif
3530 #else
3531 BOOL utf = FALSE;
3532 #endif
3533
3534 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
3535 class_uchardata always so that it can be passed to add_to_class() always,
3536 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
3537 alternative calls for the different cases. */
3538
3539 pcre_uchar *class_uchardata;
3540 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3541 BOOL xclass;
3542 pcre_uchar *class_uchardata_base;
3543 #endif
3544
3545 #ifdef PCRE_DEBUG
3546 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
3547 #endif
3548
3549 /* Set up the default and non-default settings for greediness */
3550
3551 greedy_default = ((options & PCRE_UNGREEDY) != 0);
3552 greedy_non_default = greedy_default ^ 1;
3553
3554 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
3555 matching encountered yet". It gets changed to REQ_NONE if we hit something that
3556 matches a non-fixed char first char; reqchar just remains unset if we never
3557 find one.
3558
3559 When we hit a repeat whose minimum is zero, we may have to adjust these values
3560 to take the zero repeat into account. This is implemented by setting them to
3561 zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
3562 item types that can be repeated set these backoff variables appropriately. */
3563
3564 firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
3565 firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
3566
3567 /* The variable req_caseopt contains either the REQ_CASELESS value
3568 or zero, according to the current setting of the caseless flag. The
3569 REQ_CASELESS leaves the lower 28 bit empty. It is added into the
3570 firstchar or reqchar variables to record the case status of the
3571 value. This is used only for ASCII characters. */
3572
3573 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
3574
3575 /* Switch on next character until the end of the branch */
3576
3577 for (;; ptr++)
3578 {
3579 BOOL negate_class;
3580 BOOL should_flip_negation;
3581 BOOL possessive_quantifier;
3582 BOOL is_quantifier;
3583 BOOL is_recurse;
3584 BOOL reset_bracount;
3585 int class_has_8bitchar;
3586 int class_one_char;
3587 int newoptions;
3588 int recno;
3589 int refsign;
3590 int skipbytes;
3591 pcre_uint32 subreqchar, subfirstchar;
3592 pcre_int32 subreqcharflags, subfirstcharflags;
3593 int terminator;
3594 unsigned int mclength;
3595 unsigned int tempbracount;
3596 pcre_uint32 ec;
3597 pcre_uchar mcbuffer[8];
3598
3599 /* Get next character in the pattern */
3600
3601 c = *ptr;
3602
3603 /* If we are at the end of a nested substitution, revert to the outer level
3604 string. Nesting only happens one level deep. */
3605
3606 if (c == CHAR_NULL && nestptr != NULL)
3607 {
3608 ptr = nestptr;
3609 nestptr = NULL;
3610 c = *ptr;
3611 }
3612
3613 /* If we are in the pre-compile phase, accumulate the length used for the
3614 previous cycle of this loop. */
3615
3616 if (lengthptr != NULL)
3617 {
3618 #ifdef PCRE_DEBUG
3619 if (code > cd->hwm) cd->hwm = code; /* High water info */
3620 #endif
3621 if (code > cd->start_workspace + cd->workspace_size -
3622 WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
3623 {
3624 *errorcodeptr = ERR52;
3625 goto FAILED;
3626 }
3627
3628 /* There is at least one situation where code goes backwards: this is the
3629 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
3630 the class is simply eliminated. However, it is created first, so we have to
3631 allow memory for it. Therefore, don't ever reduce the length at this point.
3632 */
3633
3634 if (code < last_code) code = last_code;
3635
3636 /* Paranoid check for integer overflow */
3637
3638 if (OFLOW_MAX - *lengthptr < code - last_code)
3639 {
3640 *errorcodeptr = ERR20;
3641 goto FAILED;
3642 }
3643
3644 *lengthptr += (int)(code - last_code);
3645 DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
3646 (int)(code - last_code), c, c));
3647
3648 /* If "previous" is set and it is not at the start of the work space, move
3649 it back to there, in order to avoid filling up the work space. Otherwise,
3650 if "previous" is NULL, reset the current code pointer to the start. */
3651
3652 if (previous != NULL)
3653 {
3654 if (previous > orig_code)
3655 {
3656 memmove(orig_code, previous, IN_UCHARS(code - previous));
3657 code -= previous - orig_code;
3658 previous = orig_code;
3659 }
3660 }
3661 else code = orig_code;
3662
3663 /* Remember where this code item starts so we can pick up the length
3664 next time round. */
3665
3666 last_code = code;
3667 }
3668
3669 /* In the real compile phase, just check the workspace used by the forward
3670 reference list. */
3671
3672 else if (cd->hwm > cd->start_workspace + cd->workspace_size -
3673 WORK_SIZE_SAFETY_MARGIN)
3674 {
3675 *errorcodeptr = ERR52;
3676 goto FAILED;
3677 }
3678
3679 /* If in \Q...\E, check for the end; if not, we have a literal */
3680
3681 if (inescq && c != CHAR_NULL)
3682 {
3683 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3684 {
3685 inescq = FALSE;
3686 ptr++;
3687 continue;
3688 }
3689 else
3690 {
3691 if (previous_callout != NULL)
3692 {
3693 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3694 complete_callout(previous_callout, ptr, cd);
3695 previous_callout = NULL;
3696 }
3697 if ((options & PCRE_AUTO_CALLOUT) != 0)
3698 {
3699 previous_callout = code;
3700 code = auto_callout(code, ptr, cd);
3701 }
3702 goto NORMAL_CHAR;
3703 }
3704 }
3705
3706 /* Fill in length of a previous callout, except when the next thing is
3707 a quantifier. */
3708
3709 is_quantifier =
3710 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
3711 (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
3712
3713 if (!is_quantifier && previous_callout != NULL &&
3714 after_manual_callout-- <= 0)
3715 {
3716 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3717 complete_callout(previous_callout, ptr, cd);
3718 previous_callout = NULL;
3719 }
3720
3721 /* In extended mode, skip white space and comments. */
3722
3723 if ((options & PCRE_EXTENDED) != 0)
3724 {
3725 if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;
3726 if (c == CHAR_NUMBER_SIGN)
3727 {
3728 ptr++;
3729 while (*ptr != CHAR_NULL)
3730 {
3731 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3732 ptr++;
3733 #ifdef SUPPORT_UTF
3734 if (utf) FORWARDCHAR(ptr);
3735 #endif
3736 }
3737 if (*ptr != CHAR_NULL) continue;
3738
3739 /* Else fall through to handle end of string */
3740 c = 0;
3741 }
3742 }
3743
3744 /* No auto callout for quantifiers. */
3745
3746 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
3747 {
3748 previous_callout = code;
3749 code = auto_callout(code, ptr, cd);
3750 }
3751
3752 switch(c)
3753 {
3754 /* ===================================================================*/
3755 case 0: /* The branch terminates at string end */
3756 case CHAR_VERTICAL_LINE: /* or | or ) */
3757 case CHAR_RIGHT_PARENTHESIS:
3758 *firstcharptr = firstchar;
3759 *firstcharflagsptr = firstcharflags;
3760 *reqcharptr = reqchar;
3761 *reqcharflagsptr = reqcharflags;
3762 *codeptr = code;
3763 *ptrptr = ptr;
3764 if (lengthptr != NULL)
3765 {
3766 if (OFLOW_MAX - *lengthptr < code - last_code)
3767 {
3768 *errorcodeptr = ERR20;
3769 goto FAILED;
3770 }
3771 *lengthptr += (int)(code - last_code); /* To include callout length */
3772 DPRINTF((">> end branch\n"));
3773 }
3774 return TRUE;
3775
3776
3777 /* ===================================================================*/
3778 /* Handle single-character metacharacters. In multiline mode, ^ disables
3779 the setting of any following char as a first character. */
3780
3781 case CHAR_CIRCUMFLEX_ACCENT:
3782 previous = NULL;
3783 if ((options & PCRE_MULTILINE) != 0)
3784 {
3785 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
3786 *code++ = OP_CIRCM;
3787 }
3788 else *code++ = OP_CIRC;
3789 break;
3790
3791 case CHAR_DOLLAR_SIGN:
3792 previous = NULL;
3793 *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
3794 break;
3795
3796 /* There can never be a first char if '.' is first, whatever happens about
3797 repeats. The value of reqchar doesn't change either. */
3798
3799 case CHAR_DOT:
3800 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
3801 zerofirstchar = firstchar;
3802 zerofirstcharflags = firstcharflags;
3803 zeroreqchar = reqchar;
3804 zeroreqcharflags = reqcharflags;
3805 previous = code;
3806 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3807 break;
3808
3809
3810 /* ===================================================================*/
3811 /* Character classes. If the included characters are all < 256, we build a
3812 32-byte bitmap of the permitted characters, except in the special case
3813 where there is only one such character. For negated classes, we build the
3814 map as usual, then invert it at the end. However, we use a different opcode
3815 so that data characters > 255 can be handled correctly.
3816
3817 If the class contains characters outside the 0-255 range, a different
3818 opcode is compiled. It may optionally have a bit map for characters < 256,
3819 but those above are explicitly listed afterwards. A flag byte tells
3820 whether the bitmap is present, and whether this is a negated class or not.
3821
3822 In JavaScript compatibility mode, an isolated ']' causes an error. In
3823 default (Perl) mode, it is treated as a data character. */
3824
3825 case CHAR_RIGHT_SQUARE_BRACKET:
3826 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3827 {
3828 *errorcodeptr = ERR64;
3829 goto FAILED;
3830 }
3831 goto NORMAL_CHAR;
3832
3833 case CHAR_LEFT_SQUARE_BRACKET:
3834 previous = code;
3835
3836 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3837 they are encountered at the top level, so we'll do that too. */
3838
3839 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3840 ptr[1] == CHAR_EQUALS_SIGN) &&
3841 check_posix_syntax(ptr, &tempptr))
3842 {
3843 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3844 goto FAILED;
3845 }
3846
3847 /* If the first character is '^', set the negation flag and skip it. Also,
3848 if the first few characters (either before or after ^) are \Q\E or \E we
3849 skip them too. This makes for compatibility with Perl. */
3850
3851 negate_class = FALSE;
3852 for (;;)
3853 {
3854 c = *(++ptr);
3855 if (c == CHAR_BACKSLASH)
3856 {
3857 if (ptr[1] == CHAR_E)
3858 ptr++;
3859 else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3860 ptr += 3;
3861 else
3862 break;
3863 }
3864 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3865 negate_class = TRUE;
3866 else break;
3867 }
3868
3869 /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3870 an initial ']' is taken as a data character -- the code below handles
3871 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3872 [^] must match any character, so generate OP_ALLANY. */
3873
3874 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3875 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3876 {
3877 *code++ = negate_class? OP_ALLANY : OP_FAIL;
3878 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
3879 zerofirstchar = firstchar;
3880 zerofirstcharflags = firstcharflags;
3881 break;
3882 }
3883
3884 /* If a class contains a negative special such as \S, we need to flip the
3885 negation flag at the end, so that support for characters > 255 works
3886 correctly (they are all included in the class). */
3887
3888 should_flip_negation = FALSE;
3889
3890 /* For optimization purposes, we track some properties of the class:
3891 class_has_8bitchar will be non-zero if the class contains at least one <
3892 256 character; class_one_char will be 1 if the class contains just one
3893 character. */
3894
3895 class_has_8bitchar = 0;
3896 class_one_char = 0;
3897
3898 /* Initialize the 32-char bit map to all zeros. We build the map in a
3899 temporary bit of memory, in case the class contains fewer than two
3900 8-bit characters because in that case the compiled code doesn't use the bit
3901 map. */
3902
3903 memset(classbits, 0, 32 * sizeof(pcre_uint8));
3904
3905 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3906 xclass = FALSE;
3907 class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
3908 class_uchardata_base = class_uchardata; /* Save the start */
3909 #endif
3910
3911 /* Process characters until ] is reached. By writing this as a "do" it
3912 means that an initial ] is taken as a data character. At the start of the
3913 loop, c contains the first byte of the character. */
3914
3915 if (c != CHAR_NULL) do
3916 {
3917 const pcre_uchar *oldptr;
3918
3919 #ifdef SUPPORT_UTF
3920 if (utf && HAS_EXTRALEN(c))
3921 { /* Braces are required because the */
3922 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3923 }
3924 #endif
3925
3926 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3927 /* In the pre-compile phase, accumulate the length of any extra
3928 data and reset the pointer. This is so that very large classes that
3929 contain a zillion > 255 characters no longer overwrite the work space
3930 (which is on the stack). We have to remember that there was XCLASS data,
3931 however. */
3932
3933 if (lengthptr != NULL && class_uchardata > class_uchardata_base)
3934 {
3935 xclass = TRUE;
3936 *lengthptr += class_uchardata - class_uchardata_base;
3937 class_uchardata = class_uchardata_base;
3938 }
3939 #endif
3940
3941 /* Inside \Q...\E everything is literal except \E */
3942
3943 if (inescq)
3944 {
3945 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3946 {
3947 inescq = FALSE; /* Reset literal state */
3948 ptr++; /* Skip the 'E' */
3949 continue; /* Carry on with next */
3950 }
3951 goto CHECK_RANGE; /* Could be range if \E follows */
3952 }
3953
3954 /* Handle POSIX class names. Perl allows a negation extension of the
3955 form [:^name:]. A square bracket that doesn't match the syntax is
3956 treated as a literal. We also recognize the POSIX constructions
3957 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3958 5.6 and 5.8 do. */
3959
3960 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3961 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3962 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3963 {
3964 BOOL local_negate = FALSE;
3965 int posix_class, taboffset, tabopt;
3966 register const pcre_uint8 *cbits = cd->cbits;
3967 pcre_uint8 pbits[32];
3968
3969 if (ptr[1] != CHAR_COLON)
3970 {
3971 *errorcodeptr = ERR31;
3972 goto FAILED;
3973 }
3974
3975 ptr += 2;
3976 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3977 {
3978 local_negate = TRUE;
3979 should_flip_negation = TRUE; /* Note negative special */
3980 ptr++;
3981 }
3982
3983 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3984 if (posix_class < 0)
3985 {
3986 *errorcodeptr = ERR30;
3987 goto FAILED;
3988 }
3989
3990 /* If matching is caseless, upper and lower are converted to
3991 alpha. This relies on the fact that the class table starts with
3992 alpha, lower, upper as the first 3 entries. */
3993
3994 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3995 posix_class = 0;
3996
3997 /* When PCRE_UCP is set, some of the POSIX classes are converted to
3998 different escape sequences that use Unicode properties. */
3999
4000 #ifdef SUPPORT_UCP
4001 if ((options & PCRE_UCP) != 0)
4002 {
4003 int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
4004 if (posix_substitutes[pc] != NULL)
4005 {
4006 nestptr = tempptr + 1;
4007 ptr = posix_substitutes[pc] - 1;
4008 continue;
4009 }
4010 }
4011 #endif
4012 /* In the non-UCP case, we build the bit map for the POSIX class in a
4013 chunk of local store because we may be adding and subtracting from it,
4014 and we don't want to subtract bits that may be in the main map already.
4015 At the end we or the result into the bit map that is being built. */
4016
4017 posix_class *= 3;
4018
4019 /* Copy in the first table (always present) */
4020
4021 memcpy(pbits, cbits + posix_class_maps[posix_class],
4022 32 * sizeof(pcre_uint8));
4023
4024 /* If there is a second table, add or remove it as required. */
4025
4026 taboffset = posix_class_maps[posix_class + 1];
4027 tabopt = posix_class_maps[posix_class + 2];
4028
4029 if (taboffset >= 0)
4030 {
4031 if (tabopt >= 0)
4032 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
4033 else
4034 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
4035 }
4036
4037 /* Now see if we need to remove any special characters. An option
4038 value of 1 removes vertical space and 2 removes underscore. */
4039
4040 if (tabopt < 0) tabopt = -tabopt;
4041 if (tabopt == 1) pbits[1] &= ~0x3c;
4042 else if (tabopt == 2) pbits[11] &= 0x7f;
4043
4044 /* Add the POSIX table or its complement into the main table that is
4045 being built and we are done. */
4046
4047 if (local_negate)
4048 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
4049 else
4050 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
4051
4052 ptr = tempptr + 1;
4053 /* Every class contains at least one < 256 character. */
4054 class_has_8bitchar = 1;
4055 /* Every class contains at least two characters. */
4056 class_one_char = 2;
4057 continue; /* End of POSIX syntax handling */
4058 }
4059
4060 /* Backslash may introduce a single character, or it may introduce one
4061 of the specials, which just set a flag. The sequence \b is a special
4062 case. Inside a class (and only there) it is treated as backspace. We
4063 assume that other escapes have more than one character in them, so
4064 speculatively set both class_has_8bitchar and class_one_char bigger
4065 than one. Unrecognized escapes fall through and are either treated
4066 as literal characters (by default), or are faulted if
4067 PCRE_EXTRA is set. */
4068
4069 if (c == CHAR_BACKSLASH)
4070 {
4071 escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
4072 TRUE);
4073 if (*errorcodeptr != 0) goto FAILED;
4074 if (escape == 0) c = ec;
4075 else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
4076 else if (escape == ESC_N) /* \N is not supported in a class */
4077 {
4078 *errorcodeptr = ERR71;
4079 goto FAILED;
4080 }
4081 else if (escape == ESC_Q) /* Handle start of quoted string */
4082 {
4083 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4084 {
4085 ptr += 2; /* avoid empty string */
4086 }
4087 else inescq = TRUE;
4088 continue;
4089 }
4090 else if (escape == ESC_E) continue; /* Ignore orphan \E */
4091
4092 else
4093 {
4094 register const pcre_uint8 *cbits = cd->cbits;
4095 /* Every class contains at least two < 256 characters. */
4096 class_has_8bitchar++;
4097 /* Every class contains at least two characters. */
4098 class_one_char += 2;
4099
4100 switch (escape)
4101 {
4102 #ifdef SUPPORT_UCP
4103 case ESC_du: /* These are the values given for \d etc */
4104 case ESC_DU: /* when PCRE_UCP is set. We replace the */
4105 case ESC_wu: /* escape sequence with an appropriate \p */
4106 case ESC_WU: /* or \P to test Unicode properties instead */
4107 case ESC_su: /* of the default ASCII testing. */
4108 case ESC_SU:
4109 nestptr = ptr;
4110 ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */
4111 class_has_8bitchar--; /* Undo! */
4112 continue;
4113 #endif
4114 case ESC_d:
4115 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
4116 continue;
4117
4118 case ESC_D:
4119 should_flip_negation = TRUE;
4120 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
4121 continue;
4122
4123 case ESC_w:
4124 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
4125 continue;
4126
4127 case ESC_W:
4128 should_flip_negation = TRUE;
4129 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
4130 continue;
4131
4132 /* Perl 5.004 onwards omits VT from \s, but we must preserve it
4133 if it was previously set by something earlier in the character
4134 class. Luckily, the value of CHAR_VT is 0x0b in both ASCII and
4135 EBCDIC, so we lazily just adjust the appropriate bit. */
4136
4137 case ESC_s:
4138 classbits[0] |= cbits[cbit_space];
4139 classbits[1] |= cbits[cbit_space+1] & ~0x08;
4140 for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
4141 continue;
4142
4143 case ESC_S:
4144 should_flip_negation = TRUE;
4145 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
4146 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
4147 continue;
4148
4149 /* The rest apply in both UCP and non-UCP cases. */
4150
4151 case ESC_h:
4152 (void)add_list_to_class(classbits, &class_uchardata, options, cd,
4153 PRIV(hspace_list), NOTACHAR);
4154 continue;
4155
4156 case ESC_H:
4157 (void)add_not_list_to_class(classbits, &class_uchardata, options,
4158 cd, PRIV(hspace_list));
4159 continue;
4160
4161 case ESC_v:
4162 (void)add_list_to_class(classbits, &class_uchardata, options, cd,
4163 PRIV(vspace_list), NOTACHAR);
4164 continue;
4165
4166 case ESC_V:
4167 (void)add_not_list_to_class(classbits, &class_uchardata, options,
4168 cd, PRIV(vspace_list));
4169 continue;
4170
4171 #ifdef SUPPORT_UCP
4172 case ESC_p:
4173 case ESC_P:
4174 {
4175 BOOL negated;
4176 unsigned int ptype = 0, pdata = 0;
4177 if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
4178 goto FAILED;
4179 *class_uchardata++ = ((escape == ESC_p) != negated)?
4180 XCL_PROP : XCL_NOTPROP;
4181 *class_uchardata++ = ptype;
4182 *class_uchardata++ = pdata;
4183 class_has_8bitchar--; /* Undo! */
4184 continue;
4185 }
4186 #endif
4187 /* Unrecognized escapes are faulted if PCRE is running in its
4188 strict mode. By default, for compatibility with Perl, they are
4189 treated as literals. */
4190
4191 default:
4192 if ((options & PCRE_EXTRA) != 0)
4193 {
4194 *errorcodeptr = ERR7;
4195 goto FAILED;
4196 }
4197 class_has_8bitchar--; /* Undo the speculative increase. */
4198 class_one_char -= 2; /* Undo the speculative increase. */
4199 c = *ptr; /* Get the final character and fall through */
4200 break;
4201 }
4202 }
4203
4204 /* Fall through if the escape just defined a single character (c >= 0).
4205 This may be greater than 256. */
4206
4207 escape = 0;
4208
4209 } /* End of backslash handling */
4210
4211 /* A character may be followed by '-' to form a range. However, Perl does
4212 not permit ']' to be the end of the range. A '-' character at the end is
4213 treated as a literal. Perl ignores orphaned \E sequences entirely. The
4214 code for handling \Q and \E is messy. */
4215
4216 CHECK_RANGE:
4217 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4218 {
4219 inescq = FALSE;
4220 ptr += 2;
4221 }
4222 oldptr = ptr;
4223
4224 /* Remember if \r or \n were explicitly used */
4225
4226 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4227
4228 /* Check for range */
4229
4230 if (!inescq && ptr[1] == CHAR_MINUS)
4231 {
4232 pcre_uint32 d;
4233 ptr += 2;
4234 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
4235
4236 /* If we hit \Q (not followed by \E) at this point, go into escaped
4237 mode. */
4238
4239 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
4240 {
4241 ptr += 2;
4242 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4243 { ptr += 2; continue; }
4244 inescq = TRUE;
4245 break;
4246 }
4247
4248 /* Minus (hyphen) at the end of a class is treated as a literal, so put
4249 back the pointer and jump to handle the character that preceded it. */
4250
4251 if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
4252 {
4253 ptr = oldptr;
4254 goto CLASS_SINGLE_CHARACTER;
4255 }
4256
4257 /* Otherwise, we have a potential range; pick up the next character */
4258
4259 #ifdef SUPPORT_UTF
4260 if (utf)
4261 { /* Braces are required because the */
4262 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
4263 }
4264 else
4265 #endif
4266 d = *ptr; /* Not UTF-8 mode */
4267
4268 /* The second part of a range can be a single-character escape, but
4269 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
4270 in such circumstances. */
4271
4272 if (!inescq && d == CHAR_BACKSLASH)
4273 {
4274 int descape;
4275 descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
4276 if (*errorcodeptr != 0) goto FAILED;
4277
4278 /* \b is backspace; any other special means the '-' was literal. */
4279
4280 if (descape != 0)
4281 {
4282 if (descape == ESC_b) d = CHAR_BS; else
4283 {
4284 ptr = oldptr;
4285 goto CLASS_SINGLE_CHARACTER; /* A few lines below */
4286 }
4287 }
4288 }
4289
4290 /* Check that the two values are in the correct order. Optimize
4291 one-character ranges. */
4292
4293 if (d < c)
4294 {
4295 *errorcodeptr = ERR8;
4296 goto FAILED;
4297 }
4298 if (d == c) goto CLASS_SINGLE_CHARACTER; /* A few lines below */
4299
4300 /* We have found a character range, so single character optimizations
4301 cannot be done anymore. Any value greater than 1 indicates that there
4302 is more than one character. */
4303
4304 class_one_char = 2;
4305
4306 /* Remember an explicit \r or \n, and add the range to the class. */
4307
4308 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4309
4310 class_has_8bitchar +=
4311 add_to_class(classbits, &class_uchardata, options, cd, c, d);
4312
4313 continue; /* Go get the next char in the class */
4314 }
4315
4316 /* Handle a single character - we can get here for a normal non-escape
4317 char, or after \ that introduces a single character or for an apparent
4318 range that isn't. Only the value 1 matters for class_one_char, so don't
4319 increase it if it is already 2 or more ... just in case there's a class
4320 with a zillion characters in it. */
4321
4322 CLASS_SINGLE_CHARACTER:
4323 if (class_one_char < 2) class_one_char++;
4324
4325 /* If class_one_char is 1, we have the first single character in the
4326 class, and there have been no prior ranges, or XCLASS items generated by
4327 escapes. If this is the final character in the class, we can optimize by
4328 turning the item into a 1-character OP_CHAR[I] if it's positive, or
4329 OP_NOT[I] if it's negative. In the positive case, it can cause firstchar
4330 to be set. Otherwise, there can be no first char if this item is first,
4331 whatever repeat count may follow. In the case of reqchar, save the
4332 previous value for reinstating. */
4333
4334 if (class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
4335 {
4336 ptr++;
4337 zeroreqchar = reqchar;
4338 zeroreqcharflags = reqcharflags;
4339
4340 if (negate_class)
4341 {
4342 #ifdef SUPPORT_UCP
4343 int d;
4344 #endif
4345 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4346 zerofirstchar = firstchar;
4347 zerofirstcharflags = firstcharflags;
4348
4349 /* For caseless UTF-8 mode when UCP support is available, check
4350 whether this character has more than one other case. If so, generate
4351 a special OP_NOTPROP item instead of OP_NOTI. */
4352
4353 #ifdef SUPPORT_UCP
4354 if (utf && (options & PCRE_CASELESS) != 0 &&
4355 (d = UCD_CASESET(c)) != 0)
4356 {
4357 *code++ = OP_NOTPROP;
4358 *code++ = PT_CLIST;
4359 *code++ = d;
4360 }
4361 else
4362 #endif
4363 /* Char has only one other case, or UCP not available */
4364
4365 {
4366 *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4367 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
4368 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4369 code += PRIV(ord2utf)(c, code);
4370 else
4371 #endif
4372 *code++ = c;
4373 }
4374
4375 /* We are finished with this character class */
4376
4377 goto END_CLASS;
4378 }
4379
4380 /* For a single, positive character, get the value into mcbuffer, and
4381 then we can handle this with the normal one-character code. */
4382
4383 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
4384 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4385 mclength = PRIV(ord2utf)(c, mcbuffer);
4386 else
4387 #endif
4388 {
4389 mcbuffer[0] = c;
4390 mclength = 1;
4391 }
4392 goto ONE_CHAR;
4393 } /* End of 1-char optimization */
4394
4395 /* There is more than one character in the class, or an XCLASS item
4396 has been generated. Add this character to the class. */
4397
4398 class_has_8bitchar +=
4399 add_to_class(classbits, &class_uchardata, options, cd, c, c);
4400 }
4401
4402 /* Loop until ']' reached. This "while" is the end of the "do" far above.
4403 If we are at the end of an internal nested string, revert to the outer
4404 string. */
4405
4406 while (((c = *(++ptr)) != CHAR_NULL ||
4407 (nestptr != NULL &&
4408 (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
4409 (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
4410
4411 /* Check for missing terminating ']' */
4412
4413 if (c == CHAR_NULL)
4414 {
4415 *errorcodeptr = ERR6;
4416 goto FAILED;
4417 }
4418
4419 /* We will need an XCLASS if data has been placed in class_uchardata. In
4420 the second phase this is a sufficient test. However, in the pre-compile
4421 phase, class_uchardata gets emptied to prevent workspace overflow, so
4422 only if the very last character in the class needs XCLASS will it contain
4423 anything at this point. For this reason, xclass gets set TRUE above when
4424 class_uchardata is emptied, and that's why this code is the way it is here
4425 instead of just doing a test on class_uchardata below. */
4426
4427 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4428 if (class_uchardata > class_uchardata_base) xclass = TRUE;
4429 #endif
4430
4431 /* If this is the first thing in the branch, there can be no first char
4432 setting, whatever the repeat count. Any reqchar setting must remain
4433 unchanged after any kind of repeat. */
4434
4435 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4436 zerofirstchar = firstchar;
4437 zerofirstcharflags = firstcharflags;
4438 zeroreqchar = reqchar;
4439 zeroreqcharflags = reqcharflags;
4440
4441 /* If there are characters with values > 255, we have to compile an
4442 extended class, with its own opcode, unless there was a negated special
4443 such as \S in the class, and PCRE_UCP is not set, because in that case all
4444 characters > 255 are in the class, so any that were explicitly given as
4445 well can be ignored. If (when there are explicit characters > 255 that must
4446 be listed) there are no characters < 256, we can omit the bitmap in the
4447 actual compiled code. */
4448
4449 #ifdef SUPPORT_UTF
4450 if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
4451 #elif !defined COMPILE_PCRE8
4452 if (xclass && !should_flip_negation)
4453 #endif
4454 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4455 {
4456 *class_uchardata++ = XCL_END; /* Marks the end of extra data */
4457 *code++ = OP_XCLASS;
4458 code += LINK_SIZE;
4459 *code = negate_class? XCL_NOT:0;
4460
4461 /* If the map is required, move up the extra data to make room for it;
4462 otherwise just move the code pointer to the end of the extra data. */
4463
4464 if (class_has_8bitchar > 0)
4465 {
4466 *code++ |= XCL_MAP;
4467 memmove(code + (32 / sizeof(pcre_uchar)), code,
4468 IN_UCHARS(class_uchardata - code));
4469 memcpy(code, classbits, 32);
4470 code = class_uchardata + (32 / sizeof(pcre_uchar));
4471 }
4472 else code = class_uchardata;
4473
4474 /* Now fill in the complete length of the item */
4475
4476 PUT(previous, 1, (int)(code - previous));
4477 break; /* End of class handling */
4478 }
4479 #endif
4480
4481 /* If there are no characters > 255, or they are all to be included or
4482 excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
4483 whole class was negated and whether there were negative specials such as \S
4484 (non-UCP) in the class. Then copy the 32-byte map into the code vector,
4485 negating it if necessary. */
4486
4487 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
4488 if (lengthptr == NULL) /* Save time in the pre-compile phase */
4489 {
4490 if (negate_class)
4491 for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
4492 memcpy(code, classbits, 32);
4493 }
4494 code += 32 / sizeof(pcre_uchar);
4495
4496 END_CLASS:
4497 break;
4498
4499
4500 /* ===================================================================*/
4501 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
4502 has been tested above. */
4503
4504 case CHAR_LEFT_CURLY_BRACKET:
4505 if (!is_quantifier) goto NORMAL_CHAR;
4506 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
4507 if (*errorcodeptr != 0) goto FAILED;
4508 goto REPEAT;
4509
4510 case CHAR_ASTERISK:
4511 repeat_min = 0;
4512 repeat_max = -1;
4513 goto REPEAT;
4514
4515 case CHAR_PLUS:
4516 repeat_min = 1;
4517 repeat_max = -1;
4518 goto REPEAT;
4519
4520 case CHAR_QUESTION_MARK:
4521 repeat_min = 0;
4522 repeat_max = 1;
4523
4524 REPEAT:
4525 if (previous == NULL)
4526 {
4527 *errorcodeptr = ERR9;
4528 goto FAILED;
4529 }
4530
4531 if (repeat_min == 0)
4532 {
4533 firstchar = zerofirstchar; /* Adjust for zero repeat */
4534 firstcharflags = zerofirstcharflags;
4535 reqchar = zeroreqchar; /* Ditto */
4536 reqcharflags = zeroreqcharflags;
4537 }
4538
4539 /* Remember whether this is a variable length repeat */
4540
4541 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
4542
4543 op_type = 0; /* Default single-char op codes */
4544 possessive_quantifier = FALSE; /* Default not possessive quantifier */
4545
4546 /* Save start of previous item, in case we have to move it up in order to
4547 insert something before it. */
4548
4549 tempcode = previous;
4550
4551 /* If the next character is '+', we have a possessive quantifier. This
4552 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
4553 If the next character is '?' this is a minimizing repeat, by default,
4554 but if PCRE_UNGREEDY is set, it works the other way round. We change the
4555 repeat type to the non-default. */
4556
4557 if (ptr[1] == CHAR_PLUS)
4558 {
4559 repeat_type = 0; /* Force greedy */
4560 possessive_quantifier = TRUE;
4561 ptr++;
4562 }
4563 else if (ptr[1] == CHAR_QUESTION_MARK)
4564 {
4565 repeat_type = greedy_non_default;
4566 ptr++;
4567 }
4568 else repeat_type = greedy_default;
4569
4570 /* If previous was a recursion call, wrap it in atomic brackets so that
4571 previous becomes the atomic group. All recursions were so wrapped in the
4572 past, but it no longer happens for non-repeated recursions. In fact, the
4573 repeated ones could be re-implemented independently so as not to need this,
4574 but for the moment we rely on the code for repeating groups. */
4575
4576 if (*previous == OP_RECURSE)
4577 {
4578 memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
4579 *previous = OP_ONCE;
4580 PUT(previous, 1, 2 + 2*LINK_SIZE);
4581 previous[2 + 2*LINK_SIZE] = OP_KET;
4582 PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
4583 code += 2 + 2 * LINK_SIZE;
4584 length_prevgroup = 3 + 3*LINK_SIZE;
4585
4586 /* When actually compiling, we need to check whether this was a forward
4587 reference, and if so, adjust the offset. */
4588
4589 if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
4590 {
4591 int offset = GET(cd->hwm, -LINK_SIZE);
4592 if (offset == previous + 1 - cd->start_code)
4593 PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
4594 }
4595 }
4596
4597 /* Now handle repetition for the different types of item. */
4598
4599 /* If previous was a character or negated character match, abolish the item
4600 and generate a repeat item instead. If a char item has a minimum of more
4601 than one, ensure that it is set in reqchar - it might not be if a sequence
4602 such as x{3} is the first thing in a branch because the x will have gone
4603 into firstchar instead. */
4604
4605 if (*previous == OP_CHAR || *previous == OP_CHARI
4606 || *previous == OP_NOT || *previous == OP_NOTI)
4607 {
4608 switch (*previous)
4609 {
4610 default: /* Make compiler happy. */
4611 case OP_CHAR: op_type = OP_STAR - OP_STAR; break;
4612 case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
4613 case OP_NOT: op_type = OP_NOTSTAR - OP_STAR; break;
4614 case OP_NOTI: op_type = OP_NOTSTARI - OP_STAR; break;
4615 }
4616
4617 /* Deal with UTF characters that take up more than one character. It's
4618 easier to write this out separately than try to macrify it. Use c to
4619 hold the length of the character in bytes, plus UTF_LENGTH to flag that
4620 it's a length rather than a small character. */
4621
4622 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
4623 if (utf && NOT_FIRSTCHAR(code[-1]))
4624 {
4625 pcre_uchar *lastchar = code - 1;
4626 BACKCHAR(lastchar);
4627 c = (int)(code - lastchar); /* Length of UTF-8 character */
4628 memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
4629 c |= UTF_LENGTH; /* Flag c as a length */
4630 }
4631 else
4632 #endif /* SUPPORT_UTF */
4633
4634 /* Handle the case of a single character - either with no UTF support, or
4635 with UTF disabled, or for a single character UTF character. */
4636 {
4637 c = code[-1];
4638 if (*previous <= OP_CHARI && repeat_min > 1)
4639 {
4640 reqchar = c;
4641 reqcharflags = req_caseopt | cd->req_varyopt;
4642 }
4643 }
4644
4645 /* If the repetition is unlimited, it pays to see if the next thing on
4646 the line is something that cannot possibly match this character. If so,
4647 automatically possessifying this item gains some performance in the case
4648 where the match fails. */
4649
4650 if (!possessive_quantifier &&
4651 repeat_max < 0 &&
4652 check_auto_possessive(previous, utf, ptr + 1, options, cd))
4653 {
4654 repeat_type = 0; /* Force greedy */
4655 possessive_quantifier = TRUE;
4656 }
4657
4658 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
4659 }
4660
4661 /* If previous was a character type match (\d or similar), abolish it and
4662 create a suitable repeat item. The code is shared with single-character
4663 repeats by setting op_type to add a suitable offset into repeat_type. Note
4664 that the Unicode property types will be present only when SUPPORT_UCP is
4665 defined, but we don't wrap the little bits of code here because it just
4666 makes it horribly messy. */
4667
4668 else if (*previous < OP_EODN)
4669 {
4670 pcre_uchar *oldcode;
4671 int prop_type, prop_value;
4672 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
4673 c = *previous;
4674
4675 if (!possessive_quantifier &&
4676 repeat_max < 0 &&
4677 check_auto_possessive(previous, utf, ptr + 1, options, cd))
4678 {
4679 repeat_type = 0; /* Force greedy */
4680 possessive_quantifier = TRUE;
4681 }
4682
4683 OUTPUT_SINGLE_REPEAT:
4684 if (*previous == OP_PROP || *previous == OP_NOTPROP)
4685 {
4686 prop_type = previous[1];
4687 prop_value = previous[2];
4688 }
4689 else prop_type = prop_value = -1;
4690
4691 oldcode = code;
4692 code = previous; /* Usually overwrite previous item */
4693
4694 /* If the maximum is zero then the minimum must also be zero; Perl allows
4695 this case, so we do too - by simply omitting the item altogether. */
4696
4697 if (repeat_max == 0) goto END_REPEAT;
4698
4699 /* Combine the op_type with the repeat_type */
4700
4701 repeat_type += op_type;
4702
4703 /* A minimum of zero is handled either as the special case * or ?, or as
4704 an UPTO, with the maximum given. */
4705
4706 if (repeat_min == 0)
4707 {
4708 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
4709 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
4710 else
4711 {
4712 *code++ = OP_UPTO + repeat_type;
4713 PUT2INC(code, 0, repeat_max);
4714 }
4715 }
4716
4717 /* A repeat minimum of 1 is optimized into some special cases. If the
4718 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
4719 left in place and, if the maximum is greater than 1, we use OP_UPTO with
4720 one less than the maximum. */
4721
4722 else if (repeat_min == 1)
4723 {
4724 if (repeat_max == -1)
4725 *code++ = OP_PLUS + repeat_type;
4726 else
4727 {
4728 code = oldcode; /* leave previous item in place */
4729 if (repeat_max == 1) goto END_REPEAT;
4730 *code++ = OP_UPTO + repeat_type;
4731 PUT2INC(code, 0, repeat_max - 1);
4732 }
4733 }
4734
4735 /* The case {n,n} is just an EXACT, while the general case {n,m} is
4736 handled as an EXACT followed by an UPTO. */
4737
4738 else
4739 {
4740 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
4741 PUT2INC(code, 0, repeat_min);
4742
4743 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
4744 we have to insert the character for the previous code. For a repeated
4745 Unicode property match, there are two extra bytes that define the
4746 required property. In UTF-8 mode, long characters have their length in
4747 c, with the UTF_LENGTH bit as a flag. */
4748
4749 if (repeat_max < 0)
4750 {
4751 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
4752 if (utf && (c & UTF_LENGTH) != 0)
4753 {
4754 memcpy(code, utf_chars, IN_UCHARS(c & 7));
4755 code += c & 7;
4756 }
4757 else
4758 #endif
4759 {
4760 *code++ = c;
4761 if (prop_type >= 0)
4762 {
4763 *code++ = prop_type;
4764 *code++ = prop_value;
4765 }
4766 }
4767 *code++ = OP_STAR + repeat_type;
4768 }
4769
4770 /* Else insert an UPTO if the max is greater than the min, again
4771 preceded by the character, for the previously inserted code. If the
4772 UPTO is just for 1 instance, we can use QUERY instead. */
4773
4774 else if (repeat_max != repeat_min)
4775 {
4776 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
4777 if (utf && (c & UTF_LENGTH) != 0)
4778 {
4779 memcpy(code, utf_chars, IN_UCHARS(c & 7));
4780 code += c & 7;
4781 }
4782 else
4783 #endif
4784 *code++ = c;
4785 if (prop_type >= 0)
4786 {
4787 *code++ = prop_type;
4788 *code++ = prop_value;
4789 }
4790 repeat_max -= repeat_min;
4791
4792 if (repeat_max == 1)
4793 {
4794 *code++ = OP_QUERY + repeat_type;
4795 }
4796 else
4797 {
4798 *code++ = OP_UPTO + repeat_type;
4799 PUT2INC(code, 0, repeat_max);
4800 }
4801 }
4802 }
4803
4804 /* The character or character type itself comes last in all cases. */
4805
4806 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
4807 if (utf && (c & UTF_LENGTH) != 0)
4808 {
4809 memcpy(code, utf_chars, IN_UCHARS(c & 7));
4810 code += c & 7;
4811 }
4812 else
4813 #endif
4814 *code++ = c;
4815
4816 /* For a repeated Unicode property match, there are two extra bytes that
4817 define the required property. */
4818
4819 #ifdef SUPPORT_UCP
4820 if (prop_type >= 0)
4821 {
4822 *code++ = prop_type;
4823 *code++ = prop_value;
4824 }
4825 #endif
4826 }
4827
4828 /* If previous was a character class or a back reference, we put the repeat
4829 stuff after it, but just skip the item if the repeat was {0,0}. */
4830
4831 else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
4832 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4833 *previous == OP_XCLASS ||
4834 #endif
4835 *previous == OP_REF || *previous == OP_REFI ||
4836 *previous == OP_DNREF || *previous == OP_DNREFI)
4837 {
4838 if (repeat_max == 0)
4839 {
4840 code = previous;
4841 goto END_REPEAT;
4842 }
4843
4844 if (repeat_min == 0 && repeat_max == -1)
4845 *code++ = OP_CRSTAR + repeat_type;
4846 else if (repeat_min == 1 && repeat_max == -1)
4847 *code++ = OP_CRPLUS + repeat_type;
4848 else if (repeat_min == 0 && repeat_max == 1)
4849 *code++ = OP_CRQUERY + repeat_type;
4850 else
4851 {
4852 *code++ = OP_CRRANGE + repeat_type;
4853 PUT2INC(code, 0, repeat_min);
4854 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
4855 PUT2INC(code, 0, repeat_max);
4856 }
4857 }
4858
4859 /* If previous was a bracket group, we may have to replicate it in certain
4860 cases. Note that at this point we can encounter only the "basic" bracket
4861 opcodes such as BRA and CBRA, as this is the place where they get converted
4862 into the more special varieties such as BRAPOS and SBRA. A test for >=
4863 OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
4864 ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow
4865 repetition of assertions, but now it does, for Perl compatibility. */
4866
4867 else if (*previous >= OP_ASSERT && *previous <= OP_COND)
4868 {
4869 register int i;
4870 int len = (int)(code - previous);
4871 pcre_uchar *bralink = NULL;
4872 pcre_uchar *brazeroptr = NULL;
4873
4874 /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
4875 we just ignore the repeat. */
4876
4877 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
4878 goto END_REPEAT;
4879
4880 /* There is no sense in actually repeating assertions. The only potential
4881 use of repetition is in cases when the assertion is optional. Therefore,
4882 if the minimum is greater than zero, just ignore the repeat. If the
4883 maximum is not zero or one, set it to 1. */
4884
4885 if (*previous < OP_ONCE) /* Assertion */
4886 {
4887 if (repeat_min > 0) goto END_REPEAT;
4888 if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
4889 }
4890
4891 /* The case of a zero minimum is special because of the need to stick
4892 OP_BRAZERO in front of it, and because the group appears once in the
4893 data, whereas in other cases it appears the minimum number of times. For
4894 this reason, it is simplest to treat this case separately, as otherwise
4895 the code gets far too messy. There are several special subcases when the
4896 minimum is zero. */
4897
4898 if (repeat_min == 0)
4899 {
4900 /* If the maximum is also zero, we used to just omit the group from the
4901 output altogether, like this:
4902
4903 ** if (repeat_max == 0)
4904 ** {
4905 ** code = previous;
4906 ** goto END_REPEAT;
4907 ** }
4908
4909 However, that fails when a group or a subgroup within it is referenced
4910 as a subroutine from elsewhere in the pattern, so now we stick in
4911 OP_SKIPZERO in front of it so that it is skipped on execution. As we
4912 don't have a list of which groups are referenced, we cannot do this
4913 selectively.
4914
4915 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
4916 and do no more at this point. However, we do need to adjust any
4917 OP_RECURSE calls inside the group that refer to the group itself or any
4918 internal or forward referenced group, because the offset is from the
4919 start of the whole regex. Temporarily terminate the pattern while doing
4920 this. */
4921
4922 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
4923 {
4924 *code = OP_END;
4925 adjust_recurse(previous, 1, utf, cd, save_hwm);
4926 memmove(previous + 1, previous, IN_UCHARS(len));
4927 code++;
4928 if (repeat_max == 0)
4929 {
4930 *previous++ = OP_SKIPZERO;
4931 goto END_REPEAT;
4932 }
4933 brazeroptr = previous; /* Save for possessive optimizing */
4934 *previous++ = OP_BRAZERO + repeat_type;
4935 }
4936
4937 /* If the maximum is greater than 1 and limited, we have to replicate
4938 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
4939 The first one has to be handled carefully because it's the original
4940 copy, which has to be moved up. The remainder can be handled by code
4941 that is common with the non-zero minimum case below. We have to
4942 adjust the value of repeat_max, since one less copy is required. Once
4943 again, we may have to adjust any OP_RECURSE calls inside the group. */
4944
4945 else
4946 {
4947 int offset;
4948 *code = OP_END;
4949 adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);
4950 memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
4951 code += 2 + LINK_SIZE;
4952 *previous++ = OP_BRAZERO + repeat_type;
4953 *previous++ = OP_BRA;
4954
4955 /* We chain together the bracket offset fields that have to be
4956 filled in later when the ends of the brackets are reached. */
4957
4958 offset = (bralink == NULL)? 0 : (int)(previous - bralink);
4959 bralink = previous;
4960 PUTINC(previous, 0, offset);
4961 }
4962
4963 repeat_max--;
4964 }
4965
4966 /* If the minimum is greater than zero, replicate the group as many
4967 times as necessary, and adjust the maximum to the number of subsequent
4968 copies that we need. If we set a first char from the group, and didn't
4969 set a required char, copy the latter from the former. If there are any
4970 forward reference subroutine calls in the group, there will be entries on
4971 the workspace list; replicate these with an appropriate increment. */
4972
4973 else
4974 {
4975 if (repeat_min > 1)
4976 {
4977 /* In the pre-compile phase, we don't actually do the replication. We
4978 just adjust the length as if we had. Do some paranoid checks for
4979 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
4980 integer type when available, otherwise double. */
4981
4982 if (lengthptr != NULL)
4983 {
4984 int delta = (repeat_min - 1)*length_prevgroup;
4985 if ((INT64_OR_DOUBLE)(repeat_min - 1)*
4986 (INT64_OR_DOUBLE)length_prevgroup >
4987 (INT64_OR_DOUBLE)INT_MAX ||
4988 OFLOW_MAX - *lengthptr < delta)
4989 {
4990 *errorcodeptr = ERR20;
4991 goto FAILED;
4992 }
4993 *lengthptr += delta;
4994 }
4995
4996 /* This is compiling for real. If there is a set first byte for
4997 the group, and we have not yet set a "required byte", set it. Make
4998 sure there is enough workspace for copying forward references before
4999 doing the copy. */
5000
5001 else
5002 {
5003 if (groupsetfirstchar && reqcharflags < 0)
5004 {
5005 reqchar = firstchar;
5006 reqcharflags = firstcharflags;
5007 }
5008
5009 for (i = 1; i < repeat_min; i++)
5010 {
5011 pcre_uchar *hc;
5012 pcre_uchar *this_hwm = cd->hwm;
5013 memcpy(code, previous, IN_UCHARS(len));
5014
5015 while (cd->hwm > cd->start_workspace + cd->workspace_size -
5016 WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5017 {
5018 int save_offset = save_hwm - cd->start_workspace;
5019 int this_offset = this_hwm - cd->start_workspace;
5020 *errorcodeptr = expand_workspace(cd);
5021 if (*errorcodeptr != 0) goto FAILED;
5022 save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5023 this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5024 }
5025
5026 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5027 {
5028 PUT(cd->hwm, 0, GET(hc, 0) + len);
5029 cd->hwm += LINK_SIZE;
5030 }
5031 save_hwm = this_hwm;
5032 code += len;
5033 }
5034 }
5035 }
5036
5037 if (repeat_max > 0) repeat_max -= repeat_min;
5038 }
5039
5040 /* This code is common to both the zero and non-zero minimum cases. If
5041 the maximum is limited, it replicates the group in a nested fashion,
5042 remembering the bracket starts on a stack. In the case of a zero minimum,
5043 the first one was set up above. In all cases the repeat_max now specifies
5044 the number of additional copies needed. Again, we must remember to
5045 replicate entries on the forward reference list. */
5046
5047 if (repeat_max >= 0)
5048 {
5049 /* In the pre-compile phase, we don't actually do the replication. We
5050 just adjust the length as if we had. For each repetition we must add 1
5051 to the length for BRAZERO and for all but the last repetition we must
5052 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
5053 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
5054 a 64-bit integer type when available, otherwise double. */
5055
5056 if (lengthptr != NULL && repeat_max > 0)
5057 {
5058 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
5059 2 - 2*LINK_SIZE; /* Last one doesn't nest */
5060 if ((INT64_OR_DOUBLE)repeat_max *
5061 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
5062 > (INT64_OR_DOUBLE)INT_MAX ||
5063 OFLOW_MAX - *lengthptr < delta)
5064 {
5065 *errorcodeptr = ERR20;
5066 goto FAILED;
5067 }
5068 *lengthptr += delta;
5069 }
5070
5071 /* This is compiling for real */
5072
5073 else for (i = repeat_max - 1; i >= 0; i--)
5074 {
5075 pcre_uchar *hc;
5076 pcre_uchar *this_hwm = cd->hwm;
5077
5078 *code++ = OP_BRAZERO + repeat_type;
5079
5080 /* All but the final copy start a new nesting, maintaining the
5081 chain of brackets outstanding. */
5082
5083 if (i != 0)
5084 {
5085 int offset;
5086 *code++ = OP_BRA;
5087 offset = (bralink == NULL)? 0 : (int)(code - bralink);
5088 bralink = code;
5089 PUTINC(code, 0, offset);
5090 }
5091
5092 memcpy(code, previous, IN_UCHARS(len));
5093
5094 /* Ensure there is enough workspace for forward references before
5095 copying them. */
5096
5097 while (cd->hwm > cd->start_workspace + cd->workspace_size -
5098 WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5099 {
5100 int save_offset = save_hwm - cd->start_workspace;
5101 int this_offset = this_hwm - cd->start_workspace;
5102 *errorcodeptr = expand_workspace(cd);
5103 if (*errorcodeptr != 0) goto FAILED;
5104 save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5105 this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5106 }
5107
5108 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5109 {
5110 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
5111 cd->hwm += LINK_SIZE;
5112 }
5113 save_hwm = this_hwm;
5114 code += len;
5115 }
5116
5117 /* Now chain through the pending brackets, and fill in their length
5118 fields (which are holding the chain links pro tem). */
5119
5120 while (bralink != NULL)
5121 {
5122 int oldlinkoffset;
5123 int offset = (int)(code - bralink + 1);
5124 pcre_uchar *bra = code - offset;
5125 oldlinkoffset = GET(bra, 1);
5126 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
5127 *code++ = OP_KET;
5128 PUTINC(code, 0, offset);
5129 PUT(bra, 1, offset);
5130 }
5131 }
5132
5133 /* If the maximum is unlimited, set a repeater in the final copy. For
5134 ONCE brackets, that's all we need to do. However, possessively repeated
5135 ONCE brackets can be converted into non-capturing brackets, as the
5136 behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
5137 deal with possessive ONCEs specially.
5138
5139 Otherwise, when we are doing the actual compile phase, check to see
5140 whether this group is one that could match an empty string. If so,
5141 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
5142 that runtime checking can be done. [This check is also applied to ONCE
5143 groups at runtime, but in a different way.]
5144
5145 Then, if the quantifier was possessive and the bracket is not a
5146 conditional, we convert the BRA code to the POS form, and the KET code to
5147 KETRPOS. (It turns out to be convenient at runtime to detect this kind of
5148 subpattern at both the start and at the end.) The use of special opcodes
5149 makes it possible to reduce greatly the stack usage in pcre_exec(). If
5150 the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
5151
5152 Then, if the minimum number of matches is 1 or 0, cancel the possessive
5153 flag so that the default action below, of wrapping everything inside
5154 atomic brackets, does not happen. When the minimum is greater than 1,
5155 there will be earlier copies of the group, and so we still have to wrap
5156 the whole thing. */
5157
5158 else
5159 {
5160 pcre_uchar *ketcode = code - 1 - LINK_SIZE;
5161 pcre_uchar *bracode = ketcode - GET(ketcode, 1);
5162
5163 /* Convert possessive ONCE brackets to non-capturing */
5164
5165 if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
5166 possessive_quantifier) *bracode = OP_BRA;
5167
5168 /* For non-possessive ONCE brackets, all we need to do is to
5169 set the KET. */
5170
5171 if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
5172 *ketcode = OP_KETRMAX + repeat_type;
5173
5174 /* Handle non-ONCE brackets and possessive ONCEs (which have been
5175 converted to non-capturing above). */
5176
5177 else
5178 {
5179 /* In the compile phase, check for empty string matching. */
5180
5181 if (lengthptr == NULL)
5182 {
5183 pcre_uchar *scode = bracode;
5184 do
5185 {
5186 if (could_be_empty_branch(scode, ketcode, utf, cd, NULL))
5187 {
5188 *bracode += OP_SBRA - OP_BRA;
5189 break;
5190 }
5191 scode += GET(scode, 1);
5192 }
5193 while (*scode == OP_ALT);
5194 }
5195
5196 /* Handle possessive quantifiers. */
5197
5198 if (possessive_quantifier)
5199 {
5200 /* For COND brackets, we wrap the whole thing in a possessively
5201 repeated non-capturing bracket, because we have not invented POS
5202 versions of the COND opcodes. Because we are moving code along, we
5203 must ensure that any pending recursive references are updated. */
5204
5205 if (*bracode == OP_COND || *bracode == OP_SCOND)
5206 {
5207 int nlen = (int)(code - bracode);
5208 *code = OP_END;
5209 adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm);
5210 memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
5211 code += 1 + LINK_SIZE;
5212 nlen += 1 + LINK_SIZE;
5213 *bracode = OP_BRAPOS;
5214 *code++ = OP_KETRPOS;
5215 PUTINC(code, 0, nlen);
5216 PUT(bracode, 1, nlen);
5217 }
5218
5219 /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
5220
5221 else
5222 {
5223 *bracode += 1; /* Switch to xxxPOS opcodes */
5224 *ketcode = OP_KETRPOS;
5225 }
5226
5227 /* If the minimum is zero, mark it as possessive, then unset the
5228 possessive flag when the minimum is 0 or 1. */
5229
5230 if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
5231 if (repeat_min < 2) possessive_quantifier = FALSE;
5232 }
5233
5234 /* Non-possessive quantifier */
5235
5236 else *ketcode = OP_KETRMAX + repeat_type;
5237 }
5238 }
5239 }
5240
5241 /* If previous is OP_FAIL, it was generated by an empty class [] in
5242 JavaScript mode. The other ways in which OP_FAIL can be generated, that is
5243 by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
5244 error above. We can just ignore the repeat in the JS case. */
5245
5246 else if (*previous == OP_FAIL) goto END_REPEAT;
5247
5248 /* Else there's some kind of shambles */
5249
5250 else
5251 {
5252 *errorcodeptr = ERR11;
5253 goto FAILED;
5254 }
5255
5256 /* If the character following a repeat is '+', or if certain optimization
5257 tests above succeeded, possessive_quantifier is TRUE. For some opcodes,
5258 there are special alternative opcodes for this case. For anything else, we
5259 wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'
5260 notation is just syntactic sugar, taken from Sun's Java package, but the
5261 special opcodes can optimize it.
5262
5263 Some (but not all) possessively repeated subpatterns have already been
5264 completely handled in the code just above. For them, possessive_quantifier
5265 is always FALSE at this stage.
5266
5267 Note that the repeated item starts at tempcode, not at previous, which
5268 might be the first part of a string whose (former) last char we repeated.
5269
5270 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
5271 an 'upto' may follow. We skip over an 'exact' item, and then test the
5272 length of what remains before proceeding. */
5273
5274 if (possessive_quantifier)
5275 {
5276 int len;
5277
5278 if (*tempcode == OP_TYPEEXACT)
5279 tempcode += PRIV(OP_lengths)[*tempcode] +
5280 ((tempcode[1 + IMM2_SIZE] == OP_PROP
5281 || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
5282
5283 else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
5284 {
5285 tempcode += PRIV(OP_lengths)[*tempcode];
5286 #ifdef SUPPORT_UTF
5287 if (utf && HAS_EXTRALEN(tempcode[-1]))
5288 tempcode += GET_EXTRALEN(tempcode[-1]);
5289 #endif
5290 }
5291
5292 len = (int)(code - tempcode);
5293 if (len > 0) switch (*tempcode)
5294 {
5295 case OP_STAR: *tempcode = OP_POSSTAR; break;
5296 case OP_PLUS: *tempcode = OP_POSPLUS; break;
5297 case OP_QUERY: *tempcode = OP_POSQUERY; break;
5298 case OP_UPTO: *tempcode = OP_POSUPTO; break;
5299
5300 case OP_STARI: *tempcode = OP_POSSTARI; break;
5301 case OP_PLUSI: *tempcode = OP_POSPLUSI; break;
5302 case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
5303 case OP_UPTOI: *tempcode = OP_POSUPTOI; break;
5304
5305 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
5306 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
5307 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
5308 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
5309
5310 case OP_NOTSTARI: *tempcode = OP_NOTPOSSTARI; break;
5311 case OP_NOTPLUSI: *tempcode = OP_NOTPOSPLUSI; break;
5312 case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
5313 case OP_NOTUPTOI: *tempcode = OP_NOTPOSUPTOI; break;
5314
5315 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
5316 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
5317 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
5318 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
5319
5320 /* Because we are moving code along, we must ensure that any
5321 pending recursive references are updated. */
5322
5323 default:
5324 *code = OP_END;
5325 adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
5326 memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
5327 code += 1 + LINK_SIZE;
5328 len += 1 + LINK_SIZE;
5329 tempcode[0] = OP_ONCE;
5330 *code++ = OP_KET;
5331 PUTINC(code, 0, len);
5332 PUT(tempcode, 1, len);
5333 break;
5334 }
5335 }
5336
5337 /* In all cases we no longer have a previous item. We also set the
5338 "follows varying string" flag for subsequently encountered reqchars if
5339 it isn't already set and we have just passed a varying length item. */
5340
5341 END_REPEAT:
5342 previous = NULL;
5343 cd->req_varyopt |= reqvary;
5344 break;
5345
5346
5347 /* ===================================================================*/
5348 /* Start of nested parenthesized sub-expression, or comment or lookahead or
5349 lookbehind or option setting or condition or all the other extended
5350 parenthesis forms. */
5351
5352 case CHAR_LEFT_PARENTHESIS:
5353 newoptions = options;
5354 skipbytes = 0;
5355 bravalue = OP_CBRA;
5356 save_hwm = cd->hwm;
5357 reset_bracount = FALSE;
5358
5359 /* First deal with various "verbs" that can be introduced by '*'. */
5360
5361 ptr++;
5362 if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
5363 || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
5364 {
5365 int i, namelen;
5366 int arglen = 0;
5367 const char *vn = verbnames;
5368 const pcre_uchar *name = ptr + 1;
5369 const pcre_uchar *arg = NULL;
5370 previous = NULL;
5371 ptr++;
5372 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
5373 namelen = (int)(ptr - name);
5374
5375 /* It appears that Perl allows any characters whatsoever, other than
5376 a closing parenthesis, to appear in arguments, so we no longer insist on
5377 letters, digits, and underscores. */
5378
5379 if (*ptr == CHAR_COLON)
5380 {
5381 arg = ++ptr;
5382 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
5383 arglen = (int)(ptr - arg);
5384 if ((unsigned int)arglen > MAX_MARK)
5385 {
5386 *errorcodeptr = ERR75;
5387 goto FAILED;
5388 }
5389 }
5390
5391 if (*ptr != CHAR_RIGHT_PARENTHESIS)
5392 {
5393 *errorcodeptr = ERR60;
5394 goto FAILED;
5395 }
5396
5397 /* Scan the table of verb names */
5398
5399 for (i = 0; i < verbcount; i++)
5400 {
5401 if (namelen == verbs[i].len &&
5402 STRNCMP_UC_C8(name, vn, namelen) == 0)
5403 {
5404 int setverb;
5405
5406 /* Check for open captures before ACCEPT and convert it to
5407 ASSERT_ACCEPT if in an assertion. */
5408
5409 if (verbs[i].op == OP_ACCEPT)
5410 {
5411 open_capitem *oc;
5412 if (arglen != 0)
5413 {
5414 *errorcodeptr = ERR59;
5415 goto FAILED;
5416 }
5417 cd->had_accept = TRUE;
5418 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
5419 {
5420 *code++ = OP_CLOSE;
5421 PUT2INC(code, 0, oc->number);
5422 }
5423 setverb = *code++ =
5424 (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
5425
5426 /* Do not set firstchar after *ACCEPT */
5427 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5428 }
5429
5430 /* Handle other cases with/without an argument */
5431
5432 else if (arglen == 0)
5433 {
5434 if (verbs[i].op < 0) /* Argument is mandatory */
5435 {
5436 *errorcodeptr = ERR66;
5437 goto FAILED;
5438 }
5439 setverb = *code++ = verbs[i].op;
5440 }
5441
5442 else
5443 {
5444 if (verbs[i].op_arg < 0) /* Argument is forbidden */
5445 {
5446 *errorcodeptr = ERR59;
5447 goto FAILED;
5448 }
5449 setverb = *code++ = verbs[i].op_arg;
5450 *code++ = arglen;
5451 memcpy(code, arg, IN_UCHARS(arglen));
5452 code += arglen;
5453 *code++ = 0;
5454 }
5455
5456 switch (setverb)
5457 {
5458 case OP_THEN:
5459 case OP_THEN_ARG:
5460 cd->external_flags |= PCRE_HASTHEN;
5461 break;
5462
5463 case OP_PRUNE:
5464 case OP_PRUNE_ARG:
5465 case OP_SKIP:
5466 case OP_SKIP_ARG:
5467 cd->had_pruneorskip = TRUE;
5468 break;
5469 }
5470
5471 break; /* Found verb, exit loop */
5472 }
5473
5474 vn += verbs[i].len + 1;
5475 }
5476
5477 if (i < verbcount) continue; /* Successfully handled a verb */
5478 *errorcodeptr = ERR60; /* Verb not recognized */
5479 goto FAILED;
5480 }
5481
5482 /* Deal with the extended parentheses; all are introduced by '?', and the
5483 appearance of any of them means that this is not a capturing group. */
5484
5485 else if (*ptr == CHAR_QUESTION_MARK)
5486 {
5487 int i, set, unset, namelen;
5488 int *optset;
5489 const pcre_uchar *name;
5490 pcre_uchar *slot;
5491
5492 switch (*(++ptr))
5493 {
5494 case CHAR_NUMBER_SIGN: /* Comment; skip to ket */
5495 ptr++;
5496 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
5497 if (*ptr == CHAR_NULL)
5498 {
5499 *errorcodeptr = ERR18;
5500 goto FAILED;
5501 }
5502 continue;
5503
5504
5505 /* ------------------------------------------------------------ */
5506 case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
5507 reset_bracount = TRUE;
5508 /* Fall through */
5509
5510 /* ------------------------------------------------------------ */
5511 case CHAR_COLON: /* Non-capturing bracket */
5512 bravalue = OP_BRA;
5513 ptr++;
5514 break;
5515
5516
5517 /* ------------------------------------------------------------ */
5518 case CHAR_LEFT_PARENTHESIS:
5519 bravalue = OP_COND; /* Conditional group */
5520 tempptr = ptr;
5521
5522 /* A condition can be an assertion, a number (referring to a numbered
5523 group), a name (referring to a named group), or 'R', referring to
5524 recursion. R<digits> and R&name are also permitted for recursion tests.
5525
5526 There are several syntaxes for testing a named group: (?(name)) is used
5527 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
5528
5529 There are two unfortunate ambiguities, caused by history. (a) 'R' can
5530 be the recursive thing or the name 'R' (and similarly for 'R' followed
5531 by digits), and (b) a number could be a name that consists of digits.
5532 In both cases, we look for a name first; if not found, we try the other
5533 cases.
5534
5535 For compatibility with auto-callouts, we allow a callout to be
5536 specified before a condition that is an assertion. First, check for the
5537 syntax of a callout; if found, adjust the temporary pointer that is
5538 used to check for an assertion condition. That's all that is needed! */
5539
5540 if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
5541 {
5542 for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
5543 if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
5544 tempptr += i + 1;
5545 }
5546
5547 /* For conditions that are assertions, check the syntax, and then exit
5548 the switch. This will take control down to where bracketed groups,
5549 including assertions, are processed. */
5550
5551 if (tempptr[1] == CHAR_QUESTION_MARK &&
5552 (tempptr[2] == CHAR_EQUALS_SIGN ||
5553 tempptr[2] == CHAR_EXCLAMATION_MARK ||
5554 tempptr[2] == CHAR_LESS_THAN_SIGN))
5555 break;
5556
5557 /* Most other conditions use OP_CREF (a couple change to OP_RREF
5558 below), and all need to skip 1+IMM2_SIZE bytes at the start of the group. */
5559
5560 code[1+LINK_SIZE] = OP_CREF;
5561 skipbytes = 1+IMM2_SIZE;
5562 refsign = -1;
5563
5564 /* Check for a test for recursion in a named group. */
5565
5566 if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
5567 {
5568 terminator = -1;
5569 ptr += 2;
5570 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
5571 }
5572
5573 /* Check for a test for a named group's having been set, using the Perl
5574 syntax (?(<name>) or (?('name') */
5575
5576 else if (ptr[1] == CHAR_LESS_THAN_SIGN)
5577 {
5578 terminator = CHAR_GREATER_THAN_SIGN;
5579 ptr++;
5580 }
5581 else if (ptr[1] == CHAR_APOSTROPHE)
5582 {
5583 terminator = CHAR_APOSTROPHE;
5584 ptr++;
5585 }
5586 else
5587 {
5588 terminator = CHAR_NULL;
5589 if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
5590 }
5591
5592 /* We now expect to read a name; anything else is an error */
5593
5594 if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)
5595 {
5596 ptr += 1; /* To get the right offset */
5597 *errorcodeptr = ERR28;
5598 goto FAILED;
5599 }
5600
5601 /* Read the name, but also get it as a number if it's all digits */
5602
5603 recno = 0;
5604 name = ++ptr;
5605 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
5606 {
5607 if (recno >= 0)
5608 recno = (IS_DIGIT(*ptr))? recno * 10 + (int)(*ptr - CHAR_0) : -1;
5609 ptr++;
5610 }
5611 namelen = (int)(ptr - name);
5612
5613 if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
5614 *ptr++ != CHAR_RIGHT_PARENTHESIS)
5615 {
5616 ptr--; /* Error offset */
5617 *errorcodeptr = ERR26;
5618 goto FAILED;
5619 }
5620
5621 /* Do no further checking in the pre-compile phase. */
5622
5623 if (lengthptr != NULL) break;
5624
5625 /* In the real compile we do the work of looking for the actual
5626 reference. If the string started with "+" or "-" we require the rest to
5627 be digits, in which case recno will be set. */
5628
5629 if (refsign > 0)
5630 {
5631 if (recno <= 0)
5632 {
5633 *errorcodeptr = ERR58;
5634 goto FAILED;
5635 }
5636 recno = (refsign == CHAR_MINUS)?
5637 cd->bracount - recno + 1 : recno +cd->bracount;
5638 if (recno <= 0 || recno > cd->final_bracount)
5639 {
5640 *errorcodeptr = ERR15;
5641 goto FAILED;
5642 }
5643 PUT2(code, 2+LINK_SIZE, recno);
5644 break;
5645 }
5646
5647 /* Otherwise (did not start with "+" or "-"), start by looking for the
5648 name. If we find a name, add one to the opcode to change OP_CREF or
5649 OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,
5650 except they record that the reference was originally to a name. The
5651 information is used to check duplicate names. */
5652
5653 slot = cd->name_table;
5654 for (i = 0; i < cd->names_found; i++)
5655 {
5656 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
5657 slot += cd->name_entry_size;
5658 }
5659
5660 /* Found the named subpattern */
5661
5662 if (i < cd->names_found)
5663 {
5664 recno = GET2(slot, 0);
5665 PUT2(code, 2+LINK_SIZE, recno);
5666 code[1+LINK_SIZE]++;
5667 }
5668
5669 /* If terminator == CHAR_NULL it means that the name followed directly
5670 after the opening parenthesis [e.g. (?(abc)...] and in this case there
5671 are some further alternatives to try. For the cases where terminator !=
5672 0 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
5673 now checked all the possibilities, so give an error. */
5674
5675 else if (terminator != CHAR_NULL)
5676 {
5677 *errorcodeptr = ERR15;
5678 goto FAILED;
5679 }
5680
5681 /* Check for (?(R) for recursion. Allow digits after R to specify a
5682 specific group number. */
5683
5684 else if (*name == CHAR_R)
5685 {
5686 recno = 0;
5687 for (i = 1; i < namelen; i++)
5688 {
5689 if (!IS_DIGIT(name[i]))
5690 {
5691 *errorcodeptr = ERR15;
5692 goto FAILED;
5693 }
5694 recno = recno * 10 + name[i] - CHAR_0;
5695 }
5696 if (recno == 0) recno = RREF_ANY;
5697 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
5698 PUT2(code, 2+LINK_SIZE, recno);
5699 }
5700
5701 /* Similarly, check for the (?(DEFINE) "condition", which is always
5702 false. */
5703
5704 else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
5705 {
5706 code[1+LINK_SIZE] = OP_DEF;
5707 skipbytes = 1;
5708 }
5709
5710 /* Check for the "name" actually being a subpattern number. We are
5711 in the second pass here, so final_bracount is set. */
5712
5713 else if (recno > 0 && recno <= cd->final_bracount)
5714 {
5715 PUT2(code, 2+LINK_SIZE, recno);
5716 }
5717
5718 /* Either an unidentified subpattern, or a reference to (?(0) */
5719
5720 else
5721 {
5722 *errorcodeptr = (recno == 0)? ERR35: ERR15;
5723 goto FAILED;
5724 }
5725 break;
5726
5727
5728 /* ------------------------------------------------------------ */
5729 case CHAR_EQUALS_SIGN: /* Positive lookahead */
5730 bravalue = OP_ASSERT;
5731 cd->assert_depth += 1;
5732 ptr++;
5733 break;
5734
5735
5736 /* ------------------------------------------------------------ */
5737 case CHAR_EXCLAMATION_MARK: /* Negative lookahead */
5738 ptr++;
5739 if (*ptr == CHAR_RIGHT_PARENTHESIS) /* Optimize (?!) */
5740 {
5741 *code++ = OP_FAIL;
5742 previous = NULL;
5743 continue;
5744 }
5745 bravalue = OP_ASSERT_NOT;
5746 cd->assert_depth += 1;
5747 break;
5748
5749
5750 /* ------------------------------------------------------------ */
5751 case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */
5752 switch (ptr[1])
5753 {
5754 case CHAR_EQUALS_SIGN: /* Positive lookbehind */
5755 bravalue = OP_ASSERTBACK;
5756 cd->assert_depth += 1;
5757 ptr += 2;
5758 break;
5759
5760 case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */
5761 bravalue = OP_ASSERTBACK_NOT;
5762 cd->assert_depth += 1;
5763 ptr += 2;
5764 break;
5765
5766 default: /* Could be name define, else bad */
5767 if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
5768 goto DEFINE_NAME;
5769 ptr++; /* Correct offset for error */
5770 *errorcodeptr = ERR24;
5771 goto FAILED;
5772 }
5773 break;
5774
5775
5776 /* ------------------------------------------------------------ */
5777 case CHAR_GREATER_THAN_SIGN: /* One-time brackets */
5778 bravalue = OP_ONCE;
5779 ptr++;
5780 break;
5781
5782
5783 /* ------------------------------------------------------------ */
5784 case CHAR_C: /* Callout - may be followed by digits; */
5785 previous_callout = code; /* Save for later completion */
5786 after_manual_callout = 1; /* Skip one item before completing */
5787 *code++ = OP_CALLOUT;
5788 {
5789 int n = 0;
5790 ptr++;
5791 while(IS_DIGIT(*ptr))
5792 n = n * 10 + *ptr++ - CHAR_0;
5793 if (*ptr != CHAR_RIGHT_PARENTHESIS)
5794 {
5795 *errorcodeptr = ERR39;
5796 goto FAILED;
5797 }
5798 if (n > 255)
5799 {
5800 *errorcodeptr = ERR38;
5801 goto FAILED;
5802 }
5803 *code++ = n;
5804 PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
5805 PUT(code, LINK_SIZE, 0); /* Default length */
5806 code += 2 * LINK_SIZE;
5807 }
5808 previous = NULL;
5809 continue;
5810
5811
5812 /* ------------------------------------------------------------ */
5813 case CHAR_P: /* Python-style named subpattern handling */
5814 if (*(++ptr) == CHAR_EQUALS_SIGN ||
5815 *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */
5816 {
5817 is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
5818 terminator = CHAR_RIGHT_PARENTHESIS;
5819 goto NAMED_REF_OR_RECURSE;
5820 }
5821 else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */
5822 {
5823 *errorcodeptr = ERR41;
5824 goto FAILED;
5825 }
5826 /* Fall through to handle (?P< as (?< is handled */
5827
5828
5829 /* ------------------------------------------------------------ */
5830 DEFINE_NAME: /* Come here from (?< handling */
5831 case CHAR_APOSTROPHE:
5832 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
5833 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5834 name = ++ptr;
5835
5836 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
5837 namelen = (int)(ptr - name);
5838
5839 /* In the pre-compile phase, do a syntax check, remember the longest
5840 name, and then remember the group in a vector, expanding it if
5841 necessary. Duplicates for the same number are skipped; other duplicates
5842 are checked for validity. In the actual compile, there is nothing to
5843 do. */
5844
5845 if (lengthptr != NULL)
5846 {
5847 named_group *ng;
5848 pcre_uint32 number = cd->bracount + 1;
5849
5850 if (*ptr != (pcre_uchar)terminator)
5851 {
5852 *errorcodeptr = ERR42;
5853 goto FAILED;
5854 }
5855
5856 if (cd->names_found >= MAX_NAME_COUNT)
5857 {
5858 *errorcodeptr = ERR49;
5859 goto FAILED;
5860 }
5861
5862 if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
5863 {
5864 cd->name_entry_size = namelen + IMM2_SIZE + 1;
5865 if (namelen > MAX_NAME_SIZE)
5866 {
5867 *errorcodeptr = ERR48;
5868 goto FAILED;
5869 }
5870 }
5871
5872 /* Scan the list to check for duplicates. For duplicate names, if the
5873 number is the same, break the loop, which causes the name to be
5874 discarded; otherwise, if DUPNAMES is not set, give an error.
5875 If it is set, allow the name with a different number, but continue
5876 scanning in case this is a duplicate with the same number. For
5877 non-duplicate names, give an error if the number is duplicated. */
5878
5879 ng = cd->named_groups;
5880 for (i = 0; i < cd->names_found; i++, ng++)
5881 {
5882 if (namelen == ng->length &&
5883 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
5884 {
5885 if (ng->number == number) break;
5886 if ((options & PCRE_DUPNAMES) == 0)
5887 {
5888 *errorcodeptr = ERR43;
5889 goto FAILED;
5890 }
5891 cd->dupnames = TRUE; /* Duplicate names exist */
5892 }
5893 else if (ng->number == number)
5894 {
5895 *errorcodeptr = ERR65;
5896 goto FAILED;
5897 }
5898 }
5899
5900 if (i >= cd->names_found) /* Not a duplicate with same number */
5901 {
5902 /* Increase the list size if necessary */
5903
5904 if (cd->names_found >= cd->named_group_list_size)
5905 {
5906 int newsize = cd->named_group_list_size * 2;
5907 named_group *newspace = (PUBL(malloc))
5908 (newsize * sizeof(named_group));
5909
5910 if (newspace == NULL)
5911 {
5912 *errorcodeptr = ERR21;
5913 goto FAILED;
5914 }
5915
5916 memcpy(newspace, cd->named_groups,
5917 cd->named_group_list_size * sizeof(named_group));
5918 if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
5919 (PUBL(free))((void *)cd->named_groups);
5920 cd->named_groups = newspace;
5921 cd->named_group_list_size = newsize;
5922 }
5923
5924 cd->named_groups[cd->names_found].name = name;
5925 cd->named_groups[cd->names_found].length = namelen;
5926 cd->named_groups[cd->names_found].number = number;
5927 cd->names_found++;
5928 }
5929 }
5930
5931 ptr++; /* Move past > or ' in both passes. */
5932 goto NUMBERED_GROUP;
5933
5934
5935 /* ------------------------------------------------------------ */
5936 case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */
5937 terminator = CHAR_RIGHT_PARENTHESIS;
5938 is_recurse = TRUE;
5939 /* Fall through */
5940
5941 /* We come here from the Python syntax above that handles both
5942 references (?P=name) and recursion (?P>name), as well as falling
5943 through from the Perl recursion syntax (?&name). We also come here from
5944 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
5945 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
5946
5947 NAMED_REF_OR_RECURSE:
5948 name = ++ptr;
5949 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
5950 namelen = (int)(ptr - name);
5951
5952 /* In the pre-compile phase, do a syntax check. We used to just set
5953 a dummy reference number, because it was not used in the first pass.
5954 However, with the change of recursive back references to be atomic,
5955 we have to look for the number so that this state can be identified, as
5956 otherwise the incorrect length is computed. If it's not a backwards
5957 reference, the dummy number will do. */
5958
5959 if (lengthptr != NULL)
5960 {
5961 named_group *ng;
5962
5963 if (namelen == 0)
5964 {
5965 *errorcodeptr = ERR62;
5966 goto FAILED;
5967 }
5968 if (*ptr != (pcre_uchar)terminator)
5969 {
5970 *errorcodeptr = ERR42;
5971 goto FAILED;
5972 }
5973 if (namelen > MAX_NAME_SIZE)
5974 {
5975 *errorcodeptr = ERR48;
5976 goto FAILED;
5977 }
5978
5979 /* The name table does not exist in the first pass; instead we must
5980 scan the list of names encountered so far in order to get the
5981 number. If the name is not found, set the value to 0 for a forward
5982 reference. */
5983
5984 ng = cd->named_groups;
5985 for (i = 0; i < cd->names_found; i++, ng++)
5986 {
5987 if (namelen == ng->length &&
5988 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
5989 break;
5990 }
5991 recno = (i < cd->names_found)? ng->number : 0;
5992
5993 /* Count named back references. */
5994
5995 if (!is_recurse) cd->namedrefcount++;
5996 }
5997
5998 /* In the real compile, search the name table. We check the name
5999 first, and then check that we have reached the end of the name in the
6000 table. That way, if the name is longer than any in the table, the
6001 comparison will fail without reading beyond the table entry. */
6002
6003 else
6004 {
6005 slot = cd->name_table;
6006 for (i = 0; i < cd->names_found; i++)
6007 {
6008 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
6009 slot[IMM2_SIZE+namelen] == 0)
6010 break;
6011 slot += cd->name_entry_size;
6012 }
6013
6014 if (i < cd->names_found)
6015 {
6016 recno = GET2(slot, 0);
6017 }
6018 else
6019 {
6020 *errorcodeptr = ERR15;
6021 goto FAILED;
6022 }
6023 }
6024
6025 /* In both phases, for recursions, we can now go to the code that
6026 handles numerical recursion. */
6027
6028 if (is_recurse) goto HANDLE_RECURSION;
6029
6030 /* In the second pass we must see if the name is duplicated. If so, we
6031 generate a different opcode. */
6032
6033 if (lengthptr == NULL && cd->dupnames)
6034 {
6035 int count = 1;
6036 unsigned int index = i;
6037 pcre_uchar *cslot = slot + cd->name_entry_size;
6038
6039 for (i++; i < cd->names_found; i++)
6040 {
6041 if (STRCMP_UC_UC(slot + IMM2_SIZE, cslot + IMM2_SIZE) != 0) break;
6042 count++;
6043 cslot += cd->name_entry_size;
6044 }
6045
6046 if (count > 1)
6047 {
6048 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
6049 previous = code;
6050 *code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
6051 PUT2INC(code, 0, index);
6052 PUT2INC(code, 0, count);
6053
6054 /* Process each potentially referenced group. */
6055
6056 for (; slot < cslot; slot += cd->name_entry_size)
6057 {
6058 open_capitem *oc;
6059 recno = GET2(slot, 0);
6060 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
6061 if (recno > cd->top_backref) cd->top_backref = recno;
6062
6063 /* Check to see if this back reference is recursive, that is, it
6064 is inside the group that it references. A flag is set so that the
6065 group can be made atomic. */
6066
6067 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6068 {
6069 if (oc->number == recno)
6070 {
6071 oc->flag = TRUE;
6072 break;
6073 }
6074 }
6075 }
6076
6077 continue; /* End of back ref handling */
6078 }
6079 }
6080
6081 /* First pass, or a non-duplicated name. */
6082
6083 goto HANDLE_REFERENCE;
6084
6085
6086 /* ------------------------------------------------------------ */
6087 case CHAR_R: /* Recursion */
6088 ptr++; /* Same as (?0) */
6089 /* Fall through */
6090
6091
6092 /* ------------------------------------------------------------ */
6093 case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */
6094 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
6095 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
6096 {
6097 const pcre_uchar *called;
6098 terminator = CHAR_RIGHT_PARENTHESIS;
6099
6100 /* Come here from the \g<...> and \g'...' code (Oniguruma
6101 compatibility). However, the syntax has been checked to ensure that
6102 the ... are a (signed) number, so that neither ERR63 nor ERR29 will
6103 be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
6104 ever be taken. */
6105
6106 HANDLE_NUMERICAL_RECURSION:
6107
6108 if ((refsign = *ptr) == CHAR_PLUS)
6109 {
6110 ptr++;
6111 if (!IS_DIGIT(*ptr))
6112 {
6113 *errorcodeptr = ERR63;
6114 goto FAILED;
6115 }
6116 }
6117 else if (refsign == CHAR_MINUS)
6118 {
6119 if (!IS_DIGIT(ptr[1]))
6120 goto OTHER_CHAR_AFTER_QUERY;
6121 ptr++;
6122 }
6123
6124 recno = 0;
6125 while(IS_DIGIT(*ptr))
6126 recno = recno * 10 + *ptr++ - CHAR_0;
6127
6128 if (*ptr != (pcre_uchar)terminator)
6129 {
6130 *errorcodeptr = ERR29;
6131 goto FAILED;
6132 }
6133
6134 if (refsign == CHAR_MINUS)
6135 {
6136 if (recno == 0)
6137 {
6138 *errorcodeptr = ERR58;
6139 goto FAILED;
6140 }
6141 recno = cd->bracount - recno + 1;
6142 if (recno <= 0)
6143 {
6144 *errorcodeptr = ERR15;
6145 goto FAILED;
6146 }
6147 }
6148 else if (refsign == CHAR_PLUS)
6149 {
6150 if (recno == 0)
6151 {
6152 *errorcodeptr = ERR58;
6153 goto FAILED;
6154 }
6155 recno += cd->bracount;
6156 }
6157
6158 /* Come here from code above that handles a named recursion */
6159
6160 HANDLE_RECURSION:
6161
6162 previous = code;
6163 called = cd->start_code;
6164
6165 /* When we are actually compiling, find the bracket that is being
6166 referenced. Temporarily end the regex in case it doesn't exist before
6167 this point. If we end up with a forward reference, first check that
6168 the bracket does occur later so we can give the error (and position)
6169 now. Then remember this forward reference in the workspace so it can
6170 be filled in at the end. */
6171
6172 if (lengthptr == NULL)
6173 {
6174 *code = OP_END;
6175 if (recno != 0)
6176 called = PRIV(find_bracket)(cd->start_code, utf, recno);
6177
6178 /* Forward reference */
6179
6180 if (called == NULL)
6181 {
6182 if (recno > cd->final_bracount)
6183 {
6184 *errorcodeptr = ERR15;
6185 goto FAILED;
6186 }
6187
6188 /* Fudge the value of "called" so that when it is inserted as an
6189 offset below, what is actually inserted is the reference number
6190 of the group. Then remember the forward reference. */
6191
6192 called = cd->start_code + recno;
6193 if (cd->hwm >= cd->start_workspace + cd->workspace_size -
6194 WORK_SIZE_SAFETY_MARGIN)
6195 {
6196 *errorcodeptr = expand_workspace(cd);
6197 if (*errorcodeptr != 0) goto FAILED;
6198 }
6199 PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
6200 }
6201
6202 /* If not a forward reference, and the subpattern is still open,
6203 this is a recursive call. We check to see if this is a left
6204 recursion that could loop for ever, and diagnose that case. We
6205 must not, however, do this check if we are in a conditional
6206 subpattern because the condition might be testing for recursion in
6207 a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid.
6208 Forever loops are also detected at runtime, so those that occur in
6209 conditional subpatterns will be picked up then. */
6210
6211 else if (GET(called, 1) == 0 && cond_depth <= 0 &&
6212 could_be_empty(called, code, bcptr, utf, cd))
6213 {
6214 *errorcodeptr = ERR40;
6215 goto FAILED;
6216 }
6217 }
6218
6219 /* Insert the recursion/subroutine item. It does not have a set first
6220 character (relevant if it is repeated, because it will then be
6221 wrapped with ONCE brackets). */
6222
6223 *code = OP_RECURSE;
6224 PUT(code, 1, (int)(called - cd->start_code));
6225 code += 1 + LINK_SIZE;
6226 groupsetfirstchar = FALSE;
6227 }
6228
6229 /* Can't determine a first byte now */
6230
6231 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
6232 continue;
6233
6234
6235 /* ------------------------------------------------------------ */
6236 default: /* Other characters: check option setting */
6237 OTHER_CHAR_AFTER_QUERY:
6238 set = unset = 0;
6239 optset = &set;
6240
6241 while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
6242 {
6243 switch (*ptr++)
6244 {
6245 case CHAR_MINUS: optset = &unset; break;
6246
6247 case CHAR_J: /* Record that it changed in the external options */
6248 *optset |= PCRE_DUPNAMES;
6249 cd->external_flags |= PCRE_JCHANGED;
6250 break;
6251
6252 case CHAR_i: *optset |= PCRE_CASELESS; break;
6253 case CHAR_m: *optset |= PCRE_MULTILINE; break;
6254 case CHAR_s: *optset |= PCRE_DOTALL; break;
6255 case CHAR_x: *optset |= PCRE_EXTENDED; break;
6256 case CHAR_U: *optset |= PCRE_UNGREEDY; break;
6257 case CHAR_X: *optset |= PCRE_EXTRA; break;
6258
6259 default: *errorcodeptr = ERR12;
6260 ptr--; /* Correct the offset */
6261 goto FAILED;
6262 }
6263 }
6264
6265 /* Set up the changed option bits, but don't change anything yet. */
6266
6267 newoptions = (options | set) & (~unset);
6268
6269 /* If the options ended with ')' this is not the start of a nested
6270 group with option changes, so the options change at this level. If this
6271 item is right at the start of the pattern, the options can be
6272 abstracted and made external in the pre-compile phase, and ignored in
6273 the compile phase. This can be helpful when matching -- for instance in
6274 caseless checking of required bytes.
6275
6276 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
6277 definitely *not* at the start of the pattern because something has been
6278 compiled. In the pre-compile phase, however, the code pointer can have
6279 that value after the start, because it gets reset as code is discarded
6280 during the pre-compile. However, this can happen only at top level - if
6281 we are within parentheses, the starting BRA will still be present. At
6282 any parenthesis level, the length value can be used to test if anything
6283 has been compiled at that level. Thus, a test for both these conditions
6284 is necessary to ensure we correctly detect the start of the pattern in
6285 both phases.
6286
6287 If we are not at the pattern start, reset the greedy defaults and the
6288 case value for firstchar and reqchar. */
6289
6290 if (*ptr == CHAR_RIGHT_PARENTHESIS)
6291 {
6292 if (code == cd->start_code + 1 + LINK_SIZE &&
6293 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
6294 {
6295 cd->external_options = newoptions;
6296 }
6297 else
6298 {
6299 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
6300 greedy_non_default = greedy_default ^ 1;
6301 req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
6302 }
6303
6304 /* Change options at this level, and pass them back for use
6305 in subsequent branches. */
6306
6307 *optionsptr = options = newoptions;
6308 previous = NULL; /* This item can't be repeated */
6309 continue; /* It is complete */
6310 }
6311
6312 /* If the options ended with ':' we are heading into a nested group
6313 with possible change of options. Such groups are non-capturing and are
6314 not assertions of any kind. All we need to do is skip over the ':';
6315 the newoptions value is handled below. */
6316
6317 bravalue = OP_BRA;
6318 ptr++;
6319 } /* End of switch for character following (? */
6320 } /* End of (? handling */
6321
6322 /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
6323 is set, all unadorned brackets become non-capturing and behave like (?:...)
6324 brackets. */
6325
6326 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
6327 {
6328 bravalue = OP_BRA;
6329 }
6330
6331 /* Else we have a capturing group. */
6332
6333 else
6334 {
6335 NUMBERED_GROUP:
6336 cd->bracount += 1;
6337 PUT2(code, 1+LINK_SIZE, cd->bracount);
6338 skipbytes = IMM2_SIZE;
6339 }
6340
6341 /* Process nested bracketed regex. Assertions used not to be repeatable,
6342 but this was changed for Perl compatibility, so all kinds can now be
6343 repeated. We copy code into a non-register variable (tempcode) in order to
6344 be able to pass its address because some compilers complain otherwise. */
6345
6346 previous = code; /* For handling repetition */
6347 *code = bravalue;
6348 tempcode = code;
6349 tempreqvary = cd->req_varyopt; /* Save value before bracket */
6350 tempbracount = cd->bracount; /* Save value before bracket */
6351 length_prevgroup = 0; /* Initialize for pre-compile phase */
6352
6353 if (!compile_regex(
6354 newoptions, /* The complete new option state */
6355 &tempcode, /* Where to put code (updated) */
6356 &ptr, /* Input pointer (updated) */
6357 errorcodeptr, /* Where to put an error message */
6358 (bravalue == OP_ASSERTBACK ||
6359 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
6360 reset_bracount, /* True if (?| group */
6361 skipbytes, /* Skip over bracket number */
6362 cond_depth +
6363 ((bravalue == OP_COND)?1:0), /* Depth of condition subpatterns */
6364 &subfirstchar, /* For possible first char */
6365 &subfirstcharflags,
6366 &subreqchar, /* For possible last char */
6367 &subreqcharflags,
6368 bcptr, /* Current branch chain */
6369 cd, /* Tables block */
6370 (lengthptr == NULL)? NULL : /* Actual compile phase */
6371 &length_prevgroup /* Pre-compile phase */
6372 ))
6373 goto FAILED;
6374
6375 /* If this was an atomic group and there are no capturing groups within it,
6376 generate OP_ONCE_NC instead of OP_ONCE. */
6377
6378 if (bravalue == OP_ONCE && cd->bracount <= tempbracount)
6379 *code = OP_ONCE_NC;
6380
6381 if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
6382 cd->assert_depth -= 1;
6383
6384 /* At the end of compiling, code is still pointing to the start of the
6385 group, while tempcode has been updated to point past the end of the group.
6386 The pattern pointer (ptr) is on the bracket.
6387
6388 If this is a conditional bracket, check that there are no more than
6389 two branches in the group, or just one if it's a DEFINE group. We do this
6390 in the real compile phase, not in the pre-pass, where the whole group may
6391 not be available. */
6392
6393 if (bravalue == OP_COND && lengthptr == NULL)
6394 {
6395 pcre_uchar *tc = code;
6396 int condcount = 0;
6397
6398 do {
6399 condcount++;
6400 tc += GET(tc,1);
6401 }
6402 while (*tc != OP_KET);
6403
6404 /* A DEFINE group is never obeyed inline (the "condition" is always
6405 false). It must have only one branch. */
6406
6407 if (code[LINK_SIZE+1] == OP_DEF)
6408 {
6409 if (condcount > 1)
6410 {
6411 *errorcodeptr = ERR54;
6412 goto FAILED;
6413 }
6414 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
6415 }
6416
6417 /* A "normal" conditional group. If there is just one branch, we must not
6418 make use of its firstchar or reqchar, because this is equivalent to an
6419 empty second branch. */
6420
6421 else
6422 {
6423 if (condcount > 2)
6424 {
6425 *errorcodeptr = ERR27;
6426 goto FAILED;
6427 }
6428 if (condcount == 1) subfirstcharflags = subreqcharflags = REQ_NONE;
6429 }
6430 }
6431
6432 /* Error if hit end of pattern */
6433
6434 if (*ptr != CHAR_RIGHT_PARENTHESIS)
6435 {
6436 *errorcodeptr = ERR14;
6437 goto FAILED;
6438 }
6439
6440 /* In the pre-compile phase, update the length by the length of the group,
6441 less the brackets at either end. Then reduce the compiled code to just a
6442 set of non-capturing brackets so that it doesn't use much memory if it is
6443 duplicated by a quantifier.*/
6444
6445 if (lengthptr != NULL)
6446 {
6447 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6448 {
6449 *errorcodeptr = ERR20;
6450 goto FAILED;
6451 }
6452 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6453 code++; /* This already contains bravalue */
6454 PUTINC(code, 0, 1 + LINK_SIZE);
6455 *code++ = OP_KET;
6456 PUTINC(code, 0, 1 + LINK_SIZE);
6457 break; /* No need to waste time with special character handling */
6458 }
6459
6460 /* Otherwise update the main code pointer to the end of the group. */
6461
6462 code = tempcode;
6463
6464 /* For a DEFINE group, required and first character settings are not
6465 relevant. */
6466
6467 if (bravalue == OP_DEF) break;
6468
6469 /* Handle updating of the required and first characters for other types of
6470 group. Update for normal brackets of all kinds, and conditions with two
6471 branches (see code above). If the bracket is followed by a quantifier with
6472 zero repeat, we have to back off. Hence the definition of zeroreqchar and
6473 zerofirstchar outside the main loop so that they can be accessed for the
6474 back off. */
6475
6476 zeroreqchar = reqchar;
6477 zeroreqcharflags = reqcharflags;
6478 zerofirstchar = firstchar;
6479 zerofirstcharflags = firstcharflags;
6480 groupsetfirstchar = FALSE;
6481
6482 if (bravalue >= OP_ONCE)
6483 {
6484 /* If we have not yet set a firstchar in this branch, take it from the
6485 subpattern, remembering that it was set here so that a repeat of more
6486 than one can replicate it as reqchar if necessary. If the subpattern has
6487 no firstchar, set "none" for the whole branch. In both cases, a zero
6488 repeat forces firstchar to "none". */
6489
6490 if (firstcharflags == REQ_UNSET)
6491 {
6492 if (subfirstcharflags >= 0)
6493 {
6494 firstchar = subfirstchar;
6495 firstcharflags = subfirstcharflags;
6496 groupsetfirstchar = TRUE;
6497 }
6498 else firstcharflags = REQ_NONE;
6499 zerofirstcharflags = REQ_NONE;
6500 }
6501
6502 /* If firstchar was previously set, convert the subpattern's firstchar
6503 into reqchar if there wasn't one, using the vary flag that was in
6504 existence beforehand. */
6505
6506 else if (subfirstcharflags >= 0 && subreqcharflags < 0)
6507 {
6508 subreqchar = subfirstchar;
6509 subreqcharflags = subfirstcharflags | tempreqvary;
6510 }
6511
6512 /* If the subpattern set a required byte (or set a first byte that isn't
6513 really the first byte - see above), set it. */
6514
6515 if (subreqcharflags >= 0)
6516 {
6517 reqchar = subreqchar;
6518 reqcharflags = subreqcharflags;
6519 }
6520 }
6521
6522 /* For a forward assertion, we take the reqchar, if set. This can be
6523 helpful if the pattern that follows the assertion doesn't set a different
6524 char. For example, it's useful for /(?=abcde).+/. We can't set firstchar
6525 for an assertion, however, because it leads to an incorrect effect for patterns
6526 such as /(?=a)a.+/ when the "real" "a" would then become a reqchar instead
6527 of a firstchar. This is overcome by a scan at the end if there's no
6528 firstchar, looking for an asserted first char. */
6529
6530 else if (bravalue == OP_ASSERT && subreqcharflags >= 0)
6531 {
6532 reqchar = subreqchar;
6533 reqcharflags = subreqcharflags;
6534 }
6535 break; /* End of processing '(' */
6536
6537
6538 /* ===================================================================*/
6539 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
6540 are arranged to be the negation of the corresponding OP_values in the
6541 default case when PCRE_UCP is not set. For the back references, the values
6542 are negative the reference number. Only back references and those types
6543 that consume a character may be repeated. We can test for values between
6544 ESC_b and ESC_Z for the latter; this may have to change if any new ones are
6545 ever created. */
6546
6547 case CHAR_BACKSLASH:
6548 tempptr = ptr;
6549 escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options, FALSE);
6550 if (*errorcodeptr != 0) goto FAILED;
6551
6552 if (escape == 0) /* The escape coded a single character */
6553 c = ec;
6554 else
6555 {
6556 if (escape == ESC_Q) /* Handle start of quoted string */
6557 {
6558 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
6559 ptr += 2; /* avoid empty string */
6560 else inescq = TRUE;
6561 continue;
6562 }
6563
6564 if (escape == ESC_E) continue; /* Perl ignores an orphan \E */
6565
6566 /* For metasequences that actually match a character, we disable the
6567 setting of a first character if it hasn't already been set. */
6568
6569 if (firstcharflags == REQ_UNSET && escape > ESC_b && escape < ESC_Z)
6570 firstcharflags = REQ_NONE;
6571
6572 /* Set values to reset to if this is followed by a zero repeat. */
6573
6574 zerofirstchar = firstchar;
6575 zerofirstcharflags = firstcharflags;
6576 zeroreqchar = reqchar;
6577 zeroreqcharflags = reqcharflags;
6578
6579 /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
6580 is a subroutine call by number (Oniguruma syntax). In fact, the value
6581 ESC_g is returned only for these cases. So we don't need to check for <
6582 or ' if the value is ESC_g. For the Perl syntax \g{n} the value is
6583 -n, and for the Perl syntax \g{name} the result is ESC_k (as
6584 that is a synonym for a named back reference). */
6585
6586 if (escape == ESC_g)
6587 {
6588 const pcre_uchar *p;
6589 save_hwm = cd->hwm; /* Normally this is set when '(' is read */
6590 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
6591 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6592
6593 /* These two statements stop the compiler from warning about possibly
6594 unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
6595 fact, because we actually check for a number below, the paths that
6596 would actually be in error are never taken. */
6597
6598 skipbytes = 0;
6599 reset_bracount = FALSE;
6600
6601 /* Test for a name */
6602
6603 if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
6604 {
6605 BOOL is_a_number = TRUE;
6606 for (p = ptr + 1; *p != CHAR_NULL && *p != (pcre_uchar)terminator; p++)
6607 {
6608 if (!MAX_255(*p)) { is_a_number = FALSE; break; }
6609 if ((cd->ctypes[*p] & ctype_digit) == 0) is_a_number = FALSE;
6610 if ((cd->ctypes[*p] & ctype_word) == 0) break;
6611 }
6612 if (*p != (pcre_uchar)terminator)
6613 {
6614 *errorcodeptr = ERR57;
6615 break;
6616 }
6617 if (is_a_number)
6618 {
6619 ptr++;
6620 goto HANDLE_NUMERICAL_RECURSION;
6621 }
6622 is_recurse = TRUE;
6623 goto NAMED_REF_OR_RECURSE;
6624 }
6625
6626 /* Test a signed number in angle brackets or quotes. */
6627
6628 p = ptr + 2;
6629 while (IS_DIGIT(*p)) p++;
6630 if (*p != (pcre_uchar)terminator)
6631 {
6632 *errorcodeptr = ERR57;
6633 break;
6634 }
6635 ptr++;
6636 goto HANDLE_NUMERICAL_RECURSION;
6637 }
6638