/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1359 - (show annotations)
Tue Sep 3 10:10:59 2013 UTC (6 years ago) by ph10
File MIME type: text/plain
File size: 272047 byte(s)
Refactor the code for creating the name/number table.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2013 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
57 is also used by pcretest. PCRE_DEBUG is not defined when building a production
58 library. We do not need to select pcre16_printint.c specially, because the
59 COMPILE_PCREx macro will already be appropriately set. */
60
61 #ifdef PCRE_DEBUG
62 /* pcre_printint.c should not include any headers */
63 #define PCRE_INCLUDED
64 #include "pcre_printint.c"
65 #undef PCRE_INCLUDED
66 #endif
67
68
/* Macro for setting individual bits in class bitmaps. Both the bitmap argument
and the whole expansion are parenthesized, so that the macro behaves as a
single expression whatever is passed to it, for example SETBIT(map + 32, c).
Note that b is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)&7)))
72
73 /* Maximum length value to check against when making sure that the integer that
74 holds the compiled pattern length does not overflow. We make it a bit less than
75 INT_MAX to allow for adding in group terminating bytes, so that we don't have
76 to check them every time. */
77
78 #define OFLOW_MAX (INT_MAX - 20) /* The 20 is headroom for group terminating bytes */
79
80 /* Definitions to allow mutual recursion */
81
82 static int
83 add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
84 const pcre_uint32 *, unsigned int);
85
86 static BOOL
87 compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
88 pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
89 compile_data *, int *);
90
91
92
93 /*************************************************
94 * Code parameters and static tables *
95 *************************************************/
96
97 /* This value specifies the size of stack workspace that is used during the
98 first pre-compile phase that determines how much memory is required. The regex
99 is partly compiled into this space, but the compiled parts are discarded as
100 soon as they can be, so that hopefully there will never be an overrun. The code
101 does, however, check for an overrun. The largest amount I've seen used is 218,
102 so this number is very generous.
103
104 The same workspace is used during the second, actual compile phase for
105 remembering forward references to groups so that they can be filled in at the
106 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
107 is 4 there is plenty of room for most patterns. However, the memory can get
108 filled up by repetitions of forward references, for example patterns like
109 /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
110 that the workspace is expanded using malloc() in this situation. The value
111 below is therefore a minimum, and we put a maximum on it for safety. The
112 minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
113 kicks in at the same number of forward references in all cases. */
114
115 #define COMPILE_WORK_SIZE (2048*LINK_SIZE) /* Minimum workspace size */
116 #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE) /* Ceiling enforced by expand_workspace() */
117
118 /* This value determines the size of the initial vector that is used for
119 remembering named groups during the pre-compile. It is allocated on the stack,
120 but if it is too small, it is expanded using malloc(), in a similar way to the
121 workspace. The value is the number of slots in the list. */
122
123 #define NAMED_GROUP_LIST_SIZE 20 /* Counted in slots, not bytes */
124
125 /* The overrun tests check for a slightly smaller size so that they detect the
126 overrun before it actually does run off the end of the data block. */
127
128 #define WORK_SIZE_SAFETY_MARGIN (100) /* expand_workspace() also refuses to grow by less than this */
129
/* Private flags added to firstchar and reqchar. */

#define REQ_CASELESS    (1 << 0)        /* Indicates caselessness */
#define REQ_VARY        (1 << 1)        /* Reqchar followed non-literal item */
/* Negative values for the firstchar and reqchar flags */
#define REQ_UNSET       (-2)
#define REQ_NONE        (-1)

/* Repeated character flags. The constant has type long; the suffix is written
as an upper case L because a lower case one is easily misread as the digit 1. */

#define UTF_LENGTH      0x10000000L     /* The char contains its length. */
141
142 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
143 are simple data values; negative values are for special things like \d and so
144 on. Zero means further processing is needed (for things like \x), or the escape
145 is invalid. */
146
147 #ifndef EBCDIC
148
149 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
150 in UTF-8 mode. */
151
152 static const short int escapes[] = { /* Indexed by (c - CHAR_0); see check_escape() */
153 0, 0, /* 0 1 */
154 0, 0, /* 2 3 */
155 0, 0, /* 4 5 */
156 0, 0, /* 6 7 */
157 0, 0, /* 8 9 */
158 CHAR_COLON, CHAR_SEMICOLON, /* : ; */
159 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, /* < = */
160 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK, /* > ? */
161 CHAR_COMMERCIAL_AT, -ESC_A, /* @ A */
162 -ESC_B, -ESC_C, /* B C */
163 -ESC_D, -ESC_E, /* D E */
164 0, -ESC_G, /* F G */
165 -ESC_H, 0, /* H I */
166 0, -ESC_K, /* J K */
167 0, 0, /* L M */
168 -ESC_N, 0, /* N O */
169 -ESC_P, -ESC_Q, /* P Q */
170 -ESC_R, -ESC_S, /* R S */
171 0, 0, /* T U */
172 -ESC_V, -ESC_W, /* V W */
173 -ESC_X, 0, /* X Y */
174 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, /* Z [ */
175 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, /* backslash ] */
176 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, /* ^ _ */
177 CHAR_GRAVE_ACCENT, 7, /* ` a */
178 -ESC_b, 0, /* b c */
179 -ESC_d, ESC_e, /* d e */
180 ESC_f, 0, /* f g */
181 -ESC_h, 0, /* h i */
182 0, -ESC_k, /* j k */
183 0, 0, /* l m */
184 ESC_n, 0, /* n o */
185 -ESC_p, 0, /* p q */
186 ESC_r, -ESC_s, /* r s */
187 ESC_tee, 0, /* t u */
188 -ESC_v, -ESC_w, /* v w */
189 0, 0, /* x y */
190 -ESC_z /* z */
191 };
192
193 #else
194
195 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
196
197 static const short int escapes[] = { /* Indexed by (c - 0x48); see check_escape() */
198 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
199 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
200 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
201 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
202 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
203 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
204 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
205 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
206 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
207 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p, /* NOTE(review): 'l' is a literal here but 0 in the ASCII table - confirm intended */
208 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
209 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
210 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
211 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
212 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
213 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
214 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
215 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
216 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
217 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
218 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
219 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
220 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
221 };
222 #endif
223
224
225 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
226 searched linearly. Put all the names into a single string, in order to reduce
227 the number of relocations when a shared library is dynamically linked. The
228 string is built from string macros so that it works in UTF-8 mode on EBCDIC
229 platforms. */
230
231 typedef struct verbitem {
232 int len; /* Length of verb name */
233 int op; /* Op when no arg, or -1 if arg mandatory */
234 int op_arg; /* Op when arg present, or -1 if not allowed */
235 } verbitem;
236
237 static const char verbnames[] = /* Names in the same order as the entries of verbs[] below */
238 "\0" /* Empty name is a shorthand for MARK */
239 STRING_MARK0
240 STRING_ACCEPT0
241 STRING_COMMIT0
242 STRING_F0
243 STRING_FAIL0
244 STRING_PRUNE0
245 STRING_SKIP0
246 STRING_THEN;
247
248 static const verbitem verbs[] = { /* { len, op, op_arg } - one entry per name in verbnames */
249 { 0, -1, OP_MARK }, /* empty name (MARK shorthand): argument mandatory */
250 { 4, -1, OP_MARK }, /* MARK: argument mandatory */
251 { 6, OP_ACCEPT, -1 }, /* ACCEPT: no argument allowed */
252 { 6, OP_COMMIT, -1 }, /* COMMIT: no argument allowed */
253 { 1, OP_FAIL, -1 }, /* F: no argument allowed */
254 { 4, OP_FAIL, -1 }, /* FAIL: no argument allowed */
255 { 5, OP_PRUNE, OP_PRUNE_ARG }, /* PRUNE: argument optional */
256 { 4, OP_SKIP, OP_SKIP_ARG }, /* SKIP: argument optional */
257 { 4, OP_THEN, OP_THEN_ARG } /* THEN: argument optional */
258 };
259
260 static const int verbcount = sizeof(verbs)/sizeof(verbitem); /* Number of entries in verbs[] */
261
262
263 /* Tables of names of POSIX character classes and their lengths. The names are
264 now all in a single string, to reduce the number of relocations when a shared
265 library is dynamically loaded. The list of lengths is terminated by a zero
266 length entry. The first three must be alpha, lower, upper, as this is assumed
267 for handling case independence. */
268
269 static const char posix_names[] =
270 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
271 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
272 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
273 STRING_word0 STRING_xdigit;
274
275 static const pcre_uint8 posix_name_lengths[] = { /* One length per name above, in the same order */
276 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; /* Twelve 5-letter names, word = 4, xdigit = 6, then the 0 terminator */
277
278 /* Table of class bit maps for each POSIX class. Each class is formed from a
279 base map, with an optional addition or removal of another map. Then, for some
280 classes, there is some additional tweaking: for [:blank:] the vertical space
281 characters are removed, and for [:alpha:] and [:alnum:] the underscore
282 character is removed. The triples in the table consist of the base map offset,
283 second map offset or -1 if no second map, and a non-negative value for map
284 addition or a negative value for map subtraction (if there are two maps). The
285 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
286 remove vertical space characters, 2 => remove underscore. */
287
288 static const int posix_class_maps[] = { /* Triples: base map, second map or -1, add/subtract sign and tweak code */
289 cbit_word, cbit_digit, -2, /* alpha - word minus digit, without underscore */
290 cbit_lower, -1, 0, /* lower */
291 cbit_upper, -1, 0, /* upper */
292 cbit_word, -1, 2, /* alnum - word without underscore */
293 cbit_print, cbit_cntrl, 0, /* ascii - print plus cntrl */
294 cbit_space, -1, 1, /* blank - a GNU extension; space without vertical space */
295 cbit_cntrl, -1, 0, /* cntrl */
296 cbit_digit, -1, 0, /* digit */
297 cbit_graph, -1, 0, /* graph */
298 cbit_print, -1, 0, /* print */
299 cbit_punct, -1, 0, /* punct */
300 cbit_space, -1, 0, /* space */
301 cbit_word, -1, 0, /* word - a Perl extension */
302 cbit_xdigit,-1, 0 /* xdigit */
303 };
304
305 /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
306 substitutes must be in the order of the names, defined above, and there are
307 both positive and negative cases. NULL means no substitute. */
308
309 #ifdef SUPPORT_UCP
310 static const pcre_uchar string_PNd[] = {
311 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
312 CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \P{Nd} */
313 static const pcre_uchar string_pNd[] = {
314 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
315 CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \p{Nd} */
316 static const pcre_uchar string_PXsp[] = {
317 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
318 CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \P{Xsp} */
319 static const pcre_uchar string_pXsp[] = {
320 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
321 CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \p{Xsp} */
322 static const pcre_uchar string_PXwd[] = {
323 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
324 CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \P{Xwd} */
325 static const pcre_uchar string_pXwd[] = {
326 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
327 CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \p{Xwd} */
328
329 static const pcre_uchar *substitutes[] = { /* Negated form first, then positive, for each of d, s, w */
330 string_PNd, /* \D */
331 string_pNd, /* \d */
332 string_PXsp, /* \S */ /* NOTE: Xsp is Perl space */
333 string_pXsp, /* \s */
334 string_PXwd, /* \W */
335 string_pXwd /* \w */
336 };
337
338 static const pcre_uchar string_pL[] = {
339 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
340 CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \p{L} */
341 static const pcre_uchar string_pLl[] = {
342 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
343 CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \p{Ll} */
344 static const pcre_uchar string_pLu[] = {
345 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
346 CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \p{Lu} */
347 static const pcre_uchar string_pXan[] = {
348 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
349 CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \p{Xan} */
350 static const pcre_uchar string_h[] = {
351 CHAR_BACKSLASH, CHAR_h, '\0' }; /* \h */
352 static const pcre_uchar string_pXps[] = {
353 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
354 CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \p{Xps} */
355 static const pcre_uchar string_PL[] = {
356 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
357 CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \P{L} */
358 static const pcre_uchar string_PLl[] = {
359 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
360 CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \P{Ll} */
361 static const pcre_uchar string_PLu[] = {
362 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
363 CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \P{Lu} */
364 static const pcre_uchar string_PXan[] = {
365 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
366 CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \P{Xan} */
367 static const pcre_uchar string_H[] = {
368 CHAR_BACKSLASH, CHAR_H, '\0' }; /* \H */
369 static const pcre_uchar string_PXps[] = {
370 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
371 CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \P{Xps} */
372
373 static const pcre_uchar *posix_substitutes[] = { /* 14 positive entries in posix_names order, then the 14 negated ones */
374 string_pL, /* alpha */
375 string_pLl, /* lower */
376 string_pLu, /* upper */
377 string_pXan, /* alnum */
378 NULL, /* ascii */
379 string_h, /* blank */
380 NULL, /* cntrl */
381 string_pNd, /* digit */
382 NULL, /* graph */
383 NULL, /* print */
384 NULL, /* punct */
385 string_pXps, /* space */ /* NOTE: Xps is POSIX space */
386 string_pXwd, /* word */
387 NULL, /* xdigit */
388 /* Negated cases */
389 string_PL, /* ^alpha */
390 string_PLl, /* ^lower */
391 string_PLu, /* ^upper */
392 string_PXan, /* ^alnum */
393 NULL, /* ^ascii */
394 string_H, /* ^blank */
395 NULL, /* ^cntrl */
396 string_PNd, /* ^digit */
397 NULL, /* ^graph */
398 NULL, /* ^print */
399 NULL, /* ^punct */
400 string_PXps, /* ^space */ /* NOTE: Xps is POSIX space */
401 string_PXwd, /* ^word */
402 NULL /* ^xdigit */
403 };
404 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *)) /* Number of entries in posix_substitutes[] */
405 #endif
406
407 #define STRING(a) # a /* Stringizes the argument exactly as written */
408 #define XSTRING(s) STRING(s) /* Macro-expands s first, so XSTRING(MAX_NAME_SIZE) gives the value as text */
409
410 /* The texts of compile-time error messages. These are "char *" because they
411 are passed to the outside world. Do not ever re-use any error number, because
412 they are documented. Always add a new error instead. Messages marked DEAD below
413 are no longer used. This used to be a table of strings, but in order to reduce
414 the number of relocations needed when a shared library is loaded dynamically,
415 it is now one long string. We cannot use a table of offsets, because the
416 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
417 simply count through to the one we want - this isn't a performance issue
418 because these strings are used only when there is a compilation error.
419
420 Each substring ends with \0 to insert a null character. This includes the final
421 substring, so that the whole string ends with \0\0, which can be detected when
422 counting through. */
423
424 static const char error_texts[] = /* Message n is the n-th \0-terminated substring; see find_error_text() */
425 "no error\0"
426 "\\ at end of pattern\0"
427 "\\c at end of pattern\0"
428 "unrecognized character follows \\\0"
429 "numbers out of order in {} quantifier\0"
430 /* 5 */
431 "number too big in {} quantifier\0"
432 "missing terminating ] for character class\0"
433 "invalid escape sequence in character class\0"
434 "range out of order in character class\0"
435 "nothing to repeat\0"
436 /* 10 */
437 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
438 "internal error: unexpected repeat\0"
439 "unrecognized character after (? or (?-\0"
440 "POSIX named classes are supported only within a class\0"
441 "missing )\0"
442 /* 15 */
443 "reference to non-existent subpattern\0"
444 "erroffset passed as NULL\0"
445 "unknown option bit(s) set\0"
446 "missing ) after comment\0"
447 "parentheses nested too deeply\0" /** DEAD **/
448 /* 20 */
449 "regular expression is too large\0"
450 "failed to get memory\0"
451 "unmatched parentheses\0"
452 "internal error: code overflow\0"
453 "unrecognized character after (?<\0"
454 /* 25 */
455 "lookbehind assertion is not fixed length\0"
456 "malformed number or name after (?(\0"
457 "conditional group contains more than two branches\0"
458 "assertion expected after (?(\0"
459 "(?R or (?[+-]digits must be followed by )\0"
460 /* 30 */
461 "unknown POSIX class name\0"
462 "POSIX collating elements are not supported\0"
463 "this version of PCRE is compiled without UTF support\0"
464 "spare error\0" /** DEAD **/
465 "character value in \\x{...} sequence is too large\0"
466 /* 35 */
467 "invalid condition (?(0)\0"
468 "\\C not allowed in lookbehind assertion\0"
469 "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
470 "number after (?C is > 255\0"
471 "closing ) for (?C expected\0"
472 /* 40 */
473 "recursive call could loop indefinitely\0"
474 "unrecognized character after (?P\0"
475 "syntax error in subpattern name (missing terminator)\0"
476 "two named subpatterns have the same name\0"
477 "invalid UTF-8 string\0"
478 /* 45 */
479 "support for \\P, \\p, and \\X has not been compiled\0"
480 "malformed \\P or \\p sequence\0"
481 "unknown property name after \\P or \\p\0"
482 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
483 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
484 /* 50 */
485 "repeated subpattern is too long\0" /** DEAD **/
486 "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
487 "internal error: overran compiling workspace\0"
488 "internal error: previously-checked referenced subpattern not found\0"
489 "DEFINE group contains more than one branch\0"
490 /* 55 */
491 "repeating a DEFINE group is not allowed\0" /** DEAD **/
492 "inconsistent NEWLINE options\0"
493 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
494 "a numbered reference must not be zero\0"
495 "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
496 /* 60 */
497 "(*VERB) not recognized or malformed\0"
498 "number is too big\0"
499 "subpattern name expected\0"
500 "digit expected after (?+\0"
501 "] is an invalid data character in JavaScript compatibility mode\0"
502 /* 65 */
503 "different names for subpatterns of the same number are not allowed\0"
504 "(*MARK) must have an argument\0"
505 "this version of PCRE is not compiled with Unicode property support\0"
506 "\\c must be followed by an ASCII character\0"
507 "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
508 /* 70 */
509 "internal error: unknown opcode in find_fixedlength()\0"
510 "\\N is not supported in a class\0"
511 "too many forward references\0"
512 "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
513 "invalid UTF-16 string\0"
514 /* 75 */
515 "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
516 "character value in \\u.... sequence is too large\0"
517 "invalid UTF-32 string\0"
518 "setting UTF is disabled by the application\0"
519 ; /* The explicit \0 above plus the literal's implicit terminator form the \0\0 end marker */
520
521 /* Table to identify digits and hex digits. This is used when compiling
522 patterns. Note that the tables in chartables are dependent on the locale, and
523 may mark arbitrary characters as digits - but the PCRE compiling code expects
524 to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
525 a private table here. It costs 256 bytes, but it is a lot faster than doing
526 character value tests (at least in some simple cases I timed), and in some
527 applications one wants PCRE to compile efficiently as well as match
528 efficiently.
529
530 For convenience, we use the same bit definitions as in chartables:
531
532 0x04 decimal digit
533 0x08 hexadecimal digit
534
535 Then we can use ctype_digit and ctype_xdigit in the code. */
536
537 /* Using a simple comparison for decimal numbers rather than a memory read
538 is much faster, and the resulting code is simpler (the compiler turns it
539 into a subtraction and unsigned comparison). */
540
541 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9) /* NB: evaluates x twice, so x must have no side effects */
542
543 #ifndef EBCDIC
544
545 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
546 UTF-8 mode. */
547
548 static const pcre_uint8 digitab[] = /* Indexed by character code; 0x0c = decimal and hex digit, 0x08 = hex digit only */
549 {
550 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
551 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
552 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
553 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
554 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
555 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
556 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
557 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
558 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
559 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
560 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
561 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
562 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
563 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
564 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
565 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
566 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
567 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
568 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
569 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
570 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
571 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
572 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
573 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
574 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
575 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
576 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
577 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
578 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
579 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
580 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
581 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
582
583 #else
584
585 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
586
587 static const pcre_uint8 digitab[] = /* Indexed by EBCDIC code; 0x0c = decimal and hex digit, 0x08 = hex digit only */
588 {
589 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
590 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
591 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
592 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
593 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
594 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
595 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
596 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
597 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
598 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
599 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
600 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
603 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
604 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
605 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
606 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
607 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
608 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
609 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
610 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
611 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
612 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
613 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
614 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
615 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
616 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
617 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
618 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
619 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
620 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
621
622 static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup; check_escape() masks entries with 0x0E to detect alphanumerics */
623 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
624 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
625 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
626 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
627 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
628 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
629 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
630 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
631 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
632 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
633 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
634 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
635 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
636 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
637 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
638 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
639 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
640 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
641 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
642 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
643 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
644 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
645 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
646 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
647 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
648 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
649 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
650 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
651 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
652 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
653 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
654 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
655 #endif
656
657
658
659 /*************************************************
660 * Find an error text *
661 *************************************************/
662
663 /* The error texts are now all in one long string, to save on relocations. As
664 some of the text is of unknown length, we can't use a table of offsets.
665 Instead, just count through the strings. This is not a performance issue
666 because it happens only when there has been a compilation error.
667
668 Argument: the error number
669 Returns: pointer to the error string
670 */
671
672 static const char *
673 find_error_text(int n)
674 {
675 const char *s = error_texts;
676 for (; n > 0; n--)
677 {
678 while (*s++ != CHAR_NULL) {};
679 if (*s == CHAR_NULL) return "Error text not found (please report)";
680 }
681 return s;
682 }
683
684
685 /*************************************************
686 * Expand the workspace *
687 *************************************************/
688
689 /* This function is called during the second compiling phase, if the number of
690 forward references fills the existing workspace, which is originally a block on
691 the stack. A larger block is obtained from malloc() unless the ultimate limit
692 has been reached or the increase will be rather small.
693
694 Argument: pointer to the compile data block
695 Returns: 0 if all went well, else an error number
696 */
697
698 static int
699 expand_workspace(compile_data *cd)
700 {
701 pcre_uchar *newspace;
702 int newsize = cd->workspace_size * 2;
703
704 if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
705 if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
706 newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
707 return ERR72;
708
709 newspace = (PUBL(malloc))(IN_UCHARS(newsize));
710 if (newspace == NULL) return ERR21;
711 memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
712 cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
713 if (cd->workspace_size > COMPILE_WORK_SIZE)
714 (PUBL(free))((void *)cd->start_workspace);
715 cd->start_workspace = newspace;
716 cd->workspace_size = newsize;
717 return 0;
718 }
719
720
721
722 /*************************************************
723 * Check for counted repeat *
724 *************************************************/
725
726 /* This function is called when a '{' is encountered in a place where it might
727 start a quantifier. It looks ahead to see if it really is a quantifier or not.
728 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
729 where the ddds are digits.
730
731 Arguments:
732 p pointer to the first char after '{'
733
734 Returns: TRUE or FALSE
735 */
736
737 static BOOL
738 is_counted_repeat(const pcre_uchar *p)
739 {
740 if (!IS_DIGIT(*p)) return FALSE;
741 p++;
742 while (IS_DIGIT(*p)) p++;
743 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
744
745 if (*p++ != CHAR_COMMA) return FALSE;
746 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
747
748 if (!IS_DIGIT(*p)) return FALSE;
749 p++;
750 while (IS_DIGIT(*p)) p++;
751
752 return (*p == CHAR_RIGHT_CURLY_BRACKET);
753 }
754
755
756
757 /*************************************************
758 * Handle escapes *
759 *************************************************/
760
761 /* This function is called when a \ has been encountered. It either returns a
762 positive value for a special escape such as \d, or 0 for a data character
763 which will be placed in chptr. A backreference to group n is returned as
764 negative n. When UTF-8 is enabled, a positive value greater than 255 may
765 be returned in chptr.
766 On entry, ptr is pointing at the \. On exit, it is on the final character of the
767 escape sequence.
768
769 Arguments:
770 ptrptr points to the pattern position pointer
771 chptr points to the data character
772 errorcodeptr points to the errorcode variable
773 bracount number of previous extracting brackets
774 options the options bits
775 isclass TRUE if inside a character class
776
777 Returns: zero => a data character
778 positive => a special escape sequence
779 negative => a back reference
780 on error, errorcodeptr is set
781 */
782
783 static int
784 check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
785 int bracount, int options, BOOL isclass)
786 {
787 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
788 BOOL utf = (options & PCRE_UTF8) != 0;
789 const pcre_uchar *ptr = *ptrptr + 1;
790 pcre_uint32 c;
791 int escape = 0;
792 int i;
793
794 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
795 ptr--; /* Set pointer back to the last byte */
796
797 /* If backslash is at the end of the pattern, it's an error. */
798
799 if (c == CHAR_NULL) *errorcodeptr = ERR1;
800
801 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
802 in a table. A non-zero result is something that can be returned immediately.
803 Otherwise further processing may be required. */
804
805 #ifndef EBCDIC /* ASCII/UTF-8 coding */
806 /* Not alphanumeric */
807 else if (c < CHAR_0 || c > CHAR_z) {}
808 else if ((i = escapes[c - CHAR_0]) != 0)
809 { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
810
811 #else /* EBCDIC coding */
812 /* Not alphanumeric */
813 else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
814 else if ((i = escapes[c - 0x48]) != 0) { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
815 #endif
816
817 /* Escapes that need further processing, or are illegal. */
818
819 else
820 {
821 const pcre_uchar *oldptr;
822 BOOL braced, negated, overflow;
823 int s;
824
825 switch (c)
826 {
827 /* A number of Perl escapes are not handled by PCRE. We give an explicit
828 error. */
829
830 case CHAR_l:
831 case CHAR_L:
832 *errorcodeptr = ERR37;
833 break;
834
835 case CHAR_u:
836 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
837 {
838 /* In JavaScript, \u must be followed by four hexadecimal numbers.
839 Otherwise it is a lowercase u letter. */
840 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
841 && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
842 && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
843 && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
844 {
845 c = 0;
846 for (i = 0; i < 4; ++i)
847 {
848 register pcre_uint32 cc = *(++ptr);
849 #ifndef EBCDIC /* ASCII/UTF-8 coding */
850 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
851 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
852 #else /* EBCDIC coding */
853 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
854 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
855 #endif
856 }
857
858 #if defined COMPILE_PCRE8
859 if (c > (utf ? 0x10ffffU : 0xffU))
860 #elif defined COMPILE_PCRE16
861 if (c > (utf ? 0x10ffffU : 0xffffU))
862 #elif defined COMPILE_PCRE32
863 if (utf && c > 0x10ffffU)
864 #endif
865 {
866 *errorcodeptr = ERR76;
867 }
868 else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
869 }
870 }
871 else
872 *errorcodeptr = ERR37;
873 break;
874
875 case CHAR_U:
876 /* In JavaScript, \U is an uppercase U letter. */
877 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
878 break;
879
880 /* In a character class, \g is just a literal "g". Outside a character
881 class, \g must be followed by one of a number of specific things:
882
883 (1) A number, either plain or braced. If positive, it is an absolute
884 backreference. If negative, it is a relative backreference. This is a Perl
885 5.10 feature.
886
887 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
888 is part of Perl's movement towards a unified syntax for back references. As
889 this is synonymous with \k{name}, we fudge it up by pretending it really
890 was \k.
891
892 (3) For Oniguruma compatibility we also support \g followed by a name or a
893 number either in angle brackets or in single quotes. However, these are
894 (possibly recursive) subroutine calls, _not_ backreferences. Just return
895 the ESC_g code (cf \k). */
896
897 case CHAR_g:
898 if (isclass) break;
899 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
900 {
901 escape = ESC_g;
902 break;
903 }
904
905 /* Handle the Perl-compatible cases */
906
907 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
908 {
909 const pcre_uchar *p;
910 for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
911 if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
912 if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
913 {
914 escape = ESC_k;
915 break;
916 }
917 braced = TRUE;
918 ptr++;
919 }
920 else braced = FALSE;
921
922 if (ptr[1] == CHAR_MINUS)
923 {
924 negated = TRUE;
925 ptr++;
926 }
927 else negated = FALSE;
928
929 /* The integer range is limited by the machine's int representation. */
930 s = 0;
931 overflow = FALSE;
932 while (IS_DIGIT(ptr[1]))
933 {
934 if (s > INT_MAX / 10 - 1) /* Integer overflow */
935 {
936 overflow = TRUE;
937 break;
938 }
939 s = s * 10 + (int)(*(++ptr) - CHAR_0);
940 }
941 if (overflow) /* Integer overflow */
942 {
943 while (IS_DIGIT(ptr[1]))
944 ptr++;
945 *errorcodeptr = ERR61;
946 break;
947 }
948
949 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
950 {
951 *errorcodeptr = ERR57;
952 break;
953 }
954
955 if (s == 0)
956 {
957 *errorcodeptr = ERR58;
958 break;
959 }
960
961 if (negated)
962 {
963 if (s > bracount)
964 {
965 *errorcodeptr = ERR15;
966 break;
967 }
968 s = bracount - (s - 1);
969 }
970
971 escape = -s;
972 break;
973
974 /* The handling of escape sequences consisting of a string of digits
975 starting with one that is not zero is not straightforward. By experiment,
976 the way Perl works seems to be as follows:
977
978 Outside a character class, the digits are read as a decimal number. If the
979 number is less than 10, or if there are that many previous extracting
980 left brackets, then it is a back reference. Otherwise, up to three octal
981 digits are read to form an escaped byte. Thus \123 is likely to be octal
982 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
983 value is greater than 377, the least significant 8 bits are taken. Inside a
984 character class, \ followed by a digit is always an octal number. */
985
986 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
987 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
988
989 if (!isclass)
990 {
991 oldptr = ptr;
992 /* The integer range is limited by the machine's int representation. */
993 s = (int)(c -CHAR_0);
994 overflow = FALSE;
995 while (IS_DIGIT(ptr[1]))
996 {
997 if (s > INT_MAX / 10 - 1) /* Integer overflow */
998 {
999 overflow = TRUE;
1000 break;
1001 }
1002 s = s * 10 + (int)(*(++ptr) - CHAR_0);
1003 }
1004 if (overflow) /* Integer overflow */
1005 {
1006 while (IS_DIGIT(ptr[1]))
1007 ptr++;
1008 *errorcodeptr = ERR61;
1009 break;
1010 }
1011 if (s < 10 || s <= bracount)
1012 {
1013 escape = -s;
1014 break;
1015 }
1016 ptr = oldptr; /* Put the pointer back and fall through */
1017 }
1018
1019 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
1020 generates a binary zero byte and treats the digit as a following literal.
1021 Thus we have to pull back the pointer by one. */
1022
1023 if ((c = *ptr) >= CHAR_8)
1024 {
1025 ptr--;
1026 c = 0;
1027 break;
1028 }
1029
1030 /* \0 always starts an octal number, but we may drop through to here with a
1031 larger first octal digit. The original code used just to take the least
1032 significant 8 bits of octal numbers (I think this is what early Perls used
1033 to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1034 but no more than 3 octal digits. */
1035
1036 case CHAR_0:
1037 c -= CHAR_0;
1038 while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1039 c = c * 8 + *(++ptr) - CHAR_0;
1040 #ifdef COMPILE_PCRE8
1041 if (!utf && c > 0xff) *errorcodeptr = ERR51;
1042 #endif
1043 break;
1044
1045 /* \x is complicated. \x{ddd} is a character number which can be greater
1046 than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.
1047 If not, { is treated as a data character. */
1048
1049 case CHAR_x:
1050 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1051 {
1052 /* In JavaScript, \x must be followed by two hexadecimal numbers.
1053 Otherwise it is a lowercase x letter. */
1054 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1055 && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1056 {
1057 c = 0;
1058 for (i = 0; i < 2; ++i)
1059 {
1060 register pcre_uint32 cc = *(++ptr);
1061 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1062 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1063 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1064 #else /* EBCDIC coding */
1065 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1066 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1067 #endif
1068 }
1069 }
1070 break;
1071 }
1072
1073 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1074 {
1075 const pcre_uchar *pt = ptr + 2;
1076
1077 c = 0;
1078 overflow = FALSE;
1079 while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)
1080 {
1081 register pcre_uint32 cc = *pt++;
1082 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1083
1084 #ifdef COMPILE_PCRE32
1085 if (c >= 0x10000000l) { overflow = TRUE; break; }
1086 #endif
1087
1088 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1089 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1090 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1091 #else /* EBCDIC coding */
1092 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1093 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1094 #endif
1095
1096 #if defined COMPILE_PCRE8
1097 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1098 #elif defined COMPILE_PCRE16
1099 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1100 #elif defined COMPILE_PCRE32
1101 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1102 #endif
1103 }
1104
1105 if (overflow)
1106 {
1107 while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++;
1108 *errorcodeptr = ERR34;
1109 }
1110
1111 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
1112 {
1113 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1114 ptr = pt;
1115 break;
1116 }
1117
1118 /* If the sequence of hex digits does not end with '}', then we don't
1119 recognize this construct; fall through to the normal \x handling. */
1120 }
1121
1122 /* Read just a single-byte hex-defined char */
1123
1124 c = 0;
1125 while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1126 {
1127 pcre_uint32 cc; /* Some compilers don't like */
1128 cc = *(++ptr); /* ++ in initializers */
1129 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1130 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1131 c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1132 #else /* EBCDIC coding */
1133 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
1134 c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1135 #endif
1136 }
1137 break;
1138
1139 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1140 An error is given if the byte following \c is not an ASCII character. This
1141 coding is ASCII-specific, but then the whole concept of \cx is
1142 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1143
1144 case CHAR_c:
1145 c = *(++ptr);
1146 if (c == CHAR_NULL)
1147 {
1148 *errorcodeptr = ERR2;
1149 break;
1150 }
1151 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1152 if (c > 127) /* Excludes all non-ASCII in either mode */
1153 {
1154 *errorcodeptr = ERR68;
1155 break;
1156 }
1157 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1158 c ^= 0x40;
1159 #else /* EBCDIC coding */
1160 if (c >= CHAR_a && c <= CHAR_z) c += 64;
1161 c ^= 0xC0;
1162 #endif
1163 break;
1164
1165 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1166 other alphanumeric following \ is an error if PCRE_EXTRA was set;
1167 otherwise, for Perl compatibility, it is a literal. This code looks a bit
1168 odd, but there used to be some cases other than the default, and there may
1169 be again in future, so I haven't "optimized" it. */
1170
1171 default:
1172 if ((options & PCRE_EXTRA) != 0) switch(c)
1173 {
1174 default:
1175 *errorcodeptr = ERR3;
1176 break;
1177 }
1178 break;
1179 }
1180 }
1181
1182 /* Perl supports \N{name} for character names, as well as plain \N for "not
1183 newline". PCRE does not support \N{name}. However, it does support
1184 quantification such as \N{2,3}. */
1185
1186 if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1187 !is_counted_repeat(ptr+2))
1188 *errorcodeptr = ERR37;
1189
1190 /* If PCRE_UCP is set, we change the values for \d etc. */
1191
1192 if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
1193 escape += (ESC_DU - ESC_D);
1194
1195 /* Set the pointer to the final character before returning. */
1196
1197 *ptrptr = ptr;
1198 *chptr = c;
1199 return escape;
1200 }
1201
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Parse the property name that follows \P or \p and look it up in the
property table. This function is compiled only when Unicode property support
is enabled. On entry, ptrptr is pointing at the P or p. On exit, it is pointing
at the final character of the escape sequence, or at the character where
parsing stopped if the sequence is malformed.

Argument:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  ptypeptr       points to an unsigned int that is set to the type value
  pdataptr       points to an unsigned int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         TRUE if the type value was found, or FALSE for an invalid type
                 (ERR46 for a malformed sequence, ERR47 for an unknown name)
*/

static BOOL
get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
  unsigned int *pdataptr, int *errorcodeptr)
{
const pcre_uchar *cursor = *ptrptr + 1;
pcre_uchar ch = *cursor;
pcre_uchar propname[32];
int lo, hi;

/* Nothing at all after the P or p. Note that *negptr is left untouched on
this path. */

if (ch == CHAR_NULL) goto MALFORMED;

*negptr = FALSE;

/* The short form: exactly one character names the property. */

if (ch != CHAR_LEFT_CURLY_BRACKET)
  {
  propname[0] = ch;
  propname[1] = 0;
  }

/* The braced form: {name} or {^name}, the circumflex requesting negation. The
name must fit in the buffer with room for its terminating zero. */

else
  {
  int len = 0;
  const int maxlen = (int)(sizeof(propname) / sizeof(pcre_uchar)) - 1;

  if (cursor[1] == CHAR_CIRCUMFLEX_ACCENT)
    {
    *negptr = TRUE;
    cursor++;
    }

  while (len < maxlen)
    {
    ch = *(++cursor);
    if (ch == CHAR_NULL || ch == CHAR_RIGHT_CURLY_BRACKET) break;
    propname[len++] = ch;
    }

  /* Stopping on anything but '}' means the pattern ended or the name was too
  long. */

  if (ch != CHAR_RIGHT_CURLY_BRACKET) goto MALFORMED;
  propname[len] = 0;
  }

*ptrptr = cursor;

/* Binary chop over the property table, comparing names. */

lo = 0;
hi = PRIV(utt_size);

while (lo < hi)
  {
  const int mid = (lo + hi) >> 1;
  const int cmp =
    STRCMP_UC_C8(propname, PRIV(utt_names) + PRIV(utt)[mid].name_offset);

  if (cmp > 0) lo = mid + 1;
  else if (cmp < 0) hi = mid;
  else
    {
    *ptypeptr = PRIV(utt)[mid].type;
    *pdataptr = PRIV(utt)[mid].value;
    return TRUE;
    }
  }

/* Well-formed, but not a name we know. */

*errorcodeptr = ERR47;
return FALSE;

MALFORMED:
*errorcodeptr = ERR46;
*ptrptr = cursor;
return FALSE;
}
#endif
1296
1297
1298
1299
1300 /*************************************************
1301 * Read repeat counts *
1302 *************************************************/
1303
1304 /* Read an item of the form {n,m} and return the values. This is called only
1305 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1306 so the syntax is guaranteed to be correct, but we need to check the values.
1307
1308 Arguments:
1309 p pointer to first char after '{'
1310 minp pointer to int for min
1311 maxp pointer to int for max
1312 returned as -1 if no max
1313 errorcodeptr points to error code variable
1314
1315 Returns: pointer to '}' on success;
1316 current ptr on error, with errorcodeptr set non-zero
1317 */
1318
1319 static const pcre_uchar *
1320 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1321 {
1322 int min = 0;
1323 int max = -1;
1324
1325 /* Read the minimum value and do a paranoid check: a negative value indicates
1326 an integer overflow. */
1327
1328 while (IS_DIGIT(*p)) min = min * 10 + (int)(*p++ - CHAR_0);
1329 if (min < 0 || min > 65535)
1330 {
1331 *errorcodeptr = ERR5;
1332 return p;
1333 }
1334
1335 /* Read the maximum value if there is one, and again do a paranoid on its size.
1336 Also, max must not be less than min. */
1337
1338 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1339 {
1340 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1341 {
1342 max = 0;
1343 while(IS_DIGIT(*p)) max = max * 10 + (int)(*p++ - CHAR_0);
1344 if (max < 0 || max > 65535)
1345 {
1346 *errorcodeptr = ERR5;
1347 return p;
1348 }
1349 if (max < min)
1350 {
1351 *errorcodeptr = ERR4;
1352 return p;
1353 }
1354 }
1355 }
1356
1357 /* Fill in the required variables, and pass back the pointer to the terminating
1358 '}'. */
1359
1360 *minp = min;
1361 *maxp = max;
1362 return p;
1363 }
1364
1365
1366
1367 /*************************************************
1368 * Find first significant op code *
1369 *************************************************/
1370
1371 /* This is called by several functions that scan a compiled expression looking
1372 for a fixed first character, or an anchoring op code etc. It skips over things
1373 that do not influence this. For some calls, it makes sense to skip negative
1374 forward and all backward assertions, and also the \b assertion; for others it
1375 does not.
1376
1377 Arguments:
1378 code pointer to the start of the group
1379 skipassert TRUE if certain assertions are to be skipped
1380
1381 Returns: pointer to the first significant opcode
1382 */
1383
1384 static const pcre_uchar*
1385 first_significant_code(const pcre_uchar *code, BOOL skipassert)
1386 {
1387 for (;;)
1388 {
1389 switch ((int)*code)
1390 {
1391 case OP_ASSERT_NOT:
1392 case OP_ASSERTBACK:
1393 case OP_ASSERTBACK_NOT:
1394 if (!skipassert) return code;
1395 do code += GET(code, 1); while (*code == OP_ALT);
1396 code += PRIV(OP_lengths)[*code];
1397 break;
1398
1399 case OP_WORD_BOUNDARY:
1400 case OP_NOT_WORD_BOUNDARY:
1401 if (!skipassert) return code;
1402 /* Fall through */
1403
1404 case OP_CALLOUT:
1405 case OP_CREF:
1406 case OP_NCREF:
1407 case OP_RREF:
1408 case OP_NRREF:
1409 case OP_DEF:
1410 code += PRIV(OP_lengths)[*code];
1411 break;
1412
1413 default:
1414 return code;
1415 }
1416 }
1417 /* Control never reaches here */
1418 }
1419
1420
1421
1422
1423 /*************************************************
1424 * Find the fixed length of a branch *
1425 *************************************************/
1426
1427 /* Scan a branch and compute the fixed length of subject that will match it,
1428 if the length is fixed. This is needed for dealing with backward assertions.
1429 In UTF8 mode, the result is in characters rather than bytes. The branch is
1430 temporarily terminated with OP_END when this function is called.
1431
1432 This function is called when a backward assertion is encountered, so that if it
1433 fails, the error message can point to the correct place in the pattern.
1434 However, we cannot do this when the assertion contains subroutine calls,
1435 because they can be forward references. We solve this by remembering this case
1436 and doing the check at the end; a flag specifies which mode we are running in.
1437
1438 Arguments:
1439 code points to the start of the pattern (the bracket)
1440 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
1441 atend TRUE if called when the pattern is complete
1442 cd the "compile data" structure
1443
1444 Returns: the fixed length,
1445 or -1 if there is no fixed length,
1446 or -2 if \C was encountered (in UTF-8 mode only)
1447 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1448 or -4 if an unknown opcode was encountered (internal error)
1449 */
1450
1451 static int
1452 find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
1453 {
1454 int length = -1;
1455
1456 register int branchlength = 0;
1457 register pcre_uchar *cc = code + 1 + LINK_SIZE;
1458
1459 /* Scan along the opcodes for this branch. If we get to the end of the
1460 branch, check the length against that of the other branches. */
1461
1462 for (;;)
1463 {
1464 int d;
1465 pcre_uchar *ce, *cs;
1466 register pcre_uchar op = *cc;
1467
1468 switch (op)
1469 {
1470 /* We only need to continue for OP_CBRA (normal capturing bracket) and
1471 OP_BRA (normal non-capturing bracket) because the other variants of these
1472 opcodes are all concerned with unlimited repeated groups, which of course
1473 are not of fixed length. */
1474
1475 case OP_CBRA:
1476 case OP_BRA:
1477 case OP_ONCE:
1478 case OP_ONCE_NC:
1479 case OP_COND:
1480 d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
1481 if (d < 0) return d;
1482 branchlength += d;
1483 do cc += GET(cc, 1); while (*cc == OP_ALT);
1484 cc += 1 + LINK_SIZE;
1485 break;
1486
1487 /* Reached end of a branch; if it's a ket it is the end of a nested call.
1488 If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1489 an ALT. If it is END it's the end of the outer call. All can be handled by
1490 the same code. Note that we must not include the OP_KETRxxx opcodes here,
1491 because they all imply an unlimited repeat. */
1492
1493 case OP_ALT:
1494 case OP_KET:
1495 case OP_END:
1496 case OP_ACCEPT:
1497 case OP_ASSERT_ACCEPT:
1498 if (length < 0) length = branchlength;
1499 else if (length != branchlength) return -1;
1500 if (*cc != OP_ALT) return length;
1501 cc += 1 + LINK_SIZE;
1502 branchlength = 0;
1503 break;
1504
1505 /* A true recursion implies not fixed length, but a subroutine call may
1506 be OK. If the subroutine is a forward reference, we can't deal with
1507 it until the end of the pattern, so return -3. */
1508
1509 case OP_RECURSE:
1510 if (!atend) return -3;
1511 cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1512 do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1513 if (cc > cs && cc < ce) return -1; /* Recursion */
1514 d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
1515 if (d < 0) return d;
1516 branchlength += d;
1517 cc += 1 + LINK_SIZE;
1518 break;
1519
1520 /* Skip over assertive subpatterns */
1521
1522 case OP_ASSERT:
1523 case OP_ASSERT_NOT:
1524 case OP_ASSERTBACK:
1525 case OP_ASSERTBACK_NOT:
1526 do cc += GET(cc, 1); while (*cc == OP_ALT);
1527 cc += PRIV(OP_lengths)[*cc];
1528 break;
1529
1530 /* Skip over things that don't match chars */
1531
1532 case OP_MARK:
1533 case OP_PRUNE_ARG:
1534 case OP_SKIP_ARG:
1535 case OP_THEN_ARG:
1536 cc += cc[1] + PRIV(OP_lengths)[*cc];
1537 break;
1538
1539 case OP_CALLOUT:
1540 case OP_CIRC:
1541 case OP_CIRCM:
1542 case OP_CLOSE:
1543 case OP_COMMIT:
1544 case OP_CREF:
1545 case OP_DEF:
1546 case OP_DOLL:
1547 case OP_DOLLM:
1548 case OP_EOD:
1549 case OP_EODN:
1550 case OP_FAIL:
1551 case OP_NCREF:
1552 case OP_NRREF:
1553 case OP_NOT_WORD_BOUNDARY:
1554 case OP_PRUNE:
1555 case OP_REVERSE:
1556 case OP_RREF:
1557 case OP_SET_SOM:
1558 case OP_SKIP:
1559 case OP_SOD:
1560 case OP_SOM:
1561 case OP_THEN:
1562 case OP_WORD_BOUNDARY:
1563 cc += PRIV(OP_lengths)[*cc];
1564 break;
1565
1566 /* Handle literal characters */
1567
1568 case OP_CHAR:
1569 case OP_CHARI:
1570 case OP_NOT:
1571 case OP_NOTI:
1572 branchlength++;
1573 cc += 2;
1574 #ifdef SUPPORT_UTF
1575 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1576 #endif
1577 break;
1578
1579 /* Handle exact repetitions. The count is already in characters, but we
1580 need to skip over a multibyte character in UTF8 mode. */
1581
1582 case OP_EXACT:
1583 case OP_EXACTI:
1584 case OP_NOTEXACT:
1585 case OP_NOTEXACTI:
1586 branchlength += (int)GET2(cc,1);
1587 cc += 2 + IMM2_SIZE;
1588 #ifdef SUPPORT_UTF
1589 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1590 #endif
1591 break;
1592
1593 case OP_TYPEEXACT:
1594 branchlength += GET2(cc,1);
1595 if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1596 cc += 2;
1597 cc += 1 + IMM2_SIZE + 1;
1598 break;
1599
1600 /* Handle single-char matchers */
1601
1602 case OP_PROP:
1603 case OP_NOTPROP:
1604 cc += 2;
1605 /* Fall through */
1606
1607 case OP_HSPACE:
1608 case OP_VSPACE:
1609 case OP_NOT_HSPACE:
1610 case OP_NOT_VSPACE:
1611 case OP_NOT_DIGIT:
1612 case OP_DIGIT:
1613 case OP_NOT_WHITESPACE:
1614 case OP_WHITESPACE:
1615 case OP_NOT_WORDCHAR:
1616 case OP_WORDCHAR:
1617 case OP_ANY:
1618 case OP_ALLANY:
1619 branchlength++;
1620 cc++;
1621 break;
1622
1623 /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1624 otherwise \C is coded as OP_ALLANY. */
1625
1626 case OP_ANYBYTE:
1627 return -2;
1628
1629 /* Check a class for variable quantification */
1630
1631 case OP_CLASS:
1632 case OP_NCLASS:
1633 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1634 case OP_XCLASS:
1635 /* The original code caused an unsigned overflow in 64 bit systems,
1636 so now we use a conditional statement. */
1637 if (op == OP_XCLASS)
1638 cc += GET(cc, 1);
1639 else
1640 cc += PRIV(OP_lengths)[OP_CLASS];
1641 #else
1642 cc += PRIV(OP_lengths)[OP_CLASS];
1643 #endif
1644
1645 switch (*cc)
1646 {
1647 case OP_CRPLUS:
1648 case OP_CRMINPLUS:
1649 case OP_CRSTAR:
1650 case OP_CRMINSTAR:
1651 case OP_CRQUERY:
1652 case OP_CRMINQUERY:
1653 return -1;
1654
1655 case OP_CRRANGE:
1656 case OP_CRMINRANGE:
1657 if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1658 branchlength += (int)GET2(cc,1);
1659 cc += 1 + 2 * IMM2_SIZE;
1660 break;
1661
1662 default:
1663 branchlength++;
1664 }
1665 break;
1666
1667 /* Anything else is variable length */
1668
1669 case OP_ANYNL:
1670 case OP_BRAMINZERO:
1671 case OP_BRAPOS:
1672 case OP_BRAPOSZERO:
1673 case OP_BRAZERO:
1674 case OP_CBRAPOS:
1675 case OP_EXTUNI:
1676 case OP_KETRMAX:
1677 case OP_KETRMIN:
1678 case OP_KETRPOS:
1679 case OP_MINPLUS:
1680 case OP_MINPLUSI:
1681 case OP_MINQUERY:
1682 case OP_MINQUERYI:
1683 case OP_MINSTAR:
1684 case OP_MINSTARI:
1685 case OP_MINUPTO:
1686 case OP_MINUPTOI:
1687 case OP_NOTMINPLUS:
1688 case OP_NOTMINPLUSI:
1689 case OP_NOTMINQUERY:
1690 case OP_NOTMINQUERYI:
1691 case OP_NOTMINSTAR:
1692 case OP_NOTMINSTARI:
1693 case OP_NOTMINUPTO:
1694 case OP_NOTMINUPTOI:
1695 case OP_NOTPLUS:
1696 case OP_NOTPLUSI:
1697 case OP_NOTPOSPLUS:
1698 case OP_NOTPOSPLUSI:
1699 case OP_NOTPOSQUERY:
1700 case OP_NOTPOSQUERYI:
1701 case OP_NOTPOSSTAR:
1702 case OP_NOTPOSSTARI:
1703 case OP_NOTPOSUPTO:
1704 case OP_NOTPOSUPTOI:
1705 case OP_NOTQUERY:
1706 case OP_NOTQUERYI:
1707 case OP_NOTSTAR:
1708 case OP_NOTSTARI:
1709 case OP_NOTUPTO:
1710 case OP_NOTUPTOI:
1711 case OP_PLUS:
1712 case OP_PLUSI:
1713 case OP_POSPLUS:
1714 case OP_POSPLUSI:
1715 case OP_POSQUERY:
1716 case OP_POSQUERYI:
1717 case OP_POSSTAR:
1718 case OP_POSSTARI:
1719 case OP_POSUPTO:
1720 case OP_POSUPTOI:
1721 case OP_QUERY:
1722 case OP_QUERYI:
1723 case OP_REF:
1724 case OP_REFI:
1725 case OP_SBRA:
1726 case OP_SBRAPOS:
1727 case OP_SCBRA:
1728 case OP_SCBRAPOS:
1729 case OP_SCOND:
1730 case OP_SKIPZERO:
1731 case OP_STAR:
1732 case OP_STARI:
1733 case OP_TYPEMINPLUS:
1734 case OP_TYPEMINQUERY:
1735 case OP_TYPEMINSTAR:
1736 case OP_TYPEMINUPTO:
1737 case OP_TYPEPLUS:
1738 case OP_TYPEPOSPLUS:
1739 case OP_TYPEPOSQUERY:
1740 case OP_TYPEPOSSTAR:
1741 case OP_TYPEPOSUPTO:
1742 case OP_TYPEQUERY:
1743 case OP_TYPESTAR:
1744 case OP_TYPEUPTO:
1745 case OP_UPTO:
1746 case OP_UPTOI:
1747 return -1;
1748
1749 /* Catch unrecognized opcodes so that when new ones are added they
1750 are not forgotten, as has happened in the past. */
1751
1752 default:
1753 return -4;
1754 }
1755 }
1756 /* Control never gets here */
1757 }
1758
1759
1760
1761
1762 /*************************************************
1763 * Scan compiled regex for specific bracket *
1764 *************************************************/
1765
1766 /* This little function scans through a compiled pattern until it finds a
1767 capturing bracket with the given number, or, if the number is negative, an
1768 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1769 so that it can be called from pcre_study() when finding the minimum matching
1770 length.
1771
1772 Arguments:
1773 code points to start of expression
1774 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
1775 number the required bracket number or negative to find a lookbehind
1776
1777 Returns: pointer to the opcode for the bracket, or NULL if not found
1778 */
1779
1780 const pcre_uchar *
1781 PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
1782 {
1783 for (;;)
1784 {
1785 register pcre_uchar c = *code;
1786
1787 if (c == OP_END) return NULL;
1788
1789 /* XCLASS is used for classes that cannot be represented just by a bit
1790 map. This includes negated single high-valued characters. The length in
1791 the table is zero; the actual length is stored in the compiled code. */
1792
1793 if (c == OP_XCLASS) code += GET(code, 1);
1794
1795 /* Handle recursion */
1796
1797 else if (c == OP_REVERSE)
1798 {
1799 if (number < 0) return (pcre_uchar *)code;
1800 code += PRIV(OP_lengths)[c];
1801 }
1802
1803 /* Handle capturing bracket */
1804
1805 else if (c == OP_CBRA || c == OP_SCBRA ||
1806 c == OP_CBRAPOS || c == OP_SCBRAPOS)
1807 {
1808 int n = (int)GET2(code, 1+LINK_SIZE);
1809 if (n == number) return (pcre_uchar *)code;
1810 code += PRIV(OP_lengths)[c];
1811 }
1812
1813 /* Otherwise, we can get the item's length from the table, except that for
1814 repeated character types, we have to test for \p and \P, which have an extra
1815 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1816 must add in its length. */
1817
1818 else
1819 {
1820 switch(c)
1821 {
1822 case OP_TYPESTAR:
1823 case OP_TYPEMINSTAR:
1824 case OP_TYPEPLUS:
1825 case OP_TYPEMINPLUS:
1826 case OP_TYPEQUERY:
1827 case OP_TYPEMINQUERY:
1828 case OP_TYPEPOSSTAR:
1829 case OP_TYPEPOSPLUS:
1830 case OP_TYPEPOSQUERY:
1831 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1832 break;
1833
1834 case OP_TYPEUPTO:
1835 case OP_TYPEMINUPTO:
1836 case OP_TYPEEXACT:
1837 case OP_TYPEPOSUPTO:
1838 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
1839 code += 2;
1840 break;
1841
1842 case OP_MARK:
1843 case OP_PRUNE_ARG:
1844 case OP_SKIP_ARG:
1845 case OP_THEN_ARG:
1846 code += code[1];
1847 break;
1848 }
1849
1850 /* Add in the fixed length from the table */
1851
1852 code += PRIV(OP_lengths)[c];
1853
1854 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1855 a multi-byte character. The length in the table is a minimum, so we have to
1856 arrange to skip the extra bytes. */
1857
1858 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
1859 if (utf) switch(c)
1860 {
1861 case OP_CHAR:
1862 case OP_CHARI:
1863 case OP_EXACT:
1864 case OP_EXACTI:
1865 case OP_UPTO:
1866 case OP_UPTOI:
1867 case OP_MINUPTO:
1868 case OP_MINUPTOI:
1869 case OP_POSUPTO:
1870 case OP_POSUPTOI:
1871 case OP_STAR:
1872 case OP_STARI:
1873 case OP_MINSTAR:
1874 case OP_MINSTARI:
1875 case OP_POSSTAR:
1876 case OP_POSSTARI:
1877 case OP_PLUS:
1878 case OP_PLUSI:
1879 case OP_MINPLUS:
1880 case OP_MINPLUSI:
1881 case OP_POSPLUS:
1882 case OP_POSPLUSI:
1883 case OP_QUERY:
1884 case OP_QUERYI:
1885 case OP_MINQUERY:
1886 case OP_MINQUERYI:
1887 case OP_POSQUERY:
1888 case OP_POSQUERYI:
1889 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
1890 break;
1891 }
1892 #else
1893 (void)(utf); /* Keep compiler happy by referencing function argument */
1894 #endif
1895 }
1896 }
1897 }
1898
1899
1900
1901 /*************************************************
1902 * Scan compiled regex for recursion reference *
1903 *************************************************/
1904
1905 /* This little function scans through a compiled pattern until it finds an
1906 instance of OP_RECURSE.
1907
1908 Arguments:
1909 code points to start of expression
1910 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
1911
1912 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1913 */
1914
1915 static const pcre_uchar *
1916 find_recurse(const pcre_uchar *code, BOOL utf)
1917 {
1918 for (;;)
1919 {
1920 register pcre_uchar c = *code;
1921 if (c == OP_END) return NULL;
1922 if (c == OP_RECURSE) return code;
1923
1924 /* XCLASS is used for classes that cannot be represented just by a bit
1925 map. This includes negated single high-valued characters. The length in
1926 the table is zero; the actual length is stored in the compiled code. */
1927
1928 if (c == OP_XCLASS) code += GET(code, 1);
1929
1930 /* Otherwise, we can get the item's length from the table, except that for
1931 repeated character types, we have to test for \p and \P, which have an extra
1932 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1933 must add in its length. */
1934
1935 else
1936 {
1937 switch(c)
1938 {
1939 case OP_TYPESTAR:
1940 case OP_TYPEMINSTAR:
1941 case OP_TYPEPLUS:
1942 case OP_TYPEMINPLUS:
1943 case OP_TYPEQUERY:
1944 case OP_TYPEMINQUERY:
1945 case OP_TYPEPOSSTAR:
1946 case OP_TYPEPOSPLUS:
1947 case OP_TYPEPOSQUERY:
1948 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1949 break;
1950
1951 case OP_TYPEPOSUPTO:
1952 case OP_TYPEUPTO:
1953 case OP_TYPEMINUPTO:
1954 case OP_TYPEEXACT:
1955 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
1956 code += 2;
1957 break;
1958
1959 case OP_MARK:
1960 case OP_PRUNE_ARG:
1961 case OP_SKIP_ARG:
1962 case OP_THEN_ARG:
1963 code += code[1];
1964 break;
1965 }
1966
1967 /* Add in the fixed length from the table */
1968
1969 code += PRIV(OP_lengths)[c];
1970
1971 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1972 by a multi-byte character. The length in the table is a minimum, so we have
1973 to arrange to skip the extra bytes. */
1974
1975 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
1976 if (utf) switch(c)
1977 {
1978 case OP_CHAR:
1979 case OP_CHARI:
1980 case OP_NOT:
1981 case OP_NOTI:
1982 case OP_EXACT:
1983 case OP_EXACTI:
1984 case OP_NOTEXACT:
1985 case OP_NOTEXACTI:
1986 case OP_UPTO:
1987 case OP_UPTOI:
1988 case OP_NOTUPTO:
1989 case OP_NOTUPTOI:
1990 case OP_MINUPTO:
1991 case OP_MINUPTOI:
1992 case OP_NOTMINUPTO:
1993 case OP_NOTMINUPTOI:
1994 case OP_POSUPTO:
1995 case OP_POSUPTOI:
1996 case OP_NOTPOSUPTO:
1997 case OP_NOTPOSUPTOI:
1998 case OP_STAR:
1999 case OP_STARI:
2000 case OP_NOTSTAR:
2001 case OP_NOTSTARI:
2002 case OP_MINSTAR:
2003 case OP_MINSTARI:
2004 case OP_NOTMINSTAR:
2005 case OP_NOTMINSTARI:
2006 case OP_POSSTAR:
2007 case OP_POSSTARI:
2008 case OP_NOTPOSSTAR:
2009 case OP_NOTPOSSTARI:
2010 case OP_PLUS:
2011 case OP_PLUSI:
2012 case OP_NOTPLUS:
2013 case OP_NOTPLUSI:
2014 case OP_MINPLUS:
2015 case OP_MINPLUSI:
2016 case OP_NOTMINPLUS:
2017 case OP_NOTMINPLUSI:
2018 case OP_POSPLUS:
2019 case OP_POSPLUSI:
2020 case OP_NOTPOSPLUS:
2021 case OP_NOTPOSPLUSI:
2022 case OP_QUERY:
2023 case OP_QUERYI:
2024 case OP_NOTQUERY:
2025 case OP_NOTQUERYI:
2026 case OP_MINQUERY:
2027 case OP_MINQUERYI:
2028 case OP_NOTMINQUERY:
2029 case OP_NOTMINQUERYI:
2030 case OP_POSQUERY:
2031 case OP_POSQUERYI:
2032 case OP_NOTPOSQUERY:
2033 case OP_NOTPOSQUERYI:
2034 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2035 break;
2036 }
2037 #else
2038 (void)(utf); /* Keep compiler happy by referencing function argument */
2039 #endif
2040 }
2041 }
2042 }
2043
2044
2045
2046 /*************************************************
2047 * Scan compiled branch for non-emptiness *
2048 *************************************************/
2049
2050 /* This function scans through a branch of a compiled pattern to see whether it
2051 can match the empty string or not. It is called from could_be_empty()
2052 below and from compile_branch() when checking for an unlimited repeat of a
2053 group that can match nothing. Note that first_significant_code() skips over
2054 backward and negative forward assertions when its final argument is TRUE. If we
2055 hit an unclosed bracket, we return "empty" - this means we've struck an inner
2056 bracket whose current branch will already have been scanned.
2057
2058 Arguments:
2059 code points to start of search
2060 endcode points to where to stop
2061 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2062 cd contains pointers to tables etc.
2063 recurses chain of recurse_check to catch mutual recursion
2064
2065 Returns: TRUE if what is matched could be empty
2066 */
2067
2068 typedef struct recurse_check {
2069 struct recurse_check *prev;
2070 const pcre_uchar *group;
2071 } recurse_check;
2072
2073 static BOOL
2074 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2075 BOOL utf, compile_data *cd, recurse_check *recurses)
2076 {
2077 register pcre_uchar c;
2078 recurse_check this_recurse;
2079
2080 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2081 code < endcode;
2082 code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2083 {
2084 const pcre_uchar *ccode;
2085
2086 c = *code;
2087
2088 /* Skip over forward assertions; the other assertions are skipped by
2089 first_significant_code() with a TRUE final argument. */
2090
2091 if (c == OP_ASSERT)
2092 {
2093 do code += GET(code, 1); while (*code == OP_ALT);
2094 c = *code;
2095 continue;
2096 }
2097
2098 /* For a recursion/subroutine call, if its end has been reached, which
2099 implies a backward reference subroutine call, we can scan it. If it's a
2100 forward reference subroutine call, we can't. To detect forward reference
2101 we have to scan up the list that is kept in the workspace. This function is
2102 called only when doing the real compile, not during the pre-compile that
2103 measures the size of the compiled pattern. */
2104
2105 if (c == OP_RECURSE)
2106 {
2107 const pcre_uchar *scode = cd->start_code + GET(code, 1);
2108 BOOL empty_branch;
2109
2110 /* Test for forward reference or uncompleted reference. This is disabled
2111 when called to scan a completed pattern by setting cd->start_workspace to
2112 NULL. */
2113
2114 if (cd->start_workspace != NULL)
2115 {
2116 const pcre_uchar *tcode;
2117 for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2118 if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2119 if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
2120 }
2121
2122 /* If we are scanning a completed pattern, there are no forward references
2123 and all groups are complete. We need to detect whether this is a recursive
2124 call, as otherwise there will be an infinite loop. If it is a recursion,
2125 just skip over it. Simple recursions are easily detected. For mutual
2126 recursions we keep a chain on the stack. */
2127
2128 else
2129 {
2130 recurse_check *r = recurses;
2131 const pcre_uchar *endgroup = scode;
2132
2133 do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2134 if (code >= scode && code <= endgroup) continue; /* Simple recursion */
2135
2136 for (r = recurses; r != NULL; r = r->prev)
2137 if (r->group == scode) break;
2138 if (r != NULL) continue; /* Mutual recursion */
2139 }
2140
2141 /* Completed reference; scan the referenced group, remembering it on the
2142 stack chain to detect mutual recursions. */
2143
2144 empty_branch = FALSE;
2145 this_recurse.prev = recurses;
2146 this_recurse.group = scode;
2147
2148 do
2149 {
2150 if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
2151 {
2152 empty_branch = TRUE;
2153 break;
2154 }
2155 scode += GET(scode, 1);
2156 }
2157 while (*scode == OP_ALT);
2158
2159 if (!empty_branch) return FALSE; /* All branches are non-empty */
2160 continue;
2161 }
2162
2163 /* Groups with zero repeats can of course be empty; skip them. */
2164
2165 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2166 c == OP_BRAPOSZERO)
2167 {
2168 code += PRIV(OP_lengths)[c];
2169 do code += GET(code, 1); while (*code == OP_ALT);
2170 c = *code;
2171 continue;
2172 }
2173
2174 /* A nested group that is already marked as "could be empty" can just be
2175 skipped. */
2176
2177 if (c == OP_SBRA || c == OP_SBRAPOS ||
2178 c == OP_SCBRA || c == OP_SCBRAPOS)
2179 {
2180 do code += GET(code, 1); while (*code == OP_ALT);
2181 c = *code;
2182 continue;
2183 }
2184
2185 /* For other groups, scan the branches. */
2186
2187 if (c == OP_BRA || c == OP_BRAPOS ||
2188 c == OP_CBRA || c == OP_CBRAPOS ||
2189 c == OP_ONCE || c == OP_ONCE_NC ||
2190 c == OP_COND)
2191 {
2192 BOOL empty_branch;
2193 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
2194
2195 /* If a conditional group has only one branch, there is a second, implied,
2196 empty branch, so just skip over the conditional, because it could be empty.
2197 Otherwise, scan the individual branches of the group. */
2198
2199 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2200 code += GET(code, 1);
2201 else
2202 {
2203 empty_branch = FALSE;
2204 do
2205 {
2206 if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd, NULL))
2207 empty_branch = TRUE;
2208 code += GET(code, 1);
2209 }
2210 while (*code == OP_ALT);
2211 if (!empty_branch) return FALSE; /* All branches are non-empty */
2212 }
2213
2214 c = *code;
2215 continue;
2216 }
2217
2218 /* Handle the other opcodes */
2219
2220 switch (c)
2221 {
2222 /* Check for quantifiers after a class. XCLASS is used for classes that
2223 cannot be represented just by a bit map. This includes negated single
2224 high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2225 actual length is stored in the compiled code, so we must update "code"
2226 here. */
2227
2228 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2229 case OP_XCLASS:
2230 ccode = code += GET(code, 1);
2231 goto CHECK_CLASS_REPEAT;
2232 #endif
2233
2234 case OP_CLASS:
2235 case OP_NCLASS:
2236 ccode = code + PRIV(OP_lengths)[OP_CLASS];
2237
2238 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2239 CHECK_CLASS_REPEAT:
2240 #endif
2241
2242 switch (*ccode)
2243 {
2244 case OP_CRSTAR: /* These could be empty; continue */
2245 case OP_CRMINSTAR:
2246 case OP_CRQUERY:
2247 case OP_CRMINQUERY:
2248 break;
2249
2250 default: /* Non-repeat => class must match */
2251 case OP_CRPLUS: /* These repeats aren't empty */
2252 case OP_CRMINPLUS:
2253 return FALSE;
2254
2255 case OP_CRRANGE:
2256 case OP_CRMINRANGE:
2257 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2258 break;
2259 }
2260 break;
2261
2262 /* Opcodes that must match a character */
2263
2264 case OP_ANY:
2265 case OP_ALLANY:
2266 case OP_ANYBYTE:
2267
2268 case OP_PROP:
2269 case OP_NOTPROP:
2270 case OP_ANYNL:
2271
2272 case OP_NOT_HSPACE:
2273 case OP_HSPACE:
2274 case OP_NOT_VSPACE:
2275 case OP_VSPACE:
2276 case OP_EXTUNI:
2277
2278 case OP_NOT_DIGIT:
2279 case OP_DIGIT:
2280 case OP_NOT_WHITESPACE:
2281 case OP_WHITESPACE:
2282 case OP_NOT_WORDCHAR:
2283 case OP_WORDCHAR:
2284
2285 case OP_CHAR:
2286 case OP_CHARI:
2287 case OP_NOT:
2288 case OP_NOTI:
2289
2290 case OP_PLUS:
2291 case OP_PLUSI:
2292 case OP_MINPLUS:
2293 case OP_MINPLUSI:
2294
2295 case OP_NOTPLUS:
2296 case OP_NOTPLUSI:
2297 case OP_NOTMINPLUS:
2298 case OP_NOTMINPLUSI:
2299
2300 case OP_POSPLUS:
2301 case OP_POSPLUSI:
2302 case OP_NOTPOSPLUS:
2303 case OP_NOTPOSPLUSI:
2304
2305 case OP_EXACT:
2306 case OP_EXACTI:
2307 case OP_NOTEXACT:
2308 case OP_NOTEXACTI:
2309
2310 case OP_TYPEPLUS:
2311 case OP_TYPEMINPLUS:
2312 case OP_TYPEPOSPLUS:
2313 case OP_TYPEEXACT:
2314
2315 return FALSE;
2316
2317 /* These are going to continue, as they may be empty, but we have to
2318 fudge the length for the \p and \P cases. */
2319
2320 case OP_TYPESTAR:
2321 case OP_TYPEMINSTAR:
2322 case OP_TYPEPOSSTAR:
2323 case OP_TYPEQUERY:
2324 case OP_TYPEMINQUERY:
2325 case OP_TYPEPOSQUERY:
2326 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2327 break;
2328
2329 /* Same for these */
2330
2331 case OP_TYPEUPTO:
2332 case OP_TYPEMINUPTO:
2333 case OP_TYPEPOSUPTO:
2334 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2335 code += 2;
2336 break;
2337
2338 /* End of branch */
2339
2340 case OP_KET:
2341 case OP_KETRMAX:
2342 case OP_KETRMIN:
2343 case OP_KETRPOS:
2344 case OP_ALT:
2345 return TRUE;
2346
2347 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2348 MINUPTO, and POSUPTO and their caseless and negative versions may be
2349 followed by a multibyte character. */
2350
2351 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2352 case OP_STAR:
2353 case OP_STARI:
2354 case OP_NOTSTAR:
2355 case OP_NOTSTARI:
2356
2357 case OP_MINSTAR:
2358 case OP_MINSTARI:
2359 case OP_NOTMINSTAR:
2360 case OP_NOTMINSTARI:
2361
2362 case OP_POSSTAR:
2363 case OP_POSSTARI:
2364 case OP_NOTPOSSTAR:
2365 case OP_NOTPOSSTARI:
2366
2367 case OP_QUERY:
2368 case OP_QUERYI:
2369 case OP_NOTQUERY:
2370 case OP_NOTQUERYI:
2371
2372 case OP_MINQUERY:
2373 case OP_MINQUERYI:
2374 case OP_NOTMINQUERY:
2375 case OP_NOTMINQUERYI:
2376
2377 case OP_POSQUERY:
2378 case OP_POSQUERYI:
2379 case OP_NOTPOSQUERY:
2380 case OP_NOTPOSQUERYI:
2381
2382 if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2383 break;
2384
2385 case OP_UPTO:
2386 case OP_UPTOI:
2387 case OP_NOTUPTO:
2388 case OP_NOTUPTOI:
2389
2390 case OP_MINUPTO:
2391 case OP_MINUPTOI:
2392 case OP_NOTMINUPTO:
2393 case OP_NOTMINUPTOI:
2394
2395 case OP_POSUPTO:
2396 case OP_POSUPTOI:
2397 case OP_NOTPOSUPTO:
2398 case OP_NOTPOSUPTOI:
2399
2400 if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2401 break;
2402 #endif
2403
2404 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2405 string. */
2406
2407 case OP_MARK:
2408 case OP_PRUNE_ARG:
2409 case OP_SKIP_ARG:
2410 case OP_THEN_ARG:
2411 code += code[1];
2412 break;
2413
2414 /* None of the remaining opcodes are required to match a character. */
2415
2416 default:
2417 break;
2418 }
2419 }
2420
2421 return TRUE;
2422 }
2423
2424
2425
2426 /*************************************************
2427 * Scan compiled regex for non-emptiness *
2428 *************************************************/
2429
2430 /* This function is called to check for left recursive calls. We want to check
2431 the current branch of the current pattern to see if it could match the empty
2432 string. If it could, we must look outwards for branches at other levels,
2433 stopping when we pass beyond the bracket which is the subject of the recursion.
2434 This function is called only during the real compile, not during the
2435 pre-compile.
2436
2437 Arguments:
2438 code points to start of the recursion
2439 endcode points to where to stop (current RECURSE item)
2440 bcptr points to the chain of current (unclosed) branch starts
2441 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2442 cd pointers to tables etc
2443
2444 Returns: TRUE if what is matched could be empty
2445 */
2446
2447 static BOOL
2448 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2449 branch_chain *bcptr, BOOL utf, compile_data *cd)
2450 {
2451 while (bcptr != NULL && bcptr->current_branch >= code)
2452 {
2453 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL))
2454 return FALSE;
2455 bcptr = bcptr->outer;
2456 }
2457 return TRUE;
2458 }
2459
2460
2461
2462 /*************************************************
2463 * Check for POSIX class syntax *
2464 *************************************************/
2465
2466 /* This function is called when the sequence "[:" or "[." or "[=" is
2467 encountered in a character class. It checks whether this is followed by a
2468 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2469 reach an unescaped ']' without the special preceding character, return FALSE.
2470
2471 Originally, this function only recognized a sequence of letters between the
2472 terminators, but it seems that Perl recognizes any sequence of characters,
2473 though of course unknown POSIX names are subsequently rejected. Perl gives an
2474 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2475 didn't consider this to be a POSIX class. Likewise for [:1234:].
2476
2477 The problem in trying to be exactly like Perl is in the handling of escapes. We
2478 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2479 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2480 below handles the special case of \], but does not try to do any other escape
2481 processing. This makes it different from Perl for cases such as [:l\ower:]
2482 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2483 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2484 I think.
2485
2486 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2487 It seems that the appearance of a nested POSIX class supersedes an apparent
2488 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2489 a digit.
2490
2491 In Perl, unescaped square brackets may also appear as part of class names. For
2492 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2493 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2494 seem right at all. PCRE does not allow closing square brackets in POSIX class
2495 names.
2496
2497 Arguments:
2498 ptr pointer to the initial [
2499 endptr where to return the end pointer
2500
2501 Returns: TRUE or FALSE
2502 */
2503
2504 static BOOL
2505 check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
2506 {
2507 pcre_uchar terminator; /* Don't combine these lines; the Solaris cc */
2508 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2509 for (++ptr; *ptr != CHAR_NULL; ptr++)
2510 {
2511 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2512 ptr++;
2513 else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2514 else
2515 {
2516 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2517 {
2518 *endptr = ptr;
2519 return TRUE;
2520 }
2521 if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
2522 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2523 ptr[1] == CHAR_EQUALS_SIGN) &&
2524 check_posix_syntax(ptr, endptr))
2525 return FALSE;
2526 }
2527 }
2528 return FALSE;
2529 }
2530
2531
2532
2533
2534 /*************************************************
2535 * Check POSIX class name *
2536 *************************************************/
2537
2538 /* This function is called to check the name given in a POSIX-style class entry
2539 such as [:alnum:].
2540
2541 Arguments:
2542 ptr points to the first letter
2543 len the length of the name
2544
2545 Returns: a value representing the name, or -1 if unknown
2546 */
2547
2548 static int
2549 check_posix_name(const pcre_uchar *ptr, int len)
2550 {
2551 const char *pn = posix_names;
2552 register int yield = 0;
2553 while (posix_name_lengths[yield] != 0)
2554 {
2555 if (len == posix_name_lengths[yield] &&
2556 STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
2557 pn += posix_name_lengths[yield] + 1;
2558 yield++;
2559 }
2560 return -1;
2561 }
2562
2563
2564 /*************************************************
2565 * Adjust OP_RECURSE items in repeated group *
2566 *************************************************/
2567
2568 /* OP_RECURSE items contain an offset from the start of the regex to the group
2569 that is referenced. This means that groups can be replicated for fixed
2570 repetition simply by copying (because the recursion is allowed to refer to
2571 earlier groups that are outside the current group). However, when a group is
2572 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2573 inserted before it, after it has been compiled. This means that any OP_RECURSE
2574 items within it that refer to the group itself or any contained groups have to
2575 have their offsets adjusted. That is one of the jobs of this function. Before it
2576 is called, the partially compiled regex must be temporarily terminated with
2577 OP_END.
2578
2579 This function has been extended with the possibility of forward references for
2580 recursions and subroutine calls. It must also check the list of such references
2581 for the group we are dealing with. If it finds that one of the recursions in
2582 the current group is on this list, it adjusts the offset in the list, not the
2583 value in the reference (which is a group number).
2584
2585 Arguments:
2586 group points to the start of the group
2587 adjust the amount by which the group is to be moved
2588 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2589 cd contains pointers to tables etc.
2590 save_hwm the hwm forward reference pointer at the start of the group
2591
2592 Returns: nothing
2593 */
2594
2595 static void
2596 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
2597 pcre_uchar *save_hwm)
2598 {
2599 pcre_uchar *ptr = group;
2600
2601 while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
2602 {
2603 int offset;
2604 pcre_uchar *hc;
2605
2606 /* See if this recursion is on the forward reference list. If so, adjust the
2607 reference. */
2608
2609 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2610 {
2611 offset = (int)GET(hc, 0);
2612 if (cd->start_code + offset == ptr + 1)
2613 {
2614 PUT(hc, 0, offset + adjust);
2615 break;
2616 }
2617 }
2618
2619 /* Otherwise, adjust the recursion offset if it's after the start of this
2620 group. */
2621
2622 if (hc >= cd->hwm)
2623 {
2624 offset = (int)GET(ptr, 1);
2625 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2626 }
2627
2628 ptr += 1 + LINK_SIZE;
2629 }
2630 }
2631
2632
2633
2634 /*************************************************
2635 * Insert an automatic callout point *
2636 *************************************************/
2637
2638 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2639 callout points before each pattern item.
2640
2641 Arguments:
2642 code current code pointer
2643 ptr current pattern pointer
2644 cd pointers to tables etc
2645
2646 Returns: new code pointer
2647 */
2648
2649 static pcre_uchar *
2650 auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
2651 {
2652 *code++ = OP_CALLOUT;
2653 *code++ = 255;
2654 PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
2655 PUT(code, LINK_SIZE, 0); /* Default length */
2656 return code + 2 * LINK_SIZE;
2657 }
2658
2659
2660
2661 /*************************************************
2662 * Complete a callout item *
2663 *************************************************/
2664
2665 /* A callout item contains the length of the next item in the pattern, which
2666 we can't fill in till after we have reached the relevant point. This is used
2667 for both automatic and manual callouts.
2668
2669 Arguments:
2670 previous_callout points to previous callout item
2671 ptr current pattern pointer
2672 cd pointers to tables etc
2673
2674 Returns: nothing
2675 */
2676
2677 static void
2678 complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
2679 {
2680 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2681 PUT(previous_callout, 2 + LINK_SIZE, length);
2682 }
2683
2684
2685
2686 #ifdef SUPPORT_UCP
2687 /*************************************************
2688 * Get othercase range *
2689 *************************************************/
2690
2691 /* This function is passed the start and end of a class range, in UTF-8 mode
2692 with UCP support. It searches up the characters, looking for ranges of
2693 characters in the "other" case. Each call returns the next one, updating the
2694 start address. A character with multiple other cases is returned on its own
2695 with a special return value.
2696
2697 Arguments:
2698 cptr points to starting character value; updated
2699 d end value
2700 ocptr where to put start of othercase range
2701 odptr where to put end of othercase range
2702
2703 Yield: -1 when no more
2704 0 when a range is returned
2705 >0 the CASESET offset for char with multiple other cases
2706 in this case, ocptr contains the original
2707 */
2708
2709 static int
2710 get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
2711 pcre_uint32 *odptr)
2712 {
2713 pcre_uint32 c, othercase, next;
2714 unsigned int co;
2715
2716 /* Find the first character that has an other case. If it has multiple other
2717 cases, return its case offset value. */
2718
2719 for (c = *cptr; c <= d; c++)
2720 {
2721 if ((co = UCD_CASESET(c)) != 0)
2722 {
2723 *ocptr = c++; /* Character that has the set */
2724 *cptr = c; /* Rest of input range */
2725 return (int)co;
2726 }
2727 if ((othercase = UCD_OTHERCASE(c)) != c) break;
2728 }
2729
2730 if (c > d) return -1; /* Reached end of range */
2731
2732 *ocptr = othercase;
2733 next = othercase + 1;
2734
2735 for (++c; c <= d; c++)
2736 {
2737 if (UCD_OTHERCASE(c) != next) break;
2738 next++;
2739 }
2740
2741 *odptr = next - 1; /* End of othercase range */
2742 *cptr = c; /* Rest of input range */
2743 return 0;
2744 }
2745
2746
2747
2748 /*************************************************
2749 * Check a character and a property *
2750 *************************************************/
2751
2752 /* This function is called by check_auto_possessive() when a property item
2753 is adjacent to a fixed character.
2754
2755 Arguments:
2756 c the character
2757 ptype the property type
2758 pdata the data for the type
2759 negated TRUE if it's a negated property (\P or \p{^)
2760
2761 Returns: TRUE if auto-possessifying is OK
2762 */
2763
2764 static BOOL
2765 check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata, BOOL negated)
2766 {
2767 #ifdef SUPPORT_UCP
2768 const pcre_uint32 *p;
2769 #endif
2770
2771 const ucd_record *prop = GET_UCD(c);
2772
2773 switch(ptype)
2774 {
2775 case PT_LAMP:
2776 return (prop->chartype == ucp_Lu ||
2777 prop->chartype == ucp_Ll ||
2778 prop->chartype == ucp_Lt) == negated;
2779
2780 case PT_GC:
2781 return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2782
2783 case PT_PC:
2784 return (pdata == prop->chartype) == negated;
2785
2786 case PT_SC:
2787 return (pdata == prop->script) == negated;
2788
2789 /* These are specials */
2790
2791 case PT_ALNUM:
2792 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2793 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2794
2795 case PT_SPACE: /* Perl space */
2796 return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2797 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2798 == negated;
2799
2800 case PT_PXSPACE: /* POSIX space */
2801 return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2802 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2803 c == CHAR_FF || c == CHAR_CR)
2804 == negated;
2805
2806 case PT_WORD:
2807 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2808 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2809 c == CHAR_UNDERSCORE) == negated;
2810
2811 #ifdef SUPPORT_UCP
2812 case PT_CLIST:
2813 p = PRIV(ucd_caseless_sets) + prop->caseset;
2814 for (;;)
2815 {
2816 if (c < *p) return !negated;
2817 if (c == *p++) return negated;
2818 }
2819 break; /* Control never reaches here */
2820 #endif
2821 }
2822
2823 return FALSE;
2824 }
2825 #endif /* SUPPORT_UCP */
2826
2827
2828
2829 /*************************************************
2830 * Check if auto-possessifying is possible *
2831 *************************************************/
2832
2833 /* This function is called for unlimited repeats of certain items, to see
2834 whether the next thing could possibly match the repeated item. If not, it makes
2835 sense to automatically possessify the repeated item.
2836
2837 Arguments:
2838 previous pointer to the repeated opcode
2839 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2840 ptr next character in pattern
2841 options options bits
2842 cd contains pointers to tables etc.
2843
2844 Returns: TRUE if possessifying is wanted
2845 */
2846
2847 static BOOL
2848 check_auto_possessive(const pcre_uchar *previous, BOOL utf,
2849 const pcre_uchar *ptr, int options, compile_data *cd)
2850 {
2851 pcre_uint32 c = NOTACHAR;
2852 pcre_uint32 next;
2853 int escape;
2854 pcre_uchar op_code = *previous++;
2855
2856 /* Skip whitespace and comments in extended mode */
2857
2858 if ((options & PCRE_EXTENDED) != 0)
2859 {
2860 for (;;)
2861 {
2862 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2863 if (*ptr == CHAR_NUMBER_SIGN)
2864 {
2865 ptr++;
2866 while (*ptr != CHAR_NULL)
2867 {
2868 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2869 ptr++;
2870 #ifdef SUPPORT_UTF
2871 if (utf) FORWARDCHAR(ptr);
2872 #endif
2873 }
2874 }
2875 else break;
2876 }
2877 }
2878
2879 /* If the next item is one that we can handle, get its value. A non-negative
2880 value is a character, a negative value is an escape value. */
2881
2882 if (*ptr == CHAR_BACKSLASH)
2883 {
2884 int temperrorcode = 0;
2885 escape = check_escape(&ptr, &next, &temperrorcode, cd->bracount, options,
2886 FALSE);
2887 if (temperrorcode != 0) return FALSE;
2888 ptr++; /* Point after the escape sequence */
2889 }
2890 else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0)
2891 {
2892 escape = 0;
2893 #ifdef SUPPORT_UTF
2894 if (utf) { GETCHARINC(next, ptr); } else
2895 #endif
2896 next = *ptr++;
2897 }
2898 else return FALSE;
2899
2900 /* Skip whitespace and comments in extended mode */
2901
2902 if ((options & PCRE_EXTENDED) != 0)
2903 {
2904 for (;;)
2905 {
2906 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2907 if (*ptr == CHAR_NUMBER_SIGN)
2908 {
2909 ptr++;
2910 while (*ptr != CHAR_NULL)
2911 {
2912 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2913 ptr++;
2914 #ifdef SUPPORT_UTF
2915 if (utf) FORWARDCHAR(ptr);
2916 #endif
2917 }
2918 }
2919 else break;
2920 }
2921 }
2922
2923 /* If the next thing is itself optional, we have to give up. */
2924
2925 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2926 STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2927 return FALSE;
2928
2929 /* If the previous item is a character, get its value. */
2930
2931 if (op_code == OP_CHAR || op_code == OP_CHARI ||
2932 op_code == OP_NOT || op_code == OP_NOTI)
2933 {
2934 #ifdef SUPPORT_UTF
2935 GETCHARTEST(c, previous);
2936 #else
2937 c = *previous;
2938 #endif
2939 }
2940
2941 /* Now compare the next item with the previous opcode. First, handle cases when
2942 the next item is a character. */
2943
2944 if (escape == 0)
2945 {
2946 /* For a caseless UTF match, the next character may have more than one other
2947 case, which maps to the special PT_CLIST property. Check this first. */
2948
2949 #ifdef SUPPORT_UCP
2950 if (utf && c != NOTACHAR && (options & PCRE_CASELESS) != 0)
2951 {
2952 unsigned int ocs = UCD_CASESET(next);
2953 if (ocs > 0) return check_char_prop(c, PT_CLIST, ocs, op_code >= OP_NOT);
2954 }
2955 #endif
2956
2957 switch(op_code)
2958 {
2959 case OP_CHAR:
2960 return c != next;
2961
2962 /* For CHARI (caseless character) we must check the other case. If we have
2963 Unicode property support, we can use it to test the other case of
2964 high-valued characters. We know that next can have only one other case,
2965 because multi-other-case characters are dealt with above. */
2966
2967 case OP_CHARI:
2968 if (c == next) return FALSE;
2969 #ifdef SUPPORT_UTF
2970 if (utf)
2971 {
2972 pcre_uint32 othercase;
2973 if (next < 128) othercase = cd->fcc[next]; else
2974 #ifdef SUPPORT_UCP
2975 othercase = UCD_OTHERCASE(next);
2976 #else
2977 othercase = NOTACHAR;
2978 #endif
2979 return c != othercase;
2980 }
2981 else
2982 #endif /* SUPPORT_UTF */
2983 return (c != TABLE_GET(next, cd->fcc, next)); /* Not UTF */
2984
2985 case OP_NOT:
2986 return c == next;
2987
2988 case OP_NOTI:
2989 if (c == next) return TRUE;
2990 #ifdef SUPPORT_UTF
2991 if (utf)
2992 {
2993 pcre_uint32 othercase;
2994 if (next < 128) othercase = cd->fcc[next]; else
2995 #ifdef SUPPORT_UCP
2996 othercase = UCD_OTHERCASE(next);
2997 #else
2998 othercase = NOTACHAR;
2999 #endif
3000 return c == othercase;
3001 }
3002 else
3003 #endif /* SUPPORT_UTF */
3004 return (c == TABLE_GET(next, cd->fcc, next)); /* Not UTF */
3005
3006 /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
3007 When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3008
3009 case OP_DIGIT:
3010 return next > 255 || (cd->ctypes[next] & ctype_digit) == 0;
3011
3012 case OP_NOT_DIGIT:
3013 return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0;
3014
3015 case OP_WHITESPACE:
3016 return next > 255 || (cd->ctypes[next] & ctype_space) == 0;
3017
3018 case OP_NOT_WHITESPACE:
3019 return next <= 255 && (cd->ctypes[next] & ctype_space) != 0;
3020
3021 case OP_WORDCHAR:
3022 return next > 255 || (cd->ctypes[next] & ctype_word) == 0;
3023
3024 case OP_NOT_WORDCHAR:
3025 return next <= 255 && (cd->ctypes[next] & ctype_word) != 0;
3026
3027 case OP_HSPACE:
3028 case OP_NOT_HSPACE:
3029 switch(next)
3030 {
3031 HSPACE_CASES:
3032 return op_code == OP_NOT_HSPACE;
3033
3034 default:
3035 return op_code != OP_NOT_HSPACE;
3036 }
3037
3038 case OP_ANYNL:
3039 case OP_VSPACE:
3040 case OP_NOT_VSPACE:
3041 switch(next)
3042 {
3043 VSPACE_CASES:
3044 return op_code == OP_NOT_VSPACE;
3045
3046 default:
3047 return op_code != OP_NOT_VSPACE;
3048 }
3049
3050 #ifdef SUPPORT_UCP
3051 case OP_PROP:
3052 return check_char_prop(next, previous[0], previous[1], FALSE);
3053
3054 case OP_NOTPROP:
3055 return check_char_prop(next, previous[0], previous[1], TRUE);
3056 #endif
3057
3058 default:
3059 return FALSE;
3060 }
3061 }
3062
3063 /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
3064 is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
3065 generated only when PCRE_UCP is *not* set, that is, when only ASCII
3066 characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
3067 replaced by OP_PROP codes when PCRE_UCP is set. */
3068
3069 switch(op_code)
3070 {
3071 case OP_CHAR:
3072 case OP_CHARI:
3073 switch(escape)
3074 {
3075 case ESC_d:
3076 return c > 255 || (cd->ctypes[c] & ctype_digit) == 0;
3077
3078 case ESC_D:
3079 return c <= 255 && (cd->ctypes[c] & ctype_digit) != 0;
3080
3081 case ESC_s:
3082 return c > 255 || (cd->ctypes[c] & ctype_space) == 0;
3083
3084 case ESC_S:
3085 return c <= 255 && (cd->ctypes[c] & ctype_space) != 0;
3086
3087 case ESC_w:
3088 return c > 255 || (cd->ctypes[c] & ctype_word) == 0;
3089
3090 case ESC_W:
3091 return c <= 255 && (cd->ctypes[c] & ctype_word) != 0;
3092
3093 case ESC_h:
3094 case ESC_H:
3095 switch(c)
3096 {
3097 HSPACE_CASES:
3098 return escape != ESC_h;
3099
3100 default:
3101 return escape == ESC_h;
3102 }
3103
3104 case ESC_v:
3105 case ESC_V:
3106 switch(c)
3107 {
3108 VSPACE_CASES:
3109 return escape != ESC_v;
3110
3111 default:
3112 return escape == ESC_v;
3113 }
3114
3115 /* When PCRE_UCP is set, these values get generated for \d etc. Find
3116 their substitutions and process them. The result will always be either
3117 ESC_p or ESC_P. Then fall through to process those values. */
3118
3119 #ifdef SUPPORT_UCP
3120 case ESC_du:
3121 case ESC_DU:
3122 case ESC_wu:
3123 case ESC_WU:
3124 case ESC_su:
3125 case ESC_SU:
3126 {
3127 int temperrorcode = 0;
3128 ptr = substitutes[escape - ESC_DU];
3129 escape = check_escape(&ptr, &next, &temperrorcode, 0, options, FALSE);
3130 if (temperrorcode != 0) return FALSE;
3131 ptr++; /* For compatibility */
3132 }
3133 /* Fall through */
3134
3135 case ESC_p:
3136 case ESC_P:
3137 {
3138 unsigned int ptype = 0, pdata = 0;
3139 int errorcodeptr;
3140 BOOL negated;
3141
3142 ptr--; /* Make ptr point at the p or P */
3143 if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcodeptr))
3144 return FALSE;
3145 ptr++; /* Point past the final curly ket */
3146
3147 /* If the property item is optional, we have to give up. (When generated
3148 from \d etc by PCRE_UCP, this test will have been applied much earlier,
3149 to the original \d etc. At this point, ptr will point to a zero byte. */
3150
3151 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3152 STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3153 return FALSE;
3154
3155 /* Do the property check. */
3156
3157 return check_char_prop(c, ptype, pdata, (escape == ESC_P) != negated);
3158 }
3159 #endif
3160
3161 default:
3162 return FALSE;
3163 }
3164
3165 /* In principle, support for Unicode properties should be integrated here as
3166 well. It means re-organizing the above code so as to get hold of the property
3167 values before switching on the op-code. However, I wonder how many patterns
3168 combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
3169 these op-codes are never generated.) */
3170
3171 case OP_DIGIT:
3172 return escape == ESC_D || escape == ESC_s || escape == ESC_W ||
3173 escape == ESC_h || escape == ESC_v || escape == ESC_R;
3174
3175 case OP_NOT_DIGIT:
3176 return escape == ESC_d;
3177
3178 case OP_WHITESPACE:
3179 return escape == ESC_S || escape == ESC_d || escape == ESC_w;
3180
3181 case OP_NOT_WHITESPACE:
3182 return escape == ESC_s || escape == ESC_h || escape == ESC_v || escape == ESC_R;
3183
3184 case OP_HSPACE:
3185 return escape == ESC_S || escape == ESC_H || escape == ESC_d ||
3186 escape == ESC_w || escape == ESC_v || escape == ESC_R;
3187
3188 case OP_NOT_HSPACE:
3189 return escape == ESC_h;
3190
3191 /* Can't have \S in here because VT matches \S (Perl anomaly) */
3192 case OP_ANYNL:
3193 case OP_VSPACE:
3194 return escape == ESC_V || escape == ESC_d || escape == ESC_w;
3195
3196 case OP_NOT_VSPACE:
3197 return escape == ESC_v || escape == ESC_R;
3198
3199 case OP_WORDCHAR:
3200 return escape == ESC_W || escape == ESC_s || escape == ESC_h ||
3201 escape == ESC_v || escape == ESC_R;
3202
3203 case OP_NOT_WORDCHAR:
3204 return escape == ESC_w || escape == ESC_d;
3205
3206 default:
3207 return FALSE;
3208 }
3209
3210 /* Control does not reach here */
3211 }
3212
3213
3214
3215 /*************************************************
3216 * Add a character or range to a class *
3217 *************************************************/
3218
3219 /* This function packages up the logic of adding a character or range of
3220 characters to a class. The character values in the arguments will be within the
3221 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
3222 mutually recursive with the function immediately below.
3223
3224 Arguments:
3225 classbits the bit map for characters < 256
3226 uchardptr points to the pointer for extra data
3227 options the options word
3228 cd contains pointers to tables etc.
3229 start start of range character
3230 end end of range character
3231
3232 Returns: the number of < 256 characters added
3233 the pointer to extra data is updated
3234 */
3235
3236 static int
3237 add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
3238 compile_data *cd, pcre_uint32 start, pcre_uint32 end)
3239 {
3240 pcre_uint32 c;
3241 int n8 = 0;
3242
3243 /* If caseless matching is required, scan the range and process alternate
3244 cases. In Unicode, there are 8-bit characters that have alternate cases that
3245 are greater than 255 and vice-versa. Sometimes we can just extend the original
3246 range. */
3247
3248 if ((options & PCRE_CASELESS) != 0)
3249 {
3250 #ifdef SUPPORT_UCP
3251 if ((options & PCRE_UTF8) != 0)
3252 {
3253 int rc;
3254 pcre_uint32 oc, od;
3255
3256 options &= ~PCRE_CASELESS; /* Remove for recursive calls */
3257 c = start;
3258
3259 while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
3260 {
3261 /* Handle a single character that has more than one other case. */
3262
3263 if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
3264 PRIV(ucd_caseless_sets) + rc, oc);
3265
3266 /* Do nothing if the other case range is within the original range. */
3267
3268 else if (oc >= start && od <= end) continue;
3269
3270 /* Extend the original range if there is overlap, noting that if oc < c, we
3271 can't have od > end because a subrange is always shorter than the basic
3272 range. Otherwise, use a recursive call to add the additional range. */
3273
3274 else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
3275 else if (od > end && oc <= end + 1) end = od; /* Extend upwards */
3276 else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
3277 }
3278 }
3279 else
3280 #endif /* SUPPORT_UCP */
3281
3282 /* Not UTF-mode, or no UCP */
3283
3284 for (c = start; c <= end && c < 256; c++)
3285 {
3286 SETBIT(classbits, cd->fcc[c]);
3287 n8++;
3288 }
3289 }
3290
3291 /* Now handle the original range. Adjust the final value according to the bit
3292 length - this means that the same lists of (e.g.) horizontal spaces can be used
3293 in all cases. */
3294
3295 #if defined COMPILE_PCRE8
3296 #ifdef SUPPORT_UTF
3297 if ((options & PCRE_UTF8) == 0)
3298 #endif
3299 if (end > 0xff) end = 0xff;
3300
3301 #elif defined COMPILE_PCRE16
3302 #ifdef SUPPORT_UTF
3303 if ((options & PCRE_UTF16) == 0)
3304 #endif
3305 if (end > 0xffff) end = 0xffff;
3306
3307 #endif /* COMPILE_PCRE[8|16] */
3308
3309 /* If all characters are less than 256, use the bit map. Otherwise use extra
3310 data. */
3311
3312 if (end < 0x100)
3313 {
3314 for (c = start; c <= end; c++)
3315 {
3316 n8++;
3317 SETBIT(classbits, c);
3318 }
3319 }
3320
3321 else
3322 {
3323 pcre_uchar *uchardata = *uchardptr;
3324
3325 #ifdef SUPPORT_UTF
3326 if ((options & PCRE_UTF8) != 0) /* All UTFs use the same flag bit */
3327 {
3328 if (start < end)
3329 {
3330 *uchardata++ = XCL_RANGE;
3331 uchardata += PRIV(ord2utf)(start, uchardata);
3332 uchardata += PRIV(ord2utf)(end, uchardata);
3333 }
3334 else if (start == end)
3335 {
3336 *uchardata++ = XCL_SINGLE;
3337 uchardata += PRIV(ord2utf)(start, uchardata);
3338 }
3339 }
3340 else
3341 #endif /* SUPPORT_UTF */
3342
3343 /* Without UTF support, character values are constrained by the bit length,
3344 and can only be > 256 for 16-bit and 32-bit libraries. */
3345
3346 #ifdef COMPILE_PCRE8
3347 {}
3348 #else
3349 if (start < end)
3350 {
3351 *uchardata++ = XCL_RANGE;
3352 *uchardata++ = start;
3353 *uchardata++ = end;
3354 }
3355 else if (start == end)
3356 {
3357 *uchardata++ = XCL_SINGLE;
3358 *uchardata++ = start;
3359 }
3360 #endif
3361
3362 *uchardptr = uchardata; /* Updata extra data pointer */
3363 }
3364
3365 return n8; /* Number of 8-bit characters */
3366 }
3367
3368
3369
3370
3371 /*************************************************
3372 * Add a list of characters to a class *
3373 *************************************************/
3374
3375 /* This function is used for adding a list of case-equivalent characters to a
3376 class, and also for adding a list of horizontal or vertical whitespace. If the
3377 list is in order (which it should be), ranges of characters are detected and
3378 handled appropriately. This function is mutually recursive with the function
3379 above.
3380
3381 Arguments:
3382 classbits the bit map for characters < 256
3383 uchardptr points to the pointer for extra data
3384 options the options word
3385 cd contains pointers to tables etc.
3386 p points to row of 32-bit values, terminated by NOTACHAR
3387 except character to omit; this is used when adding lists of
3388 case-equivalent characters to avoid including the one we
3389 already know about
3390
3391 Returns: the number of < 256 characters added
3392 the pointer to extra data is updated
3393 */
3394
3395 static int
3396 add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
3397 compile_data *cd, const pcre_uint32 *p, unsigned int except)
3398 {
3399 int n8 = 0;
3400 while (p[0] < NOTACHAR)
3401 {
3402 int n = 0;
3403 if (p[0] != except)
3404 {
3405 while(p[n+1] == p[0] + n + 1) n++;
3406 n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
3407 }
3408 p += n + 1;
3409 }
3410 return n8;
3411 }
3412
3413
3414
3415 /*************************************************
3416 * Add characters not in a list to a class *
3417 *************************************************/
3418
3419 /* This function is used for adding the complement of a list of horizontal or
3420 vertical whitespace to a class. The list must be in order.
3421
3422 Arguments:
3423 classbits the bit map for characters < 256
3424 uchardptr points to the pointer for extra data
3425 options the options word
3426 cd contains pointers to tables etc.
3427 p points to row of 32-bit values, terminated by NOTACHAR
3428
3429 Returns: the number of < 256 characters added
3430 the pointer to extra data is updated
3431 */
3432
3433 static int
3434 add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
3435 int options, compile_data *cd, const pcre_uint32 *p)
3436 {
3437 BOOL utf = (options & PCRE_UTF8) != 0;
3438 int n8 = 0;
3439 if (p[0] > 0)
3440 n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
3441 while (p[0] < NOTACHAR)
3442 {
3443 while (p[1] == p[0] + 1) p++;
3444 n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
3445 (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
3446 p++;
3447 }
3448 return n8;
3449 }
3450
3451
3452
3453 /*************************************************
3454 * Compile one branch *
3455 *************************************************/
3456
3457 /* Scan the pattern, compiling it into a vector. If the options are
3458 changed during the branch, the pointer is used to change the external options
3459 bits. This function is used during the pre-compile phase when we are trying
3460 to find out the amount of memory needed, as well as during the real compile
3461 phase. The value of lengthptr distinguishes the two phases.
3462
3463 Arguments:
3464 optionsptr pointer to the option bits
3465 codeptr points to the pointer to the current code point
3466 ptrptr points to the current pattern pointer
3467 errorcodeptr points to error code variable
3468 firstcharptr place to put the first required character
3469 firstcharflagsptr place to put the first character flags, or a negative number
3470 reqcharptr place to put the last required character
3471 reqcharflagsptr place to put the last required character flags, or a negative number
3472 bcptr points to current branch chain
3473 cond_depth conditional nesting depth
3474 cd contains pointers to tables etc.
3475 lengthptr NULL during the real compile phase
3476 points to length accumulator during pre-compile phase
3477
3478 Returns: TRUE on success
3479 FALSE, with *errorcodeptr set non-zero on error
3480 */
3481
3482 static BOOL
3483 compile_branch(int *optionsptr, pcre_uchar **codeptr,
3484 const pcre_uchar **ptrptr, int *errorcodeptr,
3485 pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
3486 pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
3487 branch_chain *bcptr, int cond_depth,
3488 compile_data *cd, int *lengthptr)
3489 {
3490 int repeat_type, op_type;
3491 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
3492 int bravalue = 0;
3493 int greedy_default, greedy_non_default;
3494 pcre_uint32 firstchar, reqchar;
3495 pcre_int32 firstcharflags, reqcharflags;
3496 pcre_uint32 zeroreqchar, zerofirstchar;
3497 pcre_int32 zeroreqcharflags, zerofirstcharflags;
3498 pcre_int32 req_caseopt, reqvary, tempreqvary;
3499 int options = *optionsptr; /* May change dynamically */
3500 int after_manual_callout = 0;
3501 int length_prevgroup = 0;
3502 register pcre_uint32 c;
3503 int escape;
3504 register pcre_uchar *code = *codeptr;
3505 pcre_uchar *last_code = code;
3506 pcre_uchar *orig_code = code;
3507 pcre_uchar *tempcode;
3508 BOOL inescq = FALSE;
3509 BOOL groupsetfirstchar = FALSE;
3510 const pcre_uchar *ptr = *ptrptr;
3511 const pcre_uchar *tempptr;
3512 const pcre_uchar *nestptr = NULL;
3513 pcre_uchar *previous = NULL;
3514 pcre_uchar *previous_callout = NULL;
3515 pcre_uchar *save_hwm = NULL;
3516 pcre_uint8 classbits[32];
3517
3518 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3519 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3520 dynamically as we process the pattern. */
3521
3522 #ifdef SUPPORT_UTF
3523 /* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
3524 BOOL utf = (options & PCRE_UTF8) != 0;
3525 #ifndef COMPILE_PCRE32
3526 pcre_uchar utf_chars[6];
3527 #endif
3528 #else
3529 BOOL utf = FALSE;
3530 #endif
3531
3532 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
3533 class_uchardata always so that it can be passed to add_to_class() always,
3534 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
3535 alternative calls for the different cases. */
3536
3537 pcre_uchar *class_uchardata;
3538 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3539 BOOL xclass;
3540 pcre_uchar *class_uchardata_base;
3541 #endif
3542
3543 #ifdef PCRE_DEBUG
3544 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
3545 #endif
3546
3547 /* Set up the default and non-default settings for greediness */
3548
3549 greedy_default = ((options & PCRE_UNGREEDY) != 0);
3550 greedy_non_default = greedy_default ^ 1;
3551
3552 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
3553 matching encountered yet". It gets changed to REQ_NONE if we hit something that
3554 matches a non-fixed char first char; reqchar just remains unset if we never
3555 find one.
3556
3557 When we hit a repeat whose minimum is zero, we may have to adjust these values
3558 to take the zero repeat into account. This is implemented by setting them to
3559 zerofirstbyte and zeroreqchar when such a repeat is encountered. The individual
3560 item types that can be repeated set these backoff variables appropriately. */
3561
3562 firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
3563 firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
3564
3565 /* The variable req_caseopt contains either the REQ_CASELESS value
3566 or zero, according to the current setting of the caseless flag. The
3567 REQ_CASELESS leaves the lower 28 bit empty. It is added into the
3568 firstchar or reqchar variables to record the case status of the
3569 value. This is used only for ASCII characters. */
3570
3571 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
3572
3573 /* Switch on next character until the end of the branch */
3574
3575 for (;; ptr++)
3576 {
3577 BOOL negate_class;
3578 BOOL should_flip_negation;
3579 BOOL possessive_quantifier;
3580 BOOL is_quantifier;
3581 BOOL is_recurse;
3582 BOOL reset_bracount;
3583 int class_has_8bitchar;
3584 int class_one_char;
3585 int newoptions;
3586 int recno;
3587 int refsign;
3588 int skipbytes;
3589 pcre_uint32 subreqchar, subfirstchar;
3590 pcre_int32 subreqcharflags, subfirstcharflags;
3591 int terminator;
3592 unsigned int mclength;
3593 unsigned int tempbracount;
3594 pcre_uint32 ec;
3595 pcre_uchar mcbuffer[8];
3596
3597 /* Get next character in the pattern */
3598
3599 c = *ptr;
3600
3601 /* If we are at the end of a nested substitution, revert to the outer level
3602 string. Nesting only happens one level deep. */
3603
3604 if (c == CHAR_NULL && nestptr != NULL)
3605 {
3606 ptr = nestptr;
3607 nestptr = NULL;
3608 c = *ptr;
3609 }
3610
3611 /* If we are in the pre-compile phase, accumulate the length used for the
3612 previous cycle of this loop. */
3613
3614 if (lengthptr != NULL)
3615 {
3616 #ifdef PCRE_DEBUG
3617 if (code > cd->hwm) cd->hwm = code; /* High water info */
3618 #endif
3619 if (code > cd->start_workspace + cd->workspace_size -
3620 WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
3621 {
3622 *errorcodeptr = ERR52;
3623 goto FAILED;
3624 }
3625
3626 /* There is at least one situation where code goes backwards: this is the
3627 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
3628 the class is simply eliminated. However, it is created first, so we have to
3629 allow memory for it. Therefore, don't ever reduce the length at this point.
3630 */
3631
3632 if (code < last_code) code = last_code;
3633
3634 /* Paranoid check for integer overflow */
3635
3636 if (OFLOW_MAX - *lengthptr < code - last_code)
3637 {
3638 *errorcodeptr = ERR20;
3639 goto FAILED;
3640 }
3641
3642 *lengthptr += (int)(code - last_code);
3643 DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
3644 (int)(code - last_code), c, c));
3645
3646 /* If "previous" is set and it is not at the start of the work space, move
3647 it back to there, in order to avoid filling up the work space. Otherwise,
3648 if "previous" is NULL, reset the current code pointer to the start. */
3649
3650 if (previous != NULL)
3651 {
3652 if (previous > orig_code)
3653 {
3654 memmove(orig_code, previous, IN_UCHARS(code - previous));
3655 code -= previous - orig_code;
3656 previous = orig_code;
3657 }
3658 }
3659 else code = orig_code;
3660
3661 /* Remember where this code item starts so we can pick up the length
3662 next time round. */
3663
3664 last_code = code;
3665 }
3666
3667 /* In the real compile phase, just check the workspace used by the forward
3668 reference list. */
3669
3670 else if (cd->hwm > cd->start_workspace + cd->workspace_size -
3671 WORK_SIZE_SAFETY_MARGIN)
3672 {
3673 *errorcodeptr = ERR52;
3674 goto FAILED;
3675 }
3676
3677 /* If in \Q...\E, check for the end; if not, we have a literal */
3678
3679 if (inescq && c != CHAR_NULL)
3680 {
3681 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3682 {
3683 inescq = FALSE;
3684 ptr++;
3685 continue;
3686 }
3687 else
3688 {
3689 if (previous_callout != NULL)
3690 {
3691 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3692 complete_callout(previous_callout, ptr, cd);
3693 previous_callout = NULL;
3694 }
3695 if ((options & PCRE_AUTO_CALLOUT) != 0)
3696 {
3697 previous_callout = code;
3698 code = auto_callout(code, ptr, cd);
3699 }
3700 goto NORMAL_CHAR;
3701 }
3702 }
3703
3704 /* Fill in length of a previous callout, except when the next thing is
3705 a quantifier. */
3706
3707 is_quantifier =
3708 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
3709 (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
3710
3711 if (!is_quantifier && previous_callout != NULL &&
3712 after_manual_callout-- <= 0)
3713 {
3714 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3715 complete_callout(previous_callout, ptr, cd);
3716 previous_callout = NULL;
3717 }
3718
3719 /* In extended mode, skip white space and comments. */
3720
3721 if ((options & PCRE_EXTENDED) != 0)
3722 {
3723 if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;
3724 if (c == CHAR_NUMBER_SIGN)
3725 {
3726 ptr++;
3727 while (*ptr != CHAR_NULL)
3728 {
3729 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3730 ptr++;
3731 #ifdef SUPPORT_UTF
3732 if (utf) FORWARDCHAR(ptr);
3733 #endif
3734 }
3735 if (*ptr != CHAR_NULL) continue;
3736
3737 /* Else fall through to handle end of string */
3738 c = 0;
3739 }
3740 }
3741
3742 /* No auto callout for quantifiers. */
3743
3744 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
3745 {
3746 previous_callout = code;
3747 code = auto_callout(code, ptr, cd);
3748 }
3749
3750 switch(c)
3751 {
3752 /* ===================================================================*/
3753 case 0: /* The branch terminates at string end */
3754 case CHAR_VERTICAL_LINE: /* or | or ) */
3755 case CHAR_RIGHT_PARENTHESIS:
3756 *firstcharptr = firstchar;
3757 *firstcharflagsptr = firstcharflags;
3758 *reqcharptr = reqchar;
3759 *reqcharflagsptr = reqcharflags;
3760 *codeptr = code;
3761 *ptrptr = ptr;
3762 if (lengthptr != NULL)
3763 {
3764 if (OFLOW_MAX - *lengthptr < code - last_code)
3765 {
3766 *errorcodeptr = ERR20;
3767 goto FAILED;
3768 }
3769 *lengthptr += (int)(code - last_code); /* To include callout length */
3770 DPRINTF((">> end branch\n"));
3771 }
3772 return TRUE;
3773
3774
3775 /* ===================================================================*/
3776 /* Handle single-character metacharacters. In multiline mode, ^ disables
3777 the setting of any following char as a first character. */
3778
3779 case CHAR_CIRCUMFLEX_ACCENT:
3780 previous = NULL;
3781 if ((options & PCRE_MULTILINE) != 0)
3782 {
3783 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
3784 *code++ = OP_CIRCM;
3785 }
3786 else *code++ = OP_CIRC;
3787 break;
3788
3789 case CHAR_DOLLAR_SIGN:
3790 previous = NULL;
3791 *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
3792 break;
3793
3794 /* There can never be a first char if '.' is first, whatever happens about
3795 repeats. The value of reqchar doesn't change either. */
3796
3797 case CHAR_DOT:
3798 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
3799 zerofirstchar = firstchar;
3800 zerofirstcharflags = firstcharflags;
3801 zeroreqchar = reqchar;
3802 zeroreqcharflags = reqcharflags;
3803 previous = code;
3804 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3805 break;
3806
3807
3808 /* ===================================================================*/
3809 /* Character classes. If the included characters are all < 256, we build a
3810 32-byte bitmap of the permitted characters, except in the special case
3811 where there is only one such character. For negated classes, we build the
3812 map as usual, then invert it at the end. However, we use a different opcode
3813 so that data characters > 255 can be handled correctly.
3814
3815 If the class contains characters outside the 0-255 range, a different
3816 opcode is compiled. It may optionally have a bit map for characters < 256,
3817 but those above are are explicitly listed afterwards. A flag byte tells
3818 whether the bitmap is present, and whether this is a negated class or not.
3819
3820 In JavaScript compatibility mode, an isolated ']' causes an error. In
3821 default (Perl) mode, it is treated as a data character. */
3822
3823 case CHAR_RIGHT_SQUARE_BRACKET:
3824 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3825 {
3826 *errorcodeptr = ERR64;
3827 goto FAILED;
3828 }
3829 goto NORMAL_CHAR;
3830
3831 case CHAR_LEFT_SQUARE_BRACKET:
3832 previous = code;
3833
3834 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3835 they are encountered at the top level, so we'll do that too. */
3836
3837 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3838 ptr[1] == CHAR_EQUALS_SIGN) &&
3839 check_posix_syntax(ptr, &tempptr))
3840 {
3841 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3842 goto FAILED;
3843 }
3844
3845 /* If the first character is '^', set the negation flag and skip it. Also,
3846 if the first few characters (either before or after ^) are \Q\E or \E we
3847 skip them too. This makes for compatibility with Perl. */
3848
3849 negate_class = FALSE;
3850 for (;;)
3851 {
3852 c = *(++ptr);
3853 if (c == CHAR_BACKSLASH)
3854 {
3855 if (ptr[1] == CHAR_E)
3856 ptr++;
3857 else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3858 ptr += 3;
3859 else
3860 break;
3861 }
3862 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3863 negate_class = TRUE;
3864 else break;
3865 }
3866
3867 /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3868 an initial ']' is taken as a data character -- the code below handles
3869 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3870 [^] must match any character, so generate OP_ALLANY. */
3871
3872 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3873 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3874 {
3875 *code++ = negate_class? OP_ALLANY : OP_FAIL;
3876 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
3877 zerofirstchar = firstchar;
3878 zerofirstcharflags = firstcharflags;
3879 break;
3880 }
3881
3882 /* If a class contains a negative special such as \S, we need to flip the
3883 negation flag at the end, so that support for characters > 255 works
3884 correctly (they are all included in the class). */
3885
3886 should_flip_negation = FALSE;
3887
3888 /* For optimization purposes, we track some properties of the class:
3889 class_has_8bitchar will be non-zero if the class contains at least one <
3890 256 character; class_one_char will be 1 if the class contains just one
3891 character. */
3892
3893 class_has_8bitchar = 0;
3894 class_one_char = 0;
3895
3896 /* Initialize the 32-char bit map to all zeros. We build the map in a
3897 temporary bit of memory, in case the class contains fewer than two
3898 8-bit characters because in that case the compiled code doesn't use the bit
3899 map. */
3900
3901 memset(classbits, 0, 32 * sizeof(pcre_uint8));
3902
3903 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3904 xclass = FALSE;
3905 class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
3906 class_uchardata_base = class_uchardata; /* Save the start */
3907 #endif
3908
3909 /* Process characters until ] is reached. By writing this as a "do" it
3910 means that an initial ] is taken as a data character. At the start of the
3911 loop, c contains the first byte of the character. */
3912
3913 if (c != CHAR_NULL) do
3914 {
3915 const pcre_uchar *oldptr;
3916
3917 #ifdef SUPPORT_UTF
3918 if (utf && HAS_EXTRALEN(c))
3919 { /* Braces are required because the */
3920 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3921 }
3922 #endif
3923
3924 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3925 /* In the pre-compile phase, accumulate the length of any extra
3926 data and reset the pointer. This is so that very large classes that
3927 contain a zillion > 255 characters no longer overwrite the work space
3928 (which is on the stack). We have to remember that there was XCLASS data,
3929 however. */
3930
3931 if (lengthptr != NULL && class_uchardata > class_uchardata_base)
3932 {
3933 xclass = TRUE;
3934 *lengthptr += class_uchardata - class_uchardata_base;
3935 class_uchardata = class_uchardata_base;
3936 }
3937 #endif
3938
3939 /* Inside \Q...\E everything is literal except \E */
3940
3941 if (inescq)
3942 {
3943 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3944 {
3945 inescq = FALSE; /* Reset literal state */
3946 ptr++; /* Skip the 'E' */
3947 continue; /* Carry on with next */
3948 }
3949 goto CHECK_RANGE; /* Could be range if \E follows */
3950 }
3951
3952 /* Handle POSIX class names. Perl allows a negation extension of the
3953 form [:^name:]. A square bracket that doesn't match the syntax is
3954 treated as a literal. We also recognize the POSIX constructions
3955 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3956 5.6 and 5.8 do. */
3957
3958 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3959 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3960 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3961 {
3962 BOOL local_negate = FALSE;
3963 int posix_class, taboffset, tabopt;
3964 register const pcre_uint8 *cbits = cd->cbits;
3965 pcre_uint8 pbits[32];
3966
3967 if (ptr[1] != CHAR_COLON)
3968 {
3969 *errorcodeptr = ERR31;
3970 goto FAILED;
3971 }
3972
3973 ptr += 2;
3974 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3975 {
3976 local_negate = TRUE;
3977 should_flip_negation = TRUE; /* Note negative special */
3978 ptr++;
3979 }
3980
3981 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3982 if (posix_class < 0)
3983 {
3984 *errorcodeptr = ERR30;
3985 goto FAILED;
3986 }
3987
3988 /* If matching is caseless, upper and lower are converted to
3989 alpha. This relies on the fact that the class table starts with
3990 alpha, lower, upper as the first 3 entries. */
3991
3992 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3993 posix_class = 0;
3994
3995 /* When PCRE_UCP is set, some of the POSIX classes are converted to
3996 different escape sequences that use Unicode properties. */
3997
3998 #ifdef SUPPORT_UCP
3999 if ((options & PCRE_UCP) != 0)
4000 {
4001 int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
4002 if (posix_substitutes[pc] != NULL)
4003 {
4004 nestptr = tempptr + 1;
4005 ptr = posix_substitutes[pc] - 1;
4006 continue;
4007 }
4008 }
4009 #endif
4010 /* In the non-UCP case, we build the bit map for the POSIX class in a
4011 chunk of local store because we may be adding and subtracting from it,
4012 and we don't want to subtract bits that may be in the main map already.
4013 At the end we or the result into the bit map that is being built. */
4014
4015 posix_class *= 3;
4016
4017 /* Copy in the first table (always present) */
4018
4019 memcpy(pbits, cbits + posix_class_maps[posix_class],
4020 32 * sizeof(pcre_uint8));
4021
4022 /* If there is a second table, add or remove it as required. */
4023
4024 taboffset = posix_class_maps[posix_class + 1];
4025 tabopt = posix_class_maps[posix_class + 2];
4026
4027 if (taboffset >= 0)
4028 {
4029 if (tabopt >= 0)
4030 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
4031 else
4032 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
4033 }
4034
4035 /* Now see if we need to remove any special characters. An option
4036 value of 1 removes vertical space and 2 removes underscore. */
4037
4038 if (tabopt < 0) tabopt = -tabopt;
4039 if (tabopt == 1) pbits[1] &= ~0x3c;
4040 else if (tabopt == 2) pbits[11] &= 0x7f;
4041
4042 /* Add the POSIX table or its complement into the main table that is
4043 being built and we are done. */
4044
4045 if (local_negate)
4046 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
4047 else
4048 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
4049
4050 ptr = tempptr + 1;
4051 /* Every class contains at least one < 256 character. */
4052 class_has_8bitchar = 1;
4053 /* Every class contains at least two characters. */
4054 class_one_char = 2;
4055 continue; /* End of POSIX syntax handling */
4056 }
4057
4058 /* Backslash may introduce a single character, or it may introduce one
4059 of the specials, which just set a flag. The sequence \b is a special
4060 case. Inside a class (and only there) it is treated as backspace. We
4061 assume that other escapes have more than one character in them, so
4062 speculatively set both class_has_8bitchar and class_one_char bigger
4063 than one. Unrecognized escapes fall through and are either treated
4064 as literal characters (by default), or are faulted if
4065 PCRE_EXTRA is set. */
4066
4067 if (c == CHAR_BACKSLASH)
4068 {
4069 escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
4070 TRUE);
4071 if (*errorcodeptr != 0) goto FAILED;
4072 if (escape == 0) c = ec;
4073 else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
4074 else if (escape == ESC_N) /* \N is not supported in a class */
4075 {
4076 *errorcodeptr = ERR71;
4077 goto FAILED;
4078 }
4079 else if (escape == ESC_Q) /* Handle start of quoted string */
4080 {
4081 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4082 {
4083 ptr += 2; /* avoid empty string */
4084 }
4085 else inescq = TRUE;
4086 continue;
4087 }
4088 else if (escape == ESC_E) continue; /* Ignore orphan \E */
4089
4090 else
4091 {
4092 register const pcre_uint8 *cbits = cd->cbits;
4093 /* Every class contains at least two < 256 characters. */
4094 class_has_8bitchar++;
4095 /* Every class contains at least two characters. */
4096 class_one_char += 2;
4097
4098 switch (escape)
4099 {
4100 #ifdef SUPPORT_UCP
4101 case ESC_du: /* These are the values given for \d etc */
4102 case ESC_DU: /* when PCRE_UCP is set. We replace the */
4103 case ESC_wu: /* escape sequence with an appropriate \p */
4104 case ESC_WU: /* or \P to test Unicode properties instead */
4105 case ESC_su: /* of the default ASCII testing. */
4106 case ESC_SU:
4107 nestptr = ptr;
4108 ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */
4109 class_has_8bitchar--; /* Undo! */
4110 continue;
4111 #endif
4112 case ESC_d:
4113 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
4114 continue;
4115
4116 case ESC_D:
4117 should_flip_negation = TRUE;
4118 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
4119 continue;
4120
4121 case ESC_w:
4122 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
4123 continue;
4124
4125 case ESC_W:
4126 should_flip_negation = TRUE;
4127 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
4128 continue;
4129
4130 /* Perl 5.004 onwards omits VT from \s, but we must preserve it
4131 if it was previously set by something earlier in the character
4132 class. Luckily, the value of CHAR_VT is 0x0b in both ASCII and
4133 EBCDIC, so we lazily just adjust the appropriate bit. */
4134
4135 case ESC_s:
4136 classbits[0] |= cbits[cbit_space];
4137 classbits[1] |= cbits[cbit_space+1] & ~0x08;
4138 for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
4139 continue;
4140
4141 case ESC_S:
4142 should_flip_negation = TRUE;
4143 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
4144 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
4145 continue;
4146
4147 /* The rest apply in both UCP and non-UCP cases. */
4148
4149 case ESC_h:
4150 (void)add_list_to_class(classbits, &class_uchardata, options, cd,
4151 PRIV(hspace_list), NOTACHAR);
4152 continue;
4153
4154 case ESC_H:
4155 (void)add_not_list_to_class(classbits, &class_uchardata, options,
4156 cd, PRIV(hspace_list));
4157 continue;
4158
4159 case ESC_v:
4160 (void)add_list_to_class(classbits, &class_uchardata, options, cd,
4161 PRIV(vspace_list), NOTACHAR);
4162 continue;
4163
4164 case ESC_V:
4165 (void)add_not_list_to_class(classbits, &class_uchardata, options,
4166 cd, PRIV(vspace_list));
4167 continue;
4168
4169 #ifdef SUPPORT_UCP
4170 case ESC_p:
4171 case ESC_P:
4172 {
4173 BOOL negated;
4174 unsigned int ptype = 0, pdata = 0;
4175 if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
4176 goto FAILED;
4177 *class_uchardata++ = ((escape == ESC_p) != negated)?
4178 XCL_PROP : XCL_NOTPROP;
4179 *class_uchardata++ = ptype;
4180 *class_uchardata++ = pdata;
4181 class_has_8bitchar--; /* Undo! */
4182 continue;
4183 }
4184 #endif
4185 /* Unrecognized escapes are faulted if PCRE is running in its
4186 strict mode. By default, for compatibility with Perl, they are
4187 treated as literals. */
4188
4189 default:
4190 if ((options & PCRE_EXTRA) != 0)
4191 {
4192 *errorcodeptr = ERR7;
4193 goto FAILED;
4194 }
4195 class_has_8bitchar--; /* Undo the speculative increase. */
4196 class_one_char -= 2; /* Undo the speculative increase. */
4197 c = *ptr; /* Get the final character and fall through */
4198 break;
4199 }
4200 }
4201
4202 /* Fall through if the escape just defined a single character (c >= 0).
4203 This may be greater than 256. */
4204
4205 escape = 0;
4206
4207 } /* End of backslash handling */
4208
4209 /* A character may be followed by '-' to form a range. However, Perl does
4210 not permit ']' to be the end of the range. A '-' character at the end is
4211 treated as a literal. Perl ignores orphaned \E sequences entirely. The
4212 code for handling \Q and \E is messy. */
4213
4214 CHECK_RANGE:
4215 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4216 {
4217 inescq = FALSE;
4218 ptr += 2;
4219 }
4220 oldptr = ptr;
4221
4222 /* Remember if \r or \n were explicitly used */
4223
4224 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4225
4226 /* Check for range */
4227
4228 if (!inescq && ptr[1] == CHAR_MINUS)
4229 {
4230 pcre_uint32 d;
4231 ptr += 2;
4232 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
4233
4234 /* If we hit \Q (not followed by \E) at this point, go into escaped
4235 mode. */
4236
4237 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
4238 {
4239 ptr += 2;
4240 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4241 { ptr += 2; continue; }
4242 inescq = TRUE;
4243 break;
4244 }
4245
4246 /* Minus (hyphen) at the end of a class is treated as a literal, so put
4247 back the pointer and jump to handle the character that preceded it. */
4248
4249 if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
4250 {
4251 ptr = oldptr;
4252 goto CLASS_SINGLE_CHARACTER;
4253 }
4254
4255 /* Otherwise, we have a potential range; pick up the next character */
4256
4257 #ifdef SUPPORT_UTF
4258 if (utf)
4259 { /* Braces are required because the */
4260 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
4261 }
4262 else
4263 #endif
4264 d = *ptr; /* Not UTF-8 mode */
4265
4266 /* The second part of a range can be a single-character escape, but
4267 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
4268 in such circumstances. */
4269
4270 if (!inescq && d == CHAR_BACKSLASH)
4271 {
4272 int descape;
4273 descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
4274 if (*errorcodeptr != 0) goto FAILED;
4275
4276 /* \b is backspace; any other special means the '-' was literal. */
4277
4278 if (descape != 0)
4279 {
4280 if (descape == ESC_b) d = CHAR_BS; else
4281 {
4282 ptr = oldptr;
4283 goto CLASS_SINGLE_CHARACTER; /* A few lines below */
4284 }
4285 }
4286 }
4287
4288 /* Check that the two values are in the correct order. Optimize
4289 one-character ranges. */
4290
4291 if (d < c)
4292 {
4293 *errorcodeptr = ERR8;
4294 goto FAILED;
4295 }
4296 if (d == c) goto CLASS_SINGLE_CHARACTER; /* A few lines below */
4297
4298 /* We have found a character range, so single character optimizations
4299 cannot be done anymore. Any value greater than 1 indicates that there
4300 is more than one character. */
4301
4302 class_one_char = 2;
4303
4304 /* Remember an explicit \r or \n, and add the range to the class. */
4305
4306 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4307
4308 class_has_8bitchar +=
4309 add_to_class(classbits, &class_uchardata, options, cd, c, d);
4310
4311 continue; /* Go get the next char in the class */
4312 }
4313
4314 /* Handle a single character - we can get here for a normal non-escape
4315 char, or after \ that introduces a single character or for an apparent
4316 range that isn't. Only the value 1 matters for class_one_char, so don't
4317 increase it if it is already 2 or more ... just in case there's a class
4318 with a zillion characters in it. */
4319
4320 CLASS_SINGLE_CHARACTER:
4321 if (class_one_char < 2) class_one_char++;
4322
4323 /* If class_one_char is 1, we have the first single character in the
4324 class, and there have been no prior ranges, or XCLASS items generated by
4325 escapes. If this is the final character in the class, we can optimize by
4326 turning the item into a 1-character OP_CHAR[I] if it's positive, or
4327 OP_NOT[I] if it's negative. In the positive case, it can cause firstchar
4328 to be set. Otherwise, there can be no first char if this item is first,
4329 whatever repeat count may follow. In the case of reqchar, save the
4330 previous value for reinstating. */
4331
4332 if (class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
4333 {
4334 ptr++;
4335 zeroreqchar = reqchar;
4336 zeroreqcharflags = reqcharflags;
4337
4338 if (negate_class)
4339 {
4340 #ifdef SUPPORT_UCP
4341 int d;
4342 #endif
4343 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4344 zerofirstchar = firstchar;
4345 zerofirstcharflags = firstcharflags;
4346
4347 /* For caseless UTF-8 mode when UCP support is available, check
4348 whether this character has more than one other case. If so, generate
4349 a special OP_NOTPROP item instead of OP_NOTI. */
4350
4351 #ifdef SUPPORT_UCP
4352 if (utf && (options & PCRE_CASELESS) != 0 &&
4353 (d = UCD_CASESET(c)) != 0)
4354 {
4355 *code++ = OP_NOTPROP;
4356 *code++ = PT_CLIST;
4357 *code++ = d;
4358 }
4359 else
4360 #endif
4361 /* Char has only one other case, or UCP not available */
4362
4363 {
4364 *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4365 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
4366 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4367 code += PRIV(ord2utf)(c, code);
4368 else
4369 #endif
4370 *code++ = c;
4371 }
4372
4373 /* We are finished with this character class */
4374
4375 goto END_CLASS;
4376 }
4377
4378 /* For a single, positive character, get the value into mcbuffer, and
4379 then we can handle this with the normal one-character code. */
4380
4381 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
4382 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4383 mclength = PRIV(ord2utf)(c, mcbuffer);
4384 else
4385 #endif
4386 {
4387 mcbuffer[0] = c;
4388 mclength = 1;
4389 }
4390 goto ONE_CHAR;
4391 } /* End of 1-char optimization */
4392
4393 /* There is more than one character in the class, or an XCLASS item
4394 has been generated. Add this character to the class. */
4395
4396 class_has_8bitchar +=
4397 add_to_class(classbits, &class_uchardata, options, cd, c, c);
4398 }
4399
4400 /* Loop until ']' reached. This "while" is the end of the "do" far above.
4401 If we are at the end of an internal nested string, revert to the outer
4402 string. */
4403
4404 while (((c = *(++ptr)) != CHAR_NULL ||
4405 (nestptr != NULL &&
4406 (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
4407 (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
4408
4409 /* Check for missing terminating ']' */
4410
4411 if (c == CHAR_NULL)
4412 {
4413 *errorcodeptr = ERR6;
4414 goto FAILED;
4415 }
4416
4417 /* We will need an XCLASS if data has been placed in class_uchardata. In
4418 the second phase this is a sufficient test. However, in the pre-compile
4419 phase, class_uchardata gets emptied to prevent workspace overflow, so
4420 only if the very last character in the class needs XCLASS will it contain
4421 anything at this point. For this reason, xclass gets set TRUE above when
4422 class_uchardata is emptied, and that's why this code is the way it is here
4423 instead of just doing a test on class_uchardata below. */
4424
4425 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4426 if (class_uchardata > class_uchardata_base) xclass = TRUE;
4427 #endif
4428
4429 /* If this is the first thing in the branch, there can be no first char
4430 setting, whatever the repeat count. Any reqchar setting must remain
4431 unchanged after any kind of repeat. */
4432
4433 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4434 zerofirstchar = firstchar;
4435 zerofirstcharflags = firstcharflags;
4436 zeroreqchar = reqchar;
4437 zeroreqcharflags = reqcharflags;
4438
4439 /* If there are characters with values > 255, we have to compile an
4440 extended class, with its own opcode, unless there was a negated special
4441 such as \S in the class, and PCRE_UCP is not set, because in that case all
4442 characters > 255 are in the class, so any that were explicitly given as
4443 well can be ignored. If (when there are explicit characters > 255 that must
4444 be listed) there are no characters < 256, we can omit the bitmap in the
4445 actual compiled code. */
4446
4447 #ifdef SUPPORT_UTF
4448 if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
4449 #elif !defined COMPILE_PCRE8
4450 if (xclass && !should_flip_negation)
4451 #endif
4452 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4453 {
4454 *class_uchardata++ = XCL_END; /* Marks the end of extra data */
4455 *code++ = OP_XCLASS;
4456 code += LINK_SIZE;
4457 *code = negate_class? XCL_NOT:0;
4458
4459 /* If the map is required, move up the extra data to make room for it;
4460 otherwise just move the code pointer to the end of the extra data. */
4461
4462 if (class_has_8bitchar > 0)
4463 {
4464 *code++ |= XCL_MAP;
4465 memmove(code + (32 / sizeof(pcre_uchar)), code,
4466 IN_UCHARS(class_uchardata - code));
4467 memcpy(code, classbits, 32);
4468 code = class_uchardata + (32 / sizeof(pcre_uchar));
4469 }
4470 else code = class_uchardata;
4471
4472 /* Now fill in the complete length of the item */
4473
4474 PUT(previous, 1, (int)(code - previous));
4475 break; /* End of class handling */
4476 }
4477 #endif
4478
4479 /* If there are no characters > 255, or they are all to be included or
4480 excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
4481 whole class was negated and whether there were negative specials such as \S
4482 (non-UCP) in the class. Then copy the 32-byte map into the code vector,
4483 negating it if necessary. */
4484
4485 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
4486 if (lengthptr == NULL) /* Save time in the pre-compile phase */
4487 {
4488 if (negate_class)
4489 for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
4490 memcpy(code, classbits, 32);
4491 }
4492 code += 32 / sizeof(pcre_uchar);
4493
4494 END_CLASS:
4495 break;
4496
4497
4498 /* ===================================================================*/
4499 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
4500 has been tested above. */
4501
4502 case CHAR_LEFT_CURLY_BRACKET:
4503 if (!is_quantifier) goto NORMAL_CHAR;
4504 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
4505 if (*errorcodeptr != 0) goto FAILED;
4506 goto REPEAT;
4507
4508 case CHAR_ASTERISK:
4509 repeat_min = 0;
4510 repeat_max = -1;
4511 goto REPEAT;
4512
4513 case CHAR_PLUS:
4514 repeat_min = 1;
4515 repeat_max = -1;
4516 goto REPEAT;
4517
4518 case CHAR_QUESTION_MARK:
4519 repeat_min = 0;
4520 repeat_max = 1;
4521
4522 REPEAT:
4523 if (previous == NULL)
4524 {
4525 *errorcodeptr = ERR9;
4526 goto FAILED;
4527 }
4528
4529 if (repeat_min == 0)
4530 {
4531 firstchar = zerofirstchar; /* Adjust for zero repeat */
4532 firstcharflags = zerofirstcharflags;
4533 reqchar = zeroreqchar; /* Ditto */
4534 reqcharflags = zeroreqcharflags;
4535 }
4536
4537 /* Remember whether this is a variable length repeat */
4538
4539 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
4540
4541 op_type = 0; /* Default single-char op codes */
4542 possessive_quantifier = FALSE; /* Default not possessive quantifier */
4543
4544 /* Save start of previous item, in case we have to move it up in order to
4545 insert something before it. */
4546
4547 tempcode = previous;
4548
4549 /* If the next character is '+', we have a possessive quantifier. This
4550 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
4551 If the next character is '?' this is a minimizing repeat, by default,
4552 but if PCRE_UNGREEDY is set, it works the other way round. We change the
4553 repeat type to the non-default. */
4554
4555 if (ptr[1] == CHAR_PLUS)
4556 {
4557 repeat_type = 0; /* Force greedy */
4558 possessive_quantifier = TRUE;
4559 ptr++;
4560 }
4561 else if (ptr[1] == CHAR_QUESTION_MARK)
4562 {
4563 repeat_type = greedy_non_default;
4564 ptr++;
4565 }
4566 else repeat_type = greedy_default;
4567
4568 /* If previous was a recursion call, wrap it in atomic brackets so that
4569 previous becomes the atomic group. All recursions were so wrapped in the
4570 past, but it no longer happens for non-repeated recursions. In fact, the
4571 repeated ones could be re-implemented independently so as not to need this,
4572 but for the moment we rely on the code for repeating groups. */
4573
4574 if (*previous == OP_RECURSE)
4575 {
4576 memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
4577 *previous = OP_ONCE;
4578 PUT(previous, 1, 2 + 2*LINK_SIZE);
4579 previous[2 + 2*LINK_SIZE] = OP_KET;
4580 PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
4581 code += 2 + 2 * LINK_SIZE;
4582 length_prevgroup = 3 + 3*LINK_SIZE;
4583
4584 /* When actually compiling, we need to check whether this was a forward
4585 reference, and if so, adjust the offset. */
4586
4587 if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
4588 {
4589 int offset = GET(cd->hwm, -LINK_SIZE);
4590 if (offset == previous + 1 - cd->start_code)
4591 PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
4592 }
4593 }
4594
4595 /* Now handle repetition for the different types of item. */
4596
4597 /* If previous was a character or negated character match, abolish the item
4598 and generate a repeat item instead. If a char item has a minimum of more
4599 than one, ensure that it is set in reqchar - it might not be if a sequence
4600 such as x{3} is the first thing in a branch because the x will have gone
4601 into firstchar instead. */
4602
4603 if (*previous == OP_CHAR || *previous == OP_CHARI
4604 || *previous == OP_NOT || *previous == OP_NOTI)
4605 {
4606 switch (*previous)
4607 {
4608 default: /* Make compiler happy. */
4609 case OP_CHAR: op_type = OP_STAR - OP_STAR; break;
4610 case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
4611 case OP_NOT: op_type = OP_NOTSTAR - OP_STAR; break;
4612 case OP_NOTI: op_type = OP_NOTSTARI - OP_STAR; break;
4613 }
4614
4615 /* Deal with UTF characters that take up more than one character. It's
4616 easier to write this out separately than try to macrify it. Use c to
4617 hold the length of the character in bytes, plus UTF_LENGTH to flag that
4618 it's a length rather than a small character. */
4619
4620 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
4621 if (utf && NOT_FIRSTCHAR(code[-1]))
4622 {
4623 pcre_uchar *lastchar = code - 1;
4624 BACKCHAR(lastchar);
4625 c = (int)(code - lastchar); /* Length of UTF-8 character */
4626 memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
4627 c |= UTF_LENGTH; /* Flag c as a length */
4628 }
4629 else
4630 #endif /* SUPPORT_UTF */
4631
4632 /* Handle the case of a single character - either with no UTF support, or
4633 with UTF disabled, or for a single character UTF character. */
4634 {
4635 c = code[-1];
4636 if (*previous <= OP_CHARI && repeat_min > 1)
4637 {
4638 reqchar = c;
4639 reqcharflags = req_caseopt | cd->req_varyopt;
4640 }
4641 }
4642
4643 /* If the repetition is unlimited, it pays to see if the next thing on
4644 the line is something that cannot possibly match this character. If so,
4645 automatically possessifying this item gains some performance in the case
4646 where the match fails. */
4647
4648 if (!possessive_quantifier &&
4649 repeat_max < 0 &&
4650 check_auto_possessive(previous, utf, ptr + 1, options, cd))
4651 {
4652 repeat_type = 0; /* Force greedy */
4653 possessive_quantifier = TRUE;
4654 }
4655
4656 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
4657 }
4658
4659 /* If previous was a character type match (\d or similar), abolish it and
4660 create a suitable repeat item. The code is shared with single-character
4661 repeats by setting op_type to add a suitable offset into repeat_type. Note
4662 that the Unicode property types will be present only when SUPPORT_UCP is
4663 defined, but we don't wrap the little bits of code here because it just
4664 makes it horribly messy. */
4665
4666 else if (*previous < OP_EODN)
4667 {
4668 pcre_uchar *oldcode;
4669 int prop_type, prop_value;
4670 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
4671 c = *previous;
4672
4673 if (!possessive_quantifier &&
4674 repeat_max < 0 &&
4675 check_auto_possessive(previous, utf, ptr + 1, options, cd))
4676 {
4677 repeat_type = 0; /* Force greedy */
4678 possessive_quantifier = TRUE;
4679 }
4680
4681 OUTPUT_SINGLE_REPEAT:
4682 if (*previous == OP_PROP || *previous == OP_NOTPROP)
4683 {
4684 prop_type = previous[1];
4685 prop_value = previous[2];
4686 }
4687 else prop_type = prop_value = -1;
4688
4689 oldcode = code;
4690 code = previous; /* Usually overwrite previous item */
4691
4692 /* If the maximum is zero then the minimum must also be zero; Perl allows
4693 this case, so we do too - by simply omitting the item altogether. */
4694
4695 if (repeat_max == 0) goto END_REPEAT;
4696
4697 /* Combine the op_type with the repeat_type */
4698
4699 repeat_type += op_type;
4700
4701 /* A minimum of zero is handled either as the special case * or ?, or as
4702 an UPTO, with the maximum given. */
4703
4704 if (repeat_min == 0)
4705 {
4706 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
4707 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
4708 else
4709 {
4710 *code++ = OP_UPTO + repeat_type;
4711 PUT2INC(code, 0, repeat_max);
4712 }
4713 }
4714
4715 /* A repeat minimum of 1 is optimized into some special cases. If the
4716 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
4717 left in place and, if the maximum is greater than 1, we use OP_UPTO with
4718 one less than the maximum. */
4719
4720 else if (repeat_min == 1)
4721 {
4722 if (repeat_max == -1)
4723 *code++ = OP_PLUS + repeat_type;
4724 else
4725 {
4726 code = oldcode; /* leave previous item in place */
4727 if (repeat_max == 1) goto END_REPEAT;
4728 *code++ = OP_UPTO + repeat_type;
4729 PUT2INC(code, 0, repeat_max - 1);
4730 }
4731 }
4732
4733 /* The case {n,n} is just an EXACT, while the general case {n,m} is
4734 handled as an EXACT followed by an UPTO. */
4735
4736 else
4737 {
4738 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
4739 PUT2INC(code, 0, repeat_min);
4740
4741 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
4742 we have to insert the character for the previous code. For a repeated
4743 Unicode property match, there are two extra bytes that define the
4744 required property. In UTF-8 mode, long characters have their length in
4745 c, with the UTF_LENGTH bit as a flag. */
4746
4747 if (repeat_max < 0)
4748 {
4749 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
4750 if (utf && (c & UTF_LENGTH) != 0)
4751 {
4752 memcpy(code, utf_chars, IN_UCHARS(c & 7));
4753 code += c & 7;
4754 }
4755 else
4756 #endif
4757 {
4758 *code++ = c;
4759 if (prop_type >= 0)
4760 {
4761 *code++ = prop_type;
4762 *code++ = prop_value;
4763 }
4764 }
4765 *code++ = OP_STAR + repeat_type;
4766 }
4767
4768 /* Else insert an UPTO if the max is greater than the min, again
4769 preceded by the character, for the previously inserted code. If the
4770 UPTO is just for 1 instance, we can use QUERY instead. */
4771
4772 else if (repeat_max != repeat_min)
4773 {
4774 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
4775 if (utf && (c & UTF_LENGTH) != 0)
4776 {
4777 memcpy(code, utf_chars, IN_UCHARS(c & 7));
4778 code += c & 7;
4779 }
4780 else
4781 #endif
4782 *code++ = c;
4783 if (prop_type >= 0)
4784 {
4785 *code++ = prop_type;
4786 *code++ = prop_value;
4787 }
4788 repeat_max -= repeat_min;
4789
4790 if (repeat_max == 1)
4791 {
4792 *code++ = OP_QUERY + repeat_type;
4793 }
4794 else
4795 {
4796 *code++ = OP_UPTO + repeat_type;
4797 PUT2INC(code, 0, repeat_max);
4798 }
4799 }
4800 }
4801
4802 /* The character or character type itself comes last in all cases. */
4803
4804 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
4805 if (utf && (c & UTF_LENGTH) != 0)
4806 {
4807 memcpy(code, utf_chars, IN_UCHARS(c & 7));
4808 code += c & 7;
4809 }
4810 else
4811 #endif
4812 *code++ = c;
4813
4814 /* For a repeated Unicode property match, there are two extra bytes that
4815 define the required property. */
4816
4817 #ifdef SUPPORT_UCP
4818 if (prop_type >= 0)
4819 {
4820 *code++ = prop_type;
4821 *code++ = prop_value;
4822 }
4823 #endif
4824 }
4825
4826 /* If previous was a character class or a back reference, we put the repeat
4827 stuff after it, but just skip the item if the repeat was {0,0}. */
4828
4829 else if (*previous == OP_CLASS ||
4830 *previous == OP_NCLASS ||
4831 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4832 *previous == OP_XCLASS ||
4833 #endif
4834 *previous == OP_REF ||
4835 *previous == OP_REFI)
4836 {
4837 if (repeat_max == 0)
4838 {
4839 code = previous;
4840 goto END_REPEAT;
4841 }
4842
4843 if (repeat_min == 0 && repeat_max == -1)
4844 *code++ = OP_CRSTAR + repeat_type;
4845 else if (repeat_min == 1 && repeat_max == -1)
4846 *code++ = OP_CRPLUS + repeat_type;
4847 else if (repeat_min == 0 && repeat_max == 1)
4848 *code++ = OP_CRQUERY + repeat_type;
4849 else
4850 {
4851 *code++ = OP_CRRANGE + repeat_type;
4852 PUT2INC(code, 0, repeat_min);
4853 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
4854 PUT2INC(code, 0, repeat_max);
4855 }
4856 }
4857
4858 /* If previous was a bracket group, we may have to replicate it in certain
4859 cases. Note that at this point we can encounter only the "basic" bracket
4860 opcodes such as BRA and CBRA, as this is the place where they get converted
4861 into the more special varieties such as BRAPOS and SBRA. A test for >=
4862 OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
4863 ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow
4864 repetition of assertions, but now it does, for Perl compatibility. */
4865
4866 else if (*previous >= OP_ASSERT && *previous <= OP_COND)
4867 {
4868 register int i;
4869 int len = (int)(code - previous);
4870 pcre_uchar *bralink = NULL;
4871 pcre_uchar *brazeroptr = NULL;
4872
4873 /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
4874 we just ignore the repeat. */
4875
4876 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
4877 goto END_REPEAT;
4878
4879 /* There is no sense in actually repeating assertions. The only potential
4880 use of repetition is in cases when the assertion is optional. Therefore,
4881 if the minimum is greater than zero, just ignore the repeat. If the
4882 maximum is not zero or one, set it to 1. */
4883
4884 if (*previous < OP_ONCE) /* Assertion */
4885 {
4886 if (repeat_min > 0) goto END_REPEAT;
4887 if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
4888 }
4889
4890 /* The case of a zero minimum is special because of the need to stick
4891 OP_BRAZERO in front of it, and because the group appears once in the
4892 data, whereas in other cases it appears the minimum number of times. For
4893 this reason, it is simplest to treat this case separately, as otherwise
4894 the code gets far too messy. There are several special subcases when the
4895 minimum is zero. */
4896
4897 if (repeat_min == 0)
4898 {
4899 /* If the maximum is also zero, we used to just omit the group from the
4900 output altogether, like this:
4901
4902 ** if (repeat_max == 0)
4903 ** {
4904 ** code = previous;
4905 ** goto END_REPEAT;
4906 ** }
4907
4908 However, that fails when a group or a subgroup within it is referenced
4909 as a subroutine from elsewhere in the pattern, so now we stick in
4910 OP_SKIPZERO in front of it so that it is skipped on execution. As we
4911 don't have a list of which groups are referenced, we cannot do this
4912 selectively.
4913
4914 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
4915 and do no more at this point. However, we do need to adjust any
4916 OP_RECURSE calls inside the group that refer to the group itself or any
4917 internal or forward referenced group, because the offset is from the
4918 start of the whole regex. Temporarily terminate the pattern while doing
4919 this. */
4920
4921 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
4922 {
4923 *code = OP_END;
4924 adjust_recurse(previous, 1, utf, cd, save_hwm);
4925 memmove(previous + 1, previous, IN_UCHARS(len));
4926 code++;
4927 if (repeat_max == 0)
4928 {
4929 *previous++ = OP_SKIPZERO;
4930 goto END_REPEAT;
4931 }
4932 brazeroptr = previous; /* Save for possessive optimizing */
4933 *previous++ = OP_BRAZERO + repeat_type;
4934 }
4935
4936 /* If the maximum is greater than 1 and limited, we have to replicate
4937 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
4938 The first one has to be handled carefully because it's the original
4939 copy, which has to be moved up. The remainder can be handled by code
4940 that is common with the non-zero minimum case below. We have to
4941 adjust the value of repeat_max, since one less copy is required. Once
4942 again, we may have to adjust any OP_RECURSE calls inside the group. */
4943
4944 else
4945 {
4946 int offset;
4947 *code = OP_END;
4948 adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);
4949 memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
4950 code += 2 + LINK_SIZE;
4951 *previous++ = OP_BRAZERO + repeat_type;
4952 *previous++ = OP_BRA;
4953
4954 /* We chain together the bracket offset fields that have to be
4955 filled in later when the ends of the brackets are reached. */
4956
4957 offset = (bralink == NULL)? 0 : (int)(previous - bralink);
4958 bralink = previous;
4959 PUTINC(previous, 0, offset);
4960 }
4961
4962 repeat_max--;
4963 }
4964
4965 /* If the minimum is greater than zero, replicate the group as many
4966 times as necessary, and adjust the maximum to the number of subsequent
4967 copies that we need. If we set a first char from the group, and didn't
4968 set a required char, copy the latter from the former. If there are any
4969 forward reference subroutine calls in the group, there will be entries on
4970 the workspace list; replicate these with an appropriate increment. */
4971
4972 else
4973 {
4974 if (repeat_min > 1)
4975 {
4976 /* In the pre-compile phase, we don't actually do the replication. We
4977 just adjust the length as if we had. Do some paranoid checks for
4978 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
4979 integer type when available, otherwise double. */
4980
4981 if (lengthptr != NULL)
4982 {
4983 int delta = (repeat_min - 1)*length_prevgroup;
4984 if ((INT64_OR_DOUBLE)(repeat_min - 1)*
4985 (INT64_OR_DOUBLE)length_prevgroup >
4986 (INT64_OR_DOUBLE)INT_MAX ||
4987 OFLOW_MAX - *lengthptr < delta)
4988 {
4989 *errorcodeptr = ERR20;
4990 goto FAILED;
4991 }
4992 *lengthptr += delta;
4993 }
4994
4995 /* This is compiling for real. If there is a set first byte for
4996 the group, and we have not yet set a "required byte", set it. Make
4997 sure there is enough workspace for copying forward references before
4998 doing the copy. */
4999
5000 else
5001 {
5002 if (groupsetfirstchar && reqcharflags < 0)
5003 {
5004 reqchar = firstchar;
5005 reqcharflags = firstcharflags;
5006 }
5007
5008 for (i = 1; i < repeat_min; i++)
5009 {
5010 pcre_uchar *hc;
5011 pcre_uchar *this_hwm = cd->hwm;
5012 memcpy(code, previous, IN_UCHARS(len));
5013
5014 while (cd->hwm > cd->start_workspace + cd->workspace_size -
5015 WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5016 {
5017 int save_offset = save_hwm - cd->start_workspace;
5018 int this_offset = this_hwm - cd->start_workspace;
5019 *errorcodeptr = expand_workspace(cd);
5020 if (*errorcodeptr != 0) goto FAILED;
5021 save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5022 this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5023 }
5024
5025 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5026 {
5027 PUT(cd->hwm, 0, GET(hc, 0) + len);
5028 cd->hwm += LINK_SIZE;
5029 }
5030 save_hwm = this_hwm;
5031 code += len;
5032 }
5033 }
5034 }
5035
5036 if (repeat_max > 0) repeat_max -= repeat_min;
5037 }
5038
5039 /* This code is common to both the zero and non-zero minimum cases. If
5040 the maximum is limited, it replicates the group in a nested fashion,
5041 remembering the bracket starts on a stack. In the case of a zero minimum,
5042 the first one was set up above. In all cases the repeat_max now specifies
5043 the number of additional copies needed. Again, we must remember to
5044 replicate entries on the forward reference list. */
5045
5046 if (repeat_max >= 0)
5047 {
5048 /* In the pre-compile phase, we don't actually do the replication. We
5049 just adjust the length as if we had. For each repetition we must add 1
5050 to the length for BRAZERO and for all but the last repetition we must
5051 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
5052 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
5053 a 64-bit integer type when available, otherwise double. */
5054
5055 if (lengthptr != NULL && repeat_max > 0)
5056 {
5057 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
5058 2 - 2*LINK_SIZE; /* Last one doesn't nest */
5059 if ((INT64_OR_DOUBLE)repeat_max *
5060 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
5061 > (INT64_OR_DOUBLE)INT_MAX ||
5062 OFLOW_MAX - *lengthptr < delta)
5063 {
5064 *errorcodeptr = ERR20;
5065 goto FAILED;
5066 }
5067 *lengthptr += delta;
5068 }
5069
5070 /* This is compiling for real */
5071
5072 else for (i = repeat_max - 1; i >= 0; i--)
5073 {
5074 pcre_uchar *hc;
5075 pcre_uchar *this_hwm = cd->hwm;
5076
5077 *code++ = OP_BRAZERO + repeat_type;
5078
5079 /* All but the final copy start a new nesting, maintaining the
5080 chain of brackets outstanding. */
5081
5082 if (i != 0)
5083 {
5084 int offset;
5085 *code++ = OP_BRA;
5086 offset = (bralink == NULL)? 0 : (int)(code - bralink);
5087 bralink = code;
5088 PUTINC(code, 0, offset);
5089 }
5090
5091 memcpy(code, previous, IN_UCHARS(len));
5092
5093 /* Ensure there is enough workspace for forward references before
5094 copying them. */
5095
5096 while (cd->hwm > cd->start_workspace + cd->workspace_size -
5097 WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5098 {
5099 int save_offset = save_hwm - cd->start_workspace;
5100 int this_offset = this_hwm - cd->start_workspace;
5101 *errorcodeptr = expand_workspace(cd);
5102 if (*errorcodeptr != 0) goto FAILED;
5103 save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5104 this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5105 }
5106
5107 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5108 {
5109 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
5110 cd->hwm += LINK_SIZE;
5111 }
5112 save_hwm = this_hwm;
5113 code += len;
5114 }
5115
5116 /* Now chain through the pending brackets, and fill in their length
5117 fields (which are holding the chain links pro tem). */
5118
5119 while (bralink != NULL)
5120 {
5121 int oldlinkoffset;
5122 int offset = (int)(code - bralink + 1);
5123 pcre_uchar *bra = code - offset;
5124 oldlinkoffset = GET(bra, 1);
5125 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
5126 *code++ = OP_KET;
5127 PUTINC(code, 0, offset);
5128 PUT(bra, 1, offset);
5129 }
5130 }
5131
5132 /* If the maximum is unlimited, set a repeater in the final copy. For
5133 ONCE brackets, that's all we need to do. However, possessively repeated
5134 ONCE brackets can be converted into non-capturing brackets, as the
5135 behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
5136 deal with possessive ONCEs specially.
5137
5138 Otherwise, when we are doing the actual compile phase, check to see
5139 whether this group is one that could match an empty string. If so,
5140 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
5141 that runtime checking can be done. [This check is also applied to ONCE
5142 groups at runtime, but in a different way.]
5143
5144 Then, if the quantifier was possessive and the bracket is not a
5145 conditional, we convert the BRA code to the POS form, and the KET code to
5146 KETRPOS. (It turns out to be convenient at runtime to detect this kind of
5147 subpattern at both the start and at the end.) The use of special opcodes
5148 makes it possible to reduce greatly the stack usage in pcre_exec(). If
5149 the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
5150
5151 Then, if the minimum number of matches is 1 or 0, cancel the possessive
5152 flag so that the default action below, of wrapping everything inside
5153 atomic brackets, does not happen. When the minimum is greater than 1,
5154 there will be earlier copies of the group, and so we still have to wrap
5155 the whole thing. */
5156
5157 else
5158 {
5159 pcre_uchar *ketcode = code - 1 - LINK_SIZE;
5160 pcre_uchar *bracode = ketcode - GET(ketcode, 1);
5161
5162 /* Convert possessive ONCE brackets to non-capturing */
5163
5164 if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
5165 possessive_quantifier) *bracode = OP_BRA;
5166
5167 /* For non-possessive ONCE brackets, all we need to do is to
5168 set the KET. */
5169
5170 if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
5171 *ketcode = OP_KETRMAX + repeat_type;
5172
5173 /* Handle non-ONCE brackets and possessive ONCEs (which have been
5174 converted to non-capturing above). */
5175
5176 else
5177 {
5178 /* In the compile phase, check for empty string matching. */
5179
5180 if (lengthptr == NULL)
5181 {
5182 pcre_uchar *scode = bracode;
5183 do
5184 {
5185 if (could_be_empty_branch(scode, ketcode, utf, cd, NULL))
5186 {
5187 *bracode += OP_SBRA - OP_BRA;
5188 break;
5189 }
5190 scode += GET(scode, 1);
5191 }
5192 while (*scode == OP_ALT);
5193 }
5194
5195 /* Handle possessive quantifiers. */
5196
5197 if (possessive_quantifier)
5198 {
5199 /* For COND brackets, we wrap the whole thing in a possessively
5200 repeated non-capturing bracket, because we have not invented POS
5201 versions of the COND opcodes. Because we are moving code along, we
5202 must ensure that any pending recursive references are updated. */
5203
5204 if (*bracode == OP_COND || *bracode == OP_SCOND)
5205 {
5206 int nlen = (int)(code - bracode);
5207 *code = OP_END;
5208 adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm);
5209 memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
5210 code += 1 + LINK_SIZE;
5211 nlen += 1 + LINK_SIZE;
5212 *bracode = OP_BRAPOS;
5213 *code++ = OP_KETRPOS;
5214 PUTINC(code, 0, nlen);
5215 PUT(bracode, 1, nlen);
5216 }
5217
5218 /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
5219
5220 else
5221 {
5222 *bracode += 1; /* Switch to xxxPOS opcodes */
5223 *ketcode = OP_KETRPOS;
5224 }
5225
5226 /* If the minimum is zero, mark it as possessive, then unset the
5227 possessive flag when the minimum is 0 or 1. */
5228
5229 if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
5230 if (repeat_min < 2) possessive_quantifier = FALSE;
5231 }
5232
5233 /* Non-possessive quantifier */
5234
5235 else *ketcode = OP_KETRMAX + repeat_type;
5236 }
5237 }
5238 }
5239
5240 /* If previous is OP_FAIL, it was generated by an empty class [] in
5241 JavaScript mode. The other ways in which OP_FAIL can be generated, that is
5242 by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
5243 error above. We can just ignore the repeat in JS case. */
5244
5245 else if (*previous == OP_FAIL) goto END_REPEAT;
5246
5247 /* Else there's some kind of shambles */
5248
5249 else
5250 {
5251 *errorcodeptr = ERR11;
5252 goto FAILED;
5253 }
5254
5255 /* If the character following a repeat is '+', or if certain optimization
5256 tests above succeeded, possessive_quantifier is TRUE. For some opcodes,
5257 there are special alternative opcodes for this case. For anything else, we
5258 wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'
5259 notation is just syntactic sugar, taken from Sun's Java package, but the
5260 special opcodes can optimize it.
5261
5262 Some (but not all) possessively repeated subpatterns have already been
5263 completely handled in the code just above. For them, possessive_quantifier
5264 is always FALSE at this stage.
5265
5266 Note that the repeated item starts at tempcode, not at previous, which
5267 might be the first part of a string whose (former) last char we repeated.
5268
5269 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
5270 an 'upto' may follow. We skip over an 'exact' item, and then test the
5271 length of what remains before proceeding. */
5272
5273 if (possessive_quantifier)
5274 {
5275 int len;
5276
5277 if (*tempcode == OP_TYPEEXACT)
5278 tempcode += PRIV(OP_lengths)[*tempcode] +
5279 ((tempcode[1 + IMM2_SIZE] == OP_PROP
5280 || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
5281
5282 else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
5283 {
5284 tempcode += PRIV(OP_lengths)[*tempcode];
5285 #ifdef SUPPORT_UTF
5286 if (utf && HAS_EXTRALEN(tempcode[-1]))
5287 tempcode += GET_EXTRALEN(tempcode[-1]);
5288 #endif
5289 }
5290
5291 len = (int)(code - tempcode);
5292 if (len > 0) switch (*tempcode)
5293 {
5294 case OP_STAR: *tempcode = OP_POSSTAR; break;
5295 case OP_PLUS: *tempcode = OP_POSPLUS; break;
5296 case OP_QUERY: *tempcode = OP_POSQUERY; break;
5297 case OP_UPTO: *tempcode = OP_POSUPTO; break;
5298
5299 case OP_STARI: *tempcode = OP_POSSTARI; break;
5300 case OP_PLUSI: *tempcode = OP_POSPLUSI; break;
5301 case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
5302 case OP_UPTOI: *tempcode = OP_POSUPTOI; break;
5303
5304 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
5305 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
5306 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
5307 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
5308
5309 case OP_NOTSTARI: *tempcode = OP_NOTPOSSTARI; break;
5310 case OP_NOTPLUSI: *tempcode = OP_NOTPOSPLUSI; break;
5311 case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
5312 case OP_NOTUPTOI: *tempcode = OP_NOTPOSUPTOI; break;
5313
5314 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
5315 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
5316 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
5317 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
5318
5319 /* Because we are moving code along, we must ensure that any
5320 pending recursive references are updated. */
5321
5322 default:
5323 *code = OP_END;
5324 adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
5325 memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
5326 code += 1 + LINK_SIZE;
5327 len += 1 + LINK_SIZE;
5328 tempcode[0] = OP_ONCE;
5329 *code++ = OP_KET;
5330 PUTINC(code, 0, len);
5331 PUT(tempcode, 1, len);
5332 break;
5333 }
5334 }
5335
5336 /* In all cases we no longer have a previous item. We also set the
5337 "follows varying string" flag for subsequently encountered reqchars if
5338 it isn't already set and we have just passed a varying length item. */
5339
5340 END_REPEAT:
5341 previous = NULL;
5342 cd->req_varyopt |= reqvary;
5343 break;
5344
5345
5346 /* ===================================================================*/
5347 /* Start of nested parenthesized sub-expression, or comment or lookahead or
5348 lookbehind or option setting or condition or all the other extended
5349 parenthesis forms. */
5350
5351 case CHAR_LEFT_PARENTHESIS:
5352 newoptions = options;
5353 skipbytes = 0;
5354 bravalue = OP_CBRA;
5355 save_hwm = cd->hwm;
5356 reset_bracount = FALSE;
5357
5358 /* First deal with various "verbs" that can be introduced by '*'. */
5359
5360 ptr++;
5361 if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
5362 || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
5363 {
5364 int i, namelen;
5365 int arglen = 0;
5366 const char *vn = verbnames;
5367 const pcre_uchar *name = ptr + 1;
5368 const pcre_uchar *arg = NULL;
5369 previous = NULL;
5370 ptr++;
5371 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
5372 namelen = (int)(ptr - name);
5373
5374 /* It appears that Perl allows any characters whatsoever, other than
5375 a closing parenthesis, to appear in arguments, so we no longer insist on
5376 letters, digits, and underscores. */
5377
5378 if (*ptr == CHAR_COLON)
5379 {
5380 arg = ++ptr;
5381 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
5382 arglen = (int)(ptr - arg);
5383 if ((unsigned int)arglen > MAX_MARK)
5384 {
5385 *errorcodeptr = ERR75;
5386 goto FAILED;
5387 }
5388 }
5389
5390 if (*ptr != CHAR_RIGHT_PARENTHESIS)
5391 {
5392 *errorcodeptr = ERR60;
5393 goto FAILED;
5394 }
5395
5396 /* Scan the table of verb names */
5397
5398 for (i = 0; i < verbcount; i++)
5399 {
5400 if (namelen == verbs[i].len &&
5401 STRNCMP_UC_C8(name, vn, namelen) == 0)
5402 {
5403 int setverb;
5404
5405 /* Check for open captures before ACCEPT and convert it to
5406 ASSERT_ACCEPT if in an assertion. */
5407
5408 if (verbs[i].op == OP_ACCEPT)
5409 {
5410 open_capitem *oc;
5411 if (arglen != 0)
5412 {
5413 *errorcodeptr = ERR59;
5414 goto FAILED;
5415 }
5416 cd->had_accept = TRUE;
5417 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
5418 {
5419 *code++ = OP_CLOSE;
5420 PUT2INC(code, 0, oc->number);
5421 }
5422 setverb = *code++ =
5423 (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
5424
5425 /* Do not set firstchar after *ACCEPT */
5426 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5427 }
5428
5429 /* Handle other cases with/without an argument */
5430
5431 else if (arglen == 0)
5432 {
5433 if (verbs[i].op < 0) /* Argument is mandatory */
5434 {
5435 *errorcodeptr = ERR66;
5436 goto FAILED;
5437 }
5438 setverb = *code++ = verbs[i].op;
5439 }
5440
5441 else
5442 {
5443 if (verbs[i].op_arg < 0) /* Argument is forbidden */
5444 {
5445 *errorcodeptr = ERR59;
5446 goto FAILED;
5447 }
5448 setverb = *code++ = verbs[i].op_arg;
5449 *code++ = arglen;
5450 memcpy(code, arg, IN_UCHARS(arglen));
5451 code += arglen;
5452 *code++ = 0;
5453 }
5454
5455 switch (setverb)
5456 {
5457 case OP_THEN:
5458 case OP_THEN_ARG:
5459 cd->external_flags |= PCRE_HASTHEN;
5460 break;
5461
5462 case OP_PRUNE:
5463 case OP_PRUNE_ARG:
5464 case OP_SKIP:
5465 case OP_SKIP_ARG:
5466 cd->had_pruneorskip = TRUE;
5467 break;
5468 }
5469
5470 break; /* Found verb, exit loop */
5471 }
5472
5473 vn += verbs[i].len + 1;
5474 }
5475
5476 if (i < verbcount) continue; /* Successfully handled a verb */
5477 *errorcodeptr = ERR60; /* Verb not recognized */
5478 goto FAILED;
5479 }
5480
5481 /* Deal with the extended parentheses; all are introduced by '?', and the
5482 appearance of any of them means that this is not a capturing group. */
5483
5484 else if (*ptr == CHAR_QUESTION_MARK)
5485 {
5486 int i, set, unset, namelen;
5487 int *optset;
5488 const pcre_uchar *name;
5489 pcre_uchar *slot;
5490
5491 switch (*(++ptr))
5492 {
5493 case CHAR_NUMBER_SIGN: /* Comment; skip to ket */
5494 ptr++;
5495 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
5496 if (*ptr == CHAR_NULL)
5497 {
5498 *errorcodeptr = ERR18;
5499 goto FAILED;
5500 }
5501 continue;
5502
5503
5504 /* ------------------------------------------------------------ */
5505 case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
5506 reset_bracount = TRUE;
5507 /* Fall through */
5508
5509 /* ------------------------------------------------------------ */
5510 case CHAR_COLON: /* Non-capturing bracket */
5511 bravalue = OP_BRA;
5512 ptr++;
5513 break;
5514
5515
5516 /* ------------------------------------------------------------ */
5517 case CHAR_LEFT_PARENTHESIS:
5518 bravalue = OP_COND; /* Conditional group */
5519 tempptr = ptr;
5520
5521 /* A condition can be an assertion, a number (referring to a numbered
5522 group), a name (referring to a named group), or 'R', referring to
5523 recursion. R<digits> and R&name are also permitted for recursion tests.
5524
5525 There are several syntaxes for testing a named group: (?(name)) is used
5526 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
5527
5528 There are two unfortunate ambiguities, caused by history. (a) 'R' can
5529 be the recursive thing or the name 'R' (and similarly for 'R' followed
5530 by digits), and (b) a number could be a name that consists of digits.
5531 In both cases, we look for a name first; if not found, we try the other
5532 cases.
5533
5534 For compatibility with auto-callouts, we allow a callout to be
5535 specified before a condition that is an assertion. First, check for the
5536 syntax of a callout; if found, adjust the temporary pointer that is
5537 used to check for an assertion condition. That's all that is needed! */
5538
5539 if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
5540 {
5541 for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
5542 if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
5543 tempptr += i + 1;
5544 }
5545
5546 /* For conditions that are assertions, check the syntax, and then exit
5547 the switch. This will take control down to where bracketed groups,
5548 including assertions, are processed. */
5549
5550 if (tempptr[1] == CHAR_QUESTION_MARK &&
5551 (tempptr[2] == CHAR_EQUALS_SIGN ||
5552 tempptr[2] == CHAR_EXCLAMATION_MARK ||
5553 tempptr[2] == CHAR_LESS_THAN_SIGN))
5554 break;
5555
5556 /* Most other conditions use OP_CREF (a couple change to OP_RREF
5557 below), and all need to skip 1+IMM2_SIZE bytes at the start of the group. */
5558
5559 code[1+LINK_SIZE] = OP_CREF;
5560 skipbytes = 1+IMM2_SIZE;
5561 refsign = -1;
5562
5563 /* Check for a test for recursion in a named group. */
5564
5565 if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
5566 {
5567 terminator = -1;
5568 ptr += 2;
5569 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
5570 }
5571
5572 /* Check for a test for a named group's having been set, using the Perl
5573 syntax (?(<name>) or (?('name') */
5574
5575 else if (ptr[1] == CHAR_LESS_THAN_SIGN)
5576 {
5577 terminator = CHAR_GREATER_THAN_SIGN;
5578 ptr++;
5579 }
5580 else if (ptr[1] == CHAR_APOSTROPHE)
5581 {
5582 terminator = CHAR_APOSTROPHE;
5583 ptr++;
5584 }
5585 else
5586 {
5587 terminator = CHAR_NULL;
5588 if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
5589 }
5590
5591 /* We now expect to read a name; anything else is an error */
5592
5593 if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)
5594 {
5595 ptr += 1; /* To get the right offset */
5596 *errorcodeptr = ERR28;
5597 goto FAILED;
5598 }
5599
5600 /* Read the name, but also get it as a number if it's all digits */
5601
5602 recno = 0;
5603 name = ++ptr;
5604 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
5605 {
5606 if (recno >= 0)
5607 recno = (IS_DIGIT(*ptr))? recno * 10 + (int)(*ptr - CHAR_0) : -1;
5608 ptr++;
5609 }
5610 namelen = (int)(ptr - name);
5611
5612 if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
5613 *ptr++ != CHAR_RIGHT_PARENTHESIS)
5614 {
5615 ptr--; /* Error offset */
5616 *errorcodeptr = ERR26;
5617 goto FAILED;
5618 }
5619
5620 /* Do no further checking in the pre-compile phase. */
5621
5622 if (lengthptr != NULL) break;
5623
5624 /* In the real compile we do the work of looking for the actual
5625 reference. If the string started with "+" or "-" we require the rest to
5626 be digits, in which case recno will be set. */
5627
5628 if (refsign > 0)
5629 {
5630 if (recno <= 0)
5631 {
5632 *errorcodeptr = ERR58;
5633 goto FAILED;
5634 }
5635 recno = (refsign == CHAR_MINUS)?
5636 cd->bracount - recno + 1 : recno +cd->bracount;
5637 if (recno <= 0 || recno > cd->final_bracount)
5638 {
5639 *errorcodeptr = ERR15;
5640 goto FAILED;
5641 }
5642 PUT2(code, 2+LINK_SIZE, recno);
5643 break;
5644 }
5645
5646 /* Otherwise (did not start with "+" or "-"), start by looking for the
5647 name. If we find a name, add one to the opcode to change OP_CREF or
5648 OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,
5649 except they record that the reference was originally to a name. The
5650 information is used to check duplicate names. */
5651
5652 slot = cd->name_table;
5653 for (i = 0; i < cd->names_found; i++)
5654 {
5655 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
5656 slot += cd->name_entry_size;
5657 }
5658
5659 /* Found the named subpattern */
5660
5661 if (i < cd->names_found)
5662 {
5663 recno = GET2(slot, 0);
5664 PUT2(code, 2+LINK_SIZE, recno);
5665 code[1+LINK_SIZE]++;
5666 }
5667
5668 /* If terminator == CHAR_NULL it means that the name followed directly
5669 after the opening parenthesis [e.g. (?(abc)...] and in this case there
5670 are some further alternatives to try. For the cases where terminator !=
5671 0 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
5672 now checked all the possibilities, so give an error. */
5673
5674 else if (terminator != CHAR_NULL)
5675 {
5676 *errorcodeptr = ERR15;
5677 goto FAILED;
5678 }
5679
5680 /* Check for (?(R) for recursion. Allow digits after R to specify a
5681 specific group number. */
5682
5683 else if (*name == CHAR_R)
5684 {
5685 recno = 0;
5686 for (i = 1; i < namelen; i++)
5687 {
5688 if (!IS_DIGIT(name[i]))
5689 {
5690 *errorcodeptr = ERR15;
5691 goto FAILED;
5692 }
5693 recno = recno * 10 + name[i] - CHAR_0;
5694 }
5695 if (recno == 0) recno = RREF_ANY;
5696 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
5697 PUT2(code, 2+LINK_SIZE, recno);
5698 }
5699
5700 /* Similarly, check for the (?(DEFINE) "condition", which is always
5701 false. */
5702
5703 else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
5704 {
5705 code[1+LINK_SIZE] = OP_DEF;
5706 skipbytes = 1;
5707 }
5708
5709 /* Check for the "name" actually being a subpattern number. We are
5710 in the second pass here, so final_bracount is set. */
5711
5712 else if (recno > 0 && recno <= cd->final_bracount)
5713 {
5714 PUT2(code, 2+LINK_SIZE, recno);
5715 }
5716
5717 /* Either an unidentified subpattern, or a reference to (?(0) */
5718
5719 else
5720 {
5721 *errorcodeptr = (recno == 0)? ERR35: ERR15;
5722 goto FAILED;
5723 }
5724 break;
5725
5726
5727 /* ------------------------------------------------------------ */
5728 case CHAR_EQUALS_SIGN: /* Positive lookahead */
5729 bravalue = OP_ASSERT;
5730 cd->assert_depth += 1;
5731 ptr++;
5732 break;
5733
5734
5735 /* ------------------------------------------------------------ */
5736 case CHAR_EXCLAMATION_MARK: /* Negative lookahead */
5737 ptr++;
5738 if (*ptr == CHAR_RIGHT_PARENTHESIS) /* Optimize (?!) */
5739 {
5740 *code++ = OP_FAIL;
5741 previous = NULL;
5742 continue;
5743 }
5744 bravalue = OP_ASSERT_NOT;
5745 cd->assert_depth += 1;
5746 break;
5747
5748
5749 /* ------------------------------------------------------------ */
5750 case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */
5751 switch (ptr[1])
5752 {
5753 case CHAR_EQUALS_SIGN: /* Positive lookbehind */
5754 bravalue = OP_ASSERTBACK;
5755 cd->assert_depth += 1;
5756 ptr += 2;
5757 break;
5758
5759 case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */
5760 bravalue = OP_ASSERTBACK_NOT;
5761 cd->assert_depth += 1;
5762 ptr += 2;
5763 break;
5764
5765 default: /* Could be name define, else bad */
5766 if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
5767 goto DEFINE_NAME;
5768 ptr++; /* Correct offset for error */
5769 *errorcodeptr = ERR24;
5770 goto FAILED;
5771 }
5772 break;
5773
5774
5775 /* ------------------------------------------------------------ */
5776 case CHAR_GREATER_THAN_SIGN: /* One-time brackets */
5777 bravalue = OP_ONCE;
5778 ptr++;
5779 break;
5780
5781
5782 /* ------------------------------------------------------------ */
5783 case CHAR_C: /* Callout - may be followed by digits; */
5784 previous_callout = code; /* Save for later completion */
5785 after_manual_callout = 1; /* Skip one item before completing */
5786 *code++ = OP_CALLOUT;
5787 {
5788 int n = 0;
5789 ptr++;
5790 while(IS_DIGIT(*ptr))
5791 n = n * 10 + *ptr++ - CHAR_0;
5792 if (*ptr != CHAR_RIGHT_PARENTHESIS)
5793 {
5794 *errorcodeptr = ERR39;
5795 goto FAILED;
5796 }
5797 if (n > 255)
5798 {
5799 *errorcodeptr = ERR38;
5800 goto FAILED;
5801 }
5802 *code++ = n;
5803 PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
5804 PUT(code, LINK_SIZE, 0); /* Default length */
5805 code += 2 * LINK_SIZE;
5806 }
5807 previous = NULL;
5808 continue;
5809
5810
5811 /* ------------------------------------------------------------ */
5812 case CHAR_P: /* Python-style named subpattern handling */
5813 if (*(++ptr) == CHAR_EQUALS_SIGN ||
5814 *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */
5815 {
5816 is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
5817 terminator = CHAR_RIGHT_PARENTHESIS;
5818 goto NAMED_REF_OR_RECURSE;
5819 }
5820 else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */
5821 {
5822 *errorcodeptr = ERR41;
5823 goto FAILED;
5824 }
5825 /* Fall through to handle (?P< as (?< is handled */
5826
5827
5828 /* ------------------------------------------------------------ */
5829 DEFINE_NAME: /* Come here from (?< handling */
5830 case CHAR_APOSTROPHE:
5831 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
5832 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5833 name = ++ptr;
5834
5835 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
5836 namelen = (int)(ptr - name);
5837
5838 /* In the pre-compile phase, do a syntax check, remember the longest
5839 name, and then remember the group in a vector, expanding it if
5840 necessary. Duplicates for the same number are skipped; other duplicates
5841 are checked for validity. In the actual compile, there is nothing to
5842 do. */
5843
5844 if (lengthptr != NULL)
5845 {
5846 named_group *ng;
5847 pcre_uint32 number = cd->bracount + 1;
5848
5849 if (*ptr != (pcre_uchar)terminator)
5850 {
5851 *errorcodeptr = ERR42;
5852 goto FAILED;
5853 }
5854
5855 if (cd->names_found >= MAX_NAME_COUNT)
5856 {
5857 *errorcodeptr = ERR49;
5858 goto FAILED;
5859 }
5860
5861 if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
5862 {
5863 cd->name_entry_size = namelen + IMM2_SIZE + 1;
5864 if (namelen > MAX_NAME_SIZE)
5865 {
5866 *errorcodeptr = ERR48;
5867 goto FAILED;
5868 }
5869 }
5870
5871 /* Scan the list to check for duplicates. For duplicate names, if the
5872 number is the same, break the loop, which causes the name to be
5873 discarded; otherwise, if DUPNAMES is not set, give an error.
5874 If it is set, allow the name with a different number, but continue
5875 scanning in case this is a duplicate with the same number. For
5876 non-duplicate names, give an error if the number is duplicated. */
5877
5878 ng = cd->named_groups;
5879 for (i = 0; i < cd->names_found; i++, ng++)
5880 {
5881 if (namelen == ng->length &&
5882 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
5883 {
5884 if (ng->number == number) break;
5885 if ((options & PCRE_DUPNAMES) == 0)
5886 {
5887 *errorcodeptr = ERR43;
5888 goto FAILED;
5889 }
5890 }
5891 else if (ng->number == number)
5892 {
5893 *errorcodeptr = ERR65;
5894 goto FAILED;
5895 }
5896 }
5897
5898 if (i >= cd->names_found) /* Not a duplicate with same number */
5899 {
5900 /* Increase the list size if necessary */
5901
5902 if (cd->names_found >= cd->named_group_list_size)
5903 {
5904 int newsize = cd->named_group_list_size * 2;
5905 named_group *newspace = (PUBL(malloc))
5906 (newsize * sizeof(named_group));
5907
5908 if (newspace == NULL)
5909 {
5910 *errorcodeptr = ERR21;
5911 goto FAILED;
5912 }
5913
5914 memcpy(newspace, cd->named_groups,
5915 cd->named_group_list_size * sizeof(named_group));
5916 if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
5917 (PUBL(free))((void *)cd->named_groups);
5918 cd->named_groups = newspace;
5919 cd->named_group_list_size = newsize;
5920 }
5921
5922 cd->named_groups[cd->names_found].name = name;
5923 cd->named_groups[cd->names_found].length = namelen;
5924 cd->named_groups[cd->names_found].number = number;
5925 cd->names_found++;
5926 }
5927 }
5928
5929 ptr++; /* Move past > or ' in both passes. */
5930 goto NUMBERED_GROUP;
5931
5932
5933 /* ------------------------------------------------------------ */
5934 case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */
5935 terminator = CHAR_RIGHT_PARENTHESIS;
5936 is_recurse = TRUE;
5937 /* Fall through */
5938
5939 /* We come here from the Python syntax above that handles both
5940 references (?P=name) and recursion (?P>name), as well as falling
5941 through from the Perl recursion syntax (?&name). We also come here from
5942 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
5943 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
5944
5945 NAMED_REF_OR_RECURSE:
5946 name = ++ptr;
5947 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
5948 namelen = (int)(ptr - name);
5949
5950 /* In the pre-compile phase, do a syntax check. We used to just set
5951 a dummy reference number, because it was not used in the first pass.
5952 However, with the change of recursive back references to be atomic,
5953 we have to look for the number so that this state can be identified, as
5954 otherwise the incorrect length is computed. If it's not a backwards
5955 reference, the dummy number will do. */
5956
5957 if (lengthptr != NULL)
5958 {
5959 named_group *ng;
5960
5961 if (namelen == 0)
5962 {
5963 *errorcodeptr = ERR62;
5964 goto FAILED;
5965 }
5966 if (*ptr != (pcre_uchar)terminator)
5967 {
5968 *errorcodeptr = ERR42;
5969 goto FAILED;
5970 }
5971 if (namelen > MAX_NAME_SIZE)
5972 {
5973 *errorcodeptr = ERR48;
5974 goto FAILED;
5975 }
5976
5977 /* The name table does not exist in the first pass; instead we must
5978 scan the list of names encountered so far in order to get the
5979 number. The number may be negative if it is for a name that may be
5980 duplicated. If the name is not found, set the value to 0 for a
5981 forward reference. */
5982
5983 ng = cd->named_groups;
5984 for (i = 0; i < cd->names_found; i++, ng++)
5985 {
5986 if (namelen == ng->length &&
5987 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
5988 break;
5989 }
5990 recno = (i < cd->names_found)? ng->number : 0;
5991 }
5992
5993 /* In the real compile, search the name table. We check the name
5994 first, and then check that we have reached the end of the name in the
5995 table. That way, if the name is longer than any in the table, the
5996 comparison will fail without reading beyond the table entry. */
5997
5998 else
5999 {
6000 slot = cd->name_table;
6001 for (i = 0; i < cd->names_found; i++)
6002 {
6003 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
6004 slot[IMM2_SIZE+namelen] == 0)
6005 break;
6006 slot += cd->name_entry_size;
6007 }
6008
6009 if (i < cd->names_found)
6010 {
6011 recno = GET2(slot, 0);
6012 }
6013 else
6014 {
6015 *errorcodeptr = ERR15;
6016 goto FAILED;
6017 }
6018 }
6019
6020 /* In both phases, we can now go to the code that handles numerical
6021 recursion or backreferences. */
6022
6023 if (is_recurse) goto HANDLE_RECURSION;
6024 else goto HANDLE_REFERENCE;
6025
6026
6027 /* ------------------------------------------------------------ */
6028 case CHAR_R: /* Recursion */
6029 ptr++; /* Same as (?0) */
6030 /* Fall through */
6031
6032
6033 /* ------------------------------------------------------------ */
6034 case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */
6035 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
6036 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
6037 {
6038 const pcre_uchar *called;
6039 terminator = CHAR_RIGHT_PARENTHESIS;
6040
6041 /* Come here from the \g<...> and \g'...' code (Oniguruma
6042 compatibility). However, the syntax has been checked to ensure that
6043 the ... are a (signed) number, so that neither ERR63 nor ERR29 will
6044 be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
6045 ever be taken. */
6046
6047 HANDLE_NUMERICAL_RECURSION:
6048
6049 if ((refsign = *ptr) == CHAR_PLUS)
6050 {
6051 ptr++;
6052 if (!IS_DIGIT(*ptr))
6053 {
6054 *errorcodeptr = ERR63;
6055 goto FAILED;
6056 }
6057 }
6058 else if (refsign == CHAR_MINUS)
6059 {
6060 if (!IS_DIGIT(ptr[1]))
6061 goto OTHER_CHAR_AFTER_QUERY;
6062 ptr++;
6063 }
6064
6065 recno = 0;
6066 while(IS_DIGIT(*ptr))
6067 recno = recno * 10 + *ptr++ - CHAR_0;
6068
6069 if (*ptr != (pcre_uchar)terminator)
6070 {
6071 *errorcodeptr = ERR29;
6072 goto FAILED;
6073 }
6074
6075 if (refsign == CHAR_MINUS)
6076 {
6077 if (recno == 0)
6078 {
6079 *errorcodeptr = ERR58;
6080 goto FAILED;
6081 }
6082 recno = cd->bracount - recno + 1;
6083 if (recno <= 0)
6084 {
6085 *errorcodeptr = ERR15;
6086 goto FAILED;
6087 }
6088 }
6089 else if (refsign == CHAR_PLUS)
6090 {
6091 if (recno == 0)
6092 {
6093 *errorcodeptr = ERR58;
6094 goto FAILED;
6095 }
6096 recno += cd->bracount;
6097 }
6098
6099 /* Come here from code above that handles a named recursion */
6100
6101 HANDLE_RECURSION:
6102
6103 previous = code;
6104 called = cd->start_code;
6105
6106 /* When we are actually compiling, find the bracket that is being
6107 referenced. Temporarily end the regex in case it doesn't exist before
6108 this point. If we end up with a forward reference, first check that
6109 the bracket does occur later so we can give the error (and position)
6110 now. Then remember this forward reference in the workspace so it can
6111 be filled in at the end. */
6112
6113 if (lengthptr == NULL)
6114 {
6115 *code = OP_END;
6116 if (recno != 0)
6117 called = PRIV(find_bracket)(cd->start_code, utf, recno);
6118
6119 /* Forward reference */
6120
6121 if (called == NULL)
6122 {
6123 if (recno > cd->final_bracount)
6124 {
6125 *errorcodeptr = ERR15;
6126 goto FAILED;
6127 }
6128
6129 /* Fudge the value of "called" so that when it is inserted as an
6130 offset below, what is actually inserted is the reference number
6131 of the group. Then remember the forward reference. */
6132
6133 called = cd->start_code + recno;
6134 if (cd->hwm >= cd->start_workspace + cd->workspace_size -
6135 WORK_SIZE_SAFETY_MARGIN)
6136 {
6137 *errorcodeptr = expand_workspace(cd);
6138 if (*errorcodeptr != 0) goto FAILED;
6139 }
6140 PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
6141 }
6142
6143 /* If not a forward reference, and the subpattern is still open,
6144 this is a recursive call. We check to see if this is a left
6145 recursion that could loop for ever, and diagnose that case. We
6146 must not, however, do this check if we are in a conditional
6147 subpattern because the condition might be testing for recursion in
6148 a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid.
6149 Forever loops are also detected at runtime, so those that occur in
6150 conditional subpatterns will be picked up then. */
6151
6152 else if (GET(called, 1) == 0 && cond_depth <= 0 &&
6153 could_be_empty(called, code, bcptr, utf, cd))
6154 {
6155 *errorcodeptr = ERR40;
6156 goto FAILED;
6157 }
6158 }
6159
6160 /* Insert the recursion/subroutine item. It does not have a set first
6161 character (relevant if it is repeated, because it will then be
6162 wrapped with ONCE brackets). */
6163
6164 *code = OP_RECURSE;
6165 PUT(code, 1, (int)(called - cd->start_code));
6166 code += 1 + LINK_SIZE;
6167 groupsetfirstchar = FALSE;
6168 }
6169
6170 /* Can't determine a first byte now */
6171
6172 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
6173 continue;
6174
6175
6176 /* ------------------------------------------------------------ */
6177 default: /* Other characters: check option setting */
6178 OTHER_CHAR_AFTER_QUERY:
6179 set = unset = 0;
6180 optset = &set;
6181
6182 while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
6183 {
6184 switch (*ptr++)
6185 {
6186 case CHAR_MINUS: optset = &unset; break;
6187
6188 case CHAR_J: /* Record that it changed in the external options */
6189 *optset |= PCRE_DUPNAMES;
6190 cd->external_flags |= PCRE_JCHANGED;
6191 break;
6192
6193 case CHAR_i: *optset |= PCRE_CASELESS; break;
6194 case CHAR_m: *optset |= PCRE_MULTILINE; break;
6195 case CHAR_s: *optset |= PCRE_DOTALL; break;
6196 case CHAR_x: *optset |= PCRE_EXTENDED; break;
6197 case CHAR_U: *optset |= PCRE_UNGREEDY; break;
6198 case CHAR_X: *optset |= PCRE_EXTRA; break;
6199
6200 default: *errorcodeptr = ERR12;
6201 ptr--; /* Correct the offset */
6202 goto FAILED;
6203 }
6204 }
6205
6206 /* Set up the changed option bits, but don't change anything yet. */
6207
6208 newoptions = (options | set) & (~unset);
6209
6210 /* If the options ended with ')' this is not the start of a nested
6211 group with option changes, so the options change at this level. If this
6212 item is right at the start of the pattern, the options can be
6213 abstracted and made external in the pre-compile phase, and ignored in
6214 the compile phase. This can be helpful when matching -- for instance in
6215 caseless checking of required bytes.
6216
6217 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
6218 definitely *not* at the start of the pattern because something has been
6219 compiled. In the pre-compile phase, however, the code pointer can have
6220 that value after the start, because it gets reset as code is discarded
6221 during the pre-compile. However, this can happen only at top level - if
6222 we are within parentheses, the starting BRA will still be present. At
6223 any parenthesis level, the length value can be used to test if anything
6224 has been compiled at that level. Thus, a test for both these conditions
6225 is necessary to ensure we correctly detect the start of the pattern in
6226 both phases.
6227
6228 If we are not at the pattern start, reset the greedy defaults and the
6229 case value for firstchar and reqchar. */
6230
6231 if (*ptr == CHAR_RIGHT_PARENTHESIS)
6232 {
6233 if (code == cd->start_code + 1 + LINK_SIZE &&
6234 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
6235 {
6236 cd->external_options = newoptions;
6237 }
6238 else
6239 {
6240 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
6241 greedy_non_default = greedy_default ^ 1;
6242 req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
6243 }
6244
6245 /* Change options at this level, and pass them back for use
6246 in subsequent branches. */
6247
6248 *optionsptr = options = newoptions;
6249 previous = NULL; /* This item can't be repeated */
6250 continue; /* It is complete */
6251 }
6252
6253 /* If the options ended with ':' we are heading into a nested group
6254 with possible change of options. Such groups are non-capturing and are
6255 not assertions of any kind. All we need to do is skip over the ':';
6256 the newoptions value is handled below. */
6257
6258 bravalue = OP_BRA;
6259 ptr++;
6260 } /* End of switch for character following (? */
6261 } /* End of (? handling */
6262
6263 /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
6264 is set, all unadorned brackets become non-capturing and behave like (?:...)
6265 brackets. */
6266
6267 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
6268 {
6269 bravalue = OP_BRA;
6270 }
6271
6272 /* Else we have a capturing group. */
6273
6274 else
6275 {
6276 NUMBERED_GROUP:
6277 cd->bracount += 1;
6278 PUT2(code, 1+LINK_SIZE, cd->bracount);
6279 skipbytes = IMM2_SIZE;
6280 }
6281
6282 /* Process nested bracketed regex. Assertions used not to be repeatable,
6283 but this was changed for Perl compatibility, so all kinds can now be
6284 repeated. We copy code into a non-register variable (tempcode) in order to
6285 be able to pass its address because some compilers complain otherwise. */
6286
6287 previous = code; /* For handling repetition */
6288 *code = bravalue;
6289 tempcode = code;
6290 tempreqvary = cd->req_varyopt; /* Save value before bracket */
6291 tempbracount = cd->bracount; /* Save value before bracket */
6292 length_prevgroup = 0; /* Initialize for pre-compile phase */
6293
6294 if (!compile_regex(
6295 newoptions, /* The complete new option state */
6296 &tempcode, /* Where to put code (updated) */
6297 &ptr, /* Input pointer (updated) */
6298 errorcodeptr, /* Where to put an error message */
6299 (bravalue == OP_ASSERTBACK ||
6300 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
6301 reset_bracount, /* True if (?| group */
6302 skipbytes, /* Skip over bracket number */
6303 cond_depth +
6304 ((bravalue == OP_COND)?1:0), /* Depth of condition subpatterns */
6305 &subfirstchar, /* For possible first char */
6306 &subfirstcharflags,
6307 &subreqchar, /* For possible last char */
6308 &subreqcharflags,
6309 bcptr, /* Current branch chain */
6310 cd, /* Tables block */
6311 (lengthptr == NULL)? NULL : /* Actual compile phase */
6312 &length_prevgroup /* Pre-compile phase */
6313 ))
6314 goto FAILED;
6315
6316 /* If this was an atomic group and there are no capturing groups within it,
6317 generate OP_ONCE_NC instead of OP_ONCE. */
6318
6319 if (bravalue == OP_ONCE && cd->bracount <= tempbracount)
6320 *code = OP_ONCE_NC;
6321
6322 if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
6323 cd->assert_depth -= 1;
6324
6325 /* At the end of compiling, code is still pointing to the start of the
6326 group, while tempcode has been updated to point past the end of the group.
6327 The pattern pointer (ptr) is on the bracket.
6328
6329 If this is a conditional bracket, check that there are no more than
6330 two branches in the group, or just one if it's a DEFINE group. We do this
6331 in the real compile phase, not in the pre-pass, where the whole group may
6332 not be available. */
6333
6334 if (bravalue == OP_COND && lengthptr == NULL)
6335 {
6336 pcre_uchar *tc = code;
6337 int condcount = 0;
6338
6339 do {
6340 condcount++;
6341 tc += GET(tc,1);
6342 }
6343 while (*tc != OP_KET);
6344
6345 /* A DEFINE group is never obeyed inline (the "condition" is always
6346 false). It must have only one branch. */
6347
6348 if (code[LINK_SIZE+1] == OP_DEF)
6349 {
6350 if (condcount > 1)
6351 {
6352 *errorcodeptr = ERR54;
6353 goto FAILED;
6354 }
6355 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
6356 }
6357
6358 /* A "normal" conditional group. If there is just one branch, we must not
6359 make use of its firstchar or reqchar, because this is equivalent to an
6360 empty second branch. */
6361
6362 else
6363 {
6364 if (condcount > 2)
6365 {
6366 *errorcodeptr = ERR27;
6367 goto FAILED;
6368 }
6369 if (condcount == 1) subfirstcharflags = subreqcharflags = REQ_NONE;
6370 }
6371 }
6372
6373 /* Error if hit end of pattern */
6374
6375 if (*ptr != CHAR_RIGHT_PARENTHESIS)
6376 {
6377 *errorcodeptr = ERR14;
6378 goto FAILED;
6379 }
6380
6381 /* In the pre-compile phase, update the length by the length of the group,
6382 less the brackets at either end. Then reduce the compiled code to just a
6383 set of non-capturing brackets so that it doesn't use much memory if it is
6384 duplicated by a quantifier.*/
6385
6386 if (lengthptr != NULL)
6387 {
6388 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6389 {
6390 *errorcodeptr = ERR20;
6391 goto FAILED;
6392 }
6393 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6394 code++; /* This already contains bravalue */
6395 PUTINC(code, 0, 1 + LINK_SIZE);
6396 *code++ = OP_KET;
6397 PUTINC(code, 0, 1 + LINK_SIZE);
6398 break; /* No need to waste time with special character handling */
6399 }
6400
6401 /* Otherwise update the main code pointer to the end of the group. */
6402
6403 code = tempcode;
6404
6405 /* For a DEFINE group, required and first character settings are not
6406 relevant. */
6407
6408 if (bravalue == OP_DEF) break;
6409
6410 /* Handle updating of the required and first characters for other types of
6411 group. Update for normal brackets of all kinds, and conditions with two
6412 branches (see code above). If the bracket is followed by a quantifier with
6413 zero repeat, we have to back off. Hence the definition of zeroreqchar and
6414 zerofirstchar outside the main loop so that they can be accessed for the
6415 back off. */
6416
6417 zeroreqchar = reqchar;
6418 zeroreqcharflags = reqcharflags;
6419 zerofirstchar = firstchar;
6420 zerofirstcharflags = firstcharflags;
6421 groupsetfirstchar = FALSE;
6422
6423 if (bravalue >= OP_ONCE)
6424 {
6425 /* If we have not yet set a firstchar in this branch, take it from the
6426 subpattern, remembering that it was set here so that a repeat of more
6427 than one can replicate it as reqchar if necessary. If the subpattern has
6428 no firstchar, set "none" for the whole branch. In both cases, a zero
6429 repeat forces firstchar to "none". */
6430
6431 if (firstcharflags == REQ_UNSET)
6432 {
6433 if (subfirstcharflags >= 0)
6434 {
6435 firstchar = subfirstchar;
6436 firstcharflags = subfirstcharflags;
6437 groupsetfirstchar = TRUE;
6438 }
6439 else firstcharflags = REQ_NONE;
6440 zerofirstcharflags = REQ_NONE;
6441 }
6442
6443 /* If firstchar was previously set, convert the subpattern's firstchar
6444 into reqchar if there wasn't one, using the vary flag that was in
6445 existence beforehand. */
6446
6447 else if (subfirstcharflags >= 0 && subreqcharflags < 0)
6448 {
6449 subreqchar = subfirstchar;
6450 subreqcharflags = subfirstcharflags | tempreqvary;
6451 }
6452
6453 /* If the subpattern set a required byte (or set a first byte that isn't
6454 really the first byte - see above), set it. */
6455
6456 if (subreqcharflags >= 0)
6457 {
6458 reqchar = subreqchar;
6459 reqcharflags = subreqcharflags;
6460 }
6461 }
6462
6463 /* For a forward assertion, we take the reqchar, if set. This can be
6464 helpful if the pattern that follows the assertion doesn't set a different
6465 char. For example, it's useful for /(?=abcde).+/. We can't set firstchar
6466 for an assertion, however because it leads to incorrect effect for patterns
6467 such as /(?=a)a.+/ when the "real" "a" would then become a reqchar instead
6468 of a firstchar. This is overcome by a scan at the end if there's no
6469 firstchar, looking for an asserted first char. */
6470
6471 else if (bravalue == OP_ASSERT && subreqcharflags >= 0)
6472 {
6473 reqchar = subreqchar;
6474 reqcharflags = subreqcharflags;
6475 }
6476 break; /* End of processing '(' */
6477
6478
6479 /* ===================================================================*/
6480 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
6481 are arranged to be the negation of the corresponding OP_values in the
6482 default case when PCRE_UCP is not set. For the back references, the values
6483 are negative the reference number. Only back references and those types
6484 that consume a character may be repeated. We can test for values between
6485 ESC_b and ESC_Z for the latter; this may have to change if any new ones are
6486 ever created. */
6487
6488 case CHAR_BACKSLASH:
6489 tempptr = ptr;
6490 escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options, FALSE);
6491 if (*errorcodeptr != 0) goto FAILED;
6492
6493 if (escape == 0) /* The escape coded a single character */
6494 c = ec;
6495 else
6496 {
6497 if (escape == ESC_Q) /* Handle start of quoted string */
6498 {
6499 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
6500 ptr += 2; /* avoid empty string */
6501 else inescq = TRUE;
6502 continue;
6503 }
6504
6505 if (escape == ESC_E) continue; /* Perl ignores an orphan \E */
6506
6507 /* For metasequences that actually match a character, we disable the
6508 setting of a first character if it hasn't already been set. */
6509
6510 if (firstcharflags == REQ_UNSET && escape > ESC_b && escape < ESC_Z)
6511 firstcharflags = REQ_NONE;
6512
6513 /* Set values to reset to if this is followed by a zero repeat. */
6514
6515 zerofirstchar = firstchar;
6516 zerofirstcharflags = firstcharflags;
6517 zeroreqchar = reqchar;
6518 zeroreqcharflags = reqcharflags;
6519
6520 /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
6521 is a subroutine call by number (Oniguruma syntax). In fact, the value
6522 ESC_g is returned only for these cases. So we don't need to check for <
6523 or ' if the value is ESC_g. For the Perl syntax \g{n} the value is
6524 -n, and for the Perl syntax \g{name} the result is ESC_k (as
6525 that is a synonym for a named back reference). */
6526
6527 if (escape == ESC_g)
6528 {
6529 const pcre_uchar *p;
6530 save_hwm = cd->hwm; /* Normally this is set when '(' is read */
6531 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
6532 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6533
6534 /* These two statements stop the compiler from warning about possibly
6535 unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
6536 fact, because we actually check for a number below, the paths that
6537 would actually be in error are never taken. */
6538
6539 skipbytes = 0;
6540 reset_bracount = FALSE;
6541
6542 /* Test for a name */
6543
6544 if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
6545 {
6546 BOOL is_a_number = TRUE;
6547 for (p = ptr + 1; *p != CHAR_NULL && *p != (pcre_uchar)terminator; p++)
6548 {
6549 if (!MAX_255(*p)) { is_a_number = FALSE; break; }
6550 if ((cd->ctypes[*p] & ctype_digit) == 0) is_a_number = FALSE;
6551 if ((cd->ctypes[*p] & ctype_word) == 0) break;
6552 }
6553 if (*p != (pcre_uchar)terminator)
6554 {
6555 *errorcodeptr = ERR57;
6556 break;
6557 }
6558 if (is_a_number)
6559 {
6560 ptr++;
6561 goto HANDLE_NUMERICAL_RECURSION;
6562 }
6563 is_recurse = TRUE;
6564 goto NAMED_REF_OR_RECURSE;
6565 }
6566
6567 /* Test a signed number in angle brackets or quotes. */
6568
6569 p = ptr + 2;
6570 while (IS_DIGIT(*p)) p++;
6571 if (*p != (pcre_uchar)terminator)
6572 {
6573 *errorcodeptr = ERR57;
6574 break;
6575 }
6576 ptr++;
6577 goto HANDLE_NUMERICAL_RECURSION;
6578 }
6579
6580 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
6581 We also support \k{name} (.NET syntax). */
6582
6583 if (escape == ESC_k)
6584 {
6585 if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
6586 ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
6587 {
6588 *errorcodeptr = ERR69;
6589 break;
6590 }
6591 is_recurse = FALSE;
6592 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
6593 CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
6594 CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
6595 goto NAMED_REF_OR_RECURSE;
6596 }
6597
6598 /* Back references are handled specially; must disable firstchar if
6599 not set to cope with cases like (?=(\w+))\1: which would otherwise set
6600 ':' later. */
6601
6602 if (escape < 0)
6603 {
6604 open_capitem *oc;
6605 recno = -escape;
6606
6607 HANDLE_REFERENCE: /* Come here from named backref handling */
6608 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
6609 previous = code;
6610 *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
6611 PUT2INC(code, 0, recno);
6612 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
6613 if (recno > cd->top_backref) cd->top_backref = recno;
6614
6615 /* Check to see if this back reference is recursive, that is, it
6616 is inside the group that it references. A flag is set so that the
6617 group can be made atomic. */
6618
6619 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6620 {
6621 if (oc->number == recno)
6622 {
6623 oc->flag = TRUE;
6624 break;
6625 }
6626 }
6627 }
6628
6629 /* So are Unicode property matches, if supported. */
6630
6631 #ifdef SUPPORT_UCP
6632 else if (escape == ESC_P || escape == ESC_p)
6633 {
6634 BOOL negated;
6635 unsigned int ptype = 0, pdata = 0;
6636 if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
6637 goto FAILED;
6638 previous = code;
6639 *code++ = ((escape == ESC_p)