/[pcre]/code/branches/pcre16/pcre_compile.c
ViewVC logotype

Contents of /code/branches/pcre16/pcre_compile.c

Parent Directory | Revision Log


Revision 826 - (show annotations)
Tue Dec 27 09:42:33 2011 UTC (7 years, 7 months ago) by zherczeg
File MIME type: text/plain
File size: 268869 byte(s)
fix horizontal and vertical white space ranges in 16 bit mode
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When PCRE_DEBUG is defined, we need the pcre(16)_printint() function, which
57 is also used by pcretest. PCRE_DEBUG is not defined when building a production
58 library. We do not need to select pcre16_printint.c specially, because the
59 COMPILE_PCREx macro will already be appropriately set. */
60
61 #ifdef PCRE_DEBUG
62 /* pcre_printint.c should not include any headers */
63 #define PCRE_INCLUDED
64 #include "pcre_printint.c"
65 #undef PCRE_INCLUDED
66 #endif
67
68
/* Macro for setting individual bits in class bitmaps. "a" is the bitmap (an
array of, or pointer to, 8-bit units) and "b" is the bit number: bit b lives in
byte b/8 at position b%8. Both arguments are parenthesized in the expansion so
that expression arguments (for example "c + 1") index and shift as intended
instead of being re-associated by operator precedence. Note that "b" is
evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) (a)[(b)/8] |= (1 << ((b)%8))
72
73 /* Maximum length value to check against when making sure that the integer that
74 holds the compiled pattern length does not overflow. We make it a bit less than
75 INT_MAX to allow for adding in group terminating bytes, so that we don't have
76 to check them every time. */
77
78 #define OFLOW_MAX (INT_MAX - 20)
79
80
81 /*************************************************
82 * Code parameters and static tables *
83 *************************************************/
84
85 /* This value specifies the size of stack workspace that is used during the
86 first pre-compile phase that determines how much memory is required. The regex
87 is partly compiled into this space, but the compiled parts are discarded as
88 soon as they can be, so that hopefully there will never be an overrun. The code
89 does, however, check for an overrun. The largest amount I've seen used is 218,
90 so this number is very generous.
91
92 The same workspace is used during the second, actual compile phase for
93 remembering forward references to groups so that they can be filled in at the
94 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
95 is 4 there is plenty of room for most patterns. However, the memory can get
96 filled up by repetitions of forward references, for example patterns like
97 /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
98 that the workspace is expanded using malloc() in this situation. The value
99 below is therefore a minimum, and we put a maximum on it for safety. The
100 minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
101 kicks in at the same number of forward references in all cases. */
102
103 #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
104 #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
105
106 /* The overrun tests check for a slightly smaller size so that they detect the
107 overrun before it actually does run off the end of the data block. */
108
109 #define WORK_SIZE_SAFETY_MARGIN (100)
110
111 /* Private flags added to firstchar and reqchar. */
112
113 #define REQ_CASELESS 0x10000000l /* Indicates caselessness */
114 #define REQ_VARY 0x20000000l /* Reqchar followed non-literal item */
115
116 /* Repeated character flags. */
117
118 #define UTF_LENGTH 0x10000000l /* The char contains its length. */
119
120 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
121 are simple data values; negative values are for special things like \d and so
122 on. Zero means further processing is needed (for things like \x), or the escape
123 is invalid. */
124
125 #ifndef EBCDIC
126
127 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
128 in UTF-8 mode. */
129
130 static const short int escapes[] = {
131 0, 0,
132 0, 0,
133 0, 0,
134 0, 0,
135 0, 0,
136 CHAR_COLON, CHAR_SEMICOLON,
137 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
138 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
139 CHAR_COMMERCIAL_AT, -ESC_A,
140 -ESC_B, -ESC_C,
141 -ESC_D, -ESC_E,
142 0, -ESC_G,
143 -ESC_H, 0,
144 0, -ESC_K,
145 0, 0,
146 -ESC_N, 0,
147 -ESC_P, -ESC_Q,
148 -ESC_R, -ESC_S,
149 0, 0,
150 -ESC_V, -ESC_W,
151 -ESC_X, 0,
152 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
153 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
154 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
155 CHAR_GRAVE_ACCENT, 7,
156 -ESC_b, 0,
157 -ESC_d, ESC_e,
158 ESC_f, 0,
159 -ESC_h, 0,
160 0, -ESC_k,
161 0, 0,
162 ESC_n, 0,
163 -ESC_p, 0,
164 ESC_r, -ESC_s,
165 ESC_tee, 0,
166 -ESC_v, -ESC_w,
167 0, 0,
168 -ESC_z
169 };
170
171 #else
172
173 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
174
175 static const short int escapes[] = {
176 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
177 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
178 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
179 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
180 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
181 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
182 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
183 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
184 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
185 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
186 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
187 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
188 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
189 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
190 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
191 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
192 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
193 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
194 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
195 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
196 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
197 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
198 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
199 };
200 #endif
201
202
203 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
204 searched linearly. Put all the names into a single string, in order to reduce
205 the number of relocations when a shared library is dynamically linked. The
206 string is built from string macros so that it works in UTF-8 mode on EBCDIC
207 platforms. */
208
209 typedef struct verbitem {
210 int len; /* Length of verb name */
211 int op; /* Op when no arg, or -1 if arg mandatory */
212 int op_arg; /* Op when arg present, or -1 if not allowed */
213 } verbitem;
214
215 static const char verbnames[] =
216 "\0" /* Empty name is a shorthand for MARK */
217 STRING_MARK0
218 STRING_ACCEPT0
219 STRING_COMMIT0
220 STRING_F0
221 STRING_FAIL0
222 STRING_PRUNE0
223 STRING_SKIP0
224 STRING_THEN;
225
226 static const verbitem verbs[] = {
227 { 0, -1, OP_MARK },
228 { 4, -1, OP_MARK },
229 { 6, OP_ACCEPT, -1 },
230 { 6, OP_COMMIT, -1 },
231 { 1, OP_FAIL, -1 },
232 { 4, OP_FAIL, -1 },
233 { 5, OP_PRUNE, OP_PRUNE_ARG },
234 { 4, OP_SKIP, OP_SKIP_ARG },
235 { 4, OP_THEN, OP_THEN_ARG }
236 };
237
238 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
239
240
241 /* Tables of names of POSIX character classes and their lengths. The names are
242 now all in a single string, to reduce the number of relocations when a shared
243 library is dynamically loaded. The list of lengths is terminated by a zero
244 length entry. The first three must be alpha, lower, upper, as this is assumed
245 for handling case independence. */
246
247 static const char posix_names[] =
248 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
249 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
250 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
251 STRING_word0 STRING_xdigit;
252
253 static const pcre_uint8 posix_name_lengths[] = {
254 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
255
256 /* Table of class bit maps for each POSIX class. Each class is formed from a
257 base map, with an optional addition or removal of another map. Then, for some
258 classes, there is some additional tweaking: for [:blank:] the vertical space
259 characters are removed, and for [:alpha:] and [:alnum:] the underscore
260 character is removed. The triples in the table consist of the base map offset,
261 second map offset or -1 if no second map, and a non-negative value for map
262 addition or a negative value for map subtraction (if there are two maps). The
263 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
264 remove vertical space characters, 2 => remove underscore. */
265
266 static const int posix_class_maps[] = {
267 cbit_word, cbit_digit, -2, /* alpha */
268 cbit_lower, -1, 0, /* lower */
269 cbit_upper, -1, 0, /* upper */
270 cbit_word, -1, 2, /* alnum - word without underscore */
271 cbit_print, cbit_cntrl, 0, /* ascii */
272 cbit_space, -1, 1, /* blank - a GNU extension */
273 cbit_cntrl, -1, 0, /* cntrl */
274 cbit_digit, -1, 0, /* digit */
275 cbit_graph, -1, 0, /* graph */
276 cbit_print, -1, 0, /* print */
277 cbit_punct, -1, 0, /* punct */
278 cbit_space, -1, 0, /* space */
279 cbit_word, -1, 0, /* word - a Perl extension */
280 cbit_xdigit,-1, 0 /* xdigit */
281 };
282
283 /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
284 substitutes must be in the order of the names, defined above, and there are
285 both positive and negative cases. NULL means no substitute. */
286
287 #ifdef SUPPORT_UCP
288 static const pcre_uchar string_PNd[] = {
289 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
290 CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
291 static const pcre_uchar string_pNd[] = {
292 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
293 CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
294 static const pcre_uchar string_PXsp[] = {
295 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
296 CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
297 static const pcre_uchar string_pXsp[] = {
298 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
299 CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
300 static const pcre_uchar string_PXwd[] = {
301 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
302 CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
303 static const pcre_uchar string_pXwd[] = {
304 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
305 CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
306
307 static const pcre_uchar *substitutes[] = {
308 string_PNd, /* \D */
309 string_pNd, /* \d */
310 string_PXsp, /* \S */ /* NOTE: Xsp is Perl space */
311 string_pXsp, /* \s */
312 string_PXwd, /* \W */
313 string_pXwd /* \w */
314 };
315
316 static const pcre_uchar string_pL[] = {
317 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
318 CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
319 static const pcre_uchar string_pLl[] = {
320 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
321 CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
322 static const pcre_uchar string_pLu[] = {
323 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
324 CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
325 static const pcre_uchar string_pXan[] = {
326 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
327 CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
328 static const pcre_uchar string_h[] = {
329 CHAR_BACKSLASH, CHAR_h, '\0' };
330 static const pcre_uchar string_pXps[] = {
331 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
332 CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
333 static const pcre_uchar string_PL[] = {
334 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
335 CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
336 static const pcre_uchar string_PLl[] = {
337 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
338 CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
339 static const pcre_uchar string_PLu[] = {
340 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
341 CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
342 static const pcre_uchar string_PXan[] = {
343 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
344 CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
345 static const pcre_uchar string_H[] = {
346 CHAR_BACKSLASH, CHAR_H, '\0' };
347 static const pcre_uchar string_PXps[] = {
348 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
349 CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
350
351 static const pcre_uchar *posix_substitutes[] = {
352 string_pL, /* alpha */
353 string_pLl, /* lower */
354 string_pLu, /* upper */
355 string_pXan, /* alnum */
356 NULL, /* ascii */
357 string_h, /* blank */
358 NULL, /* cntrl */
359 string_pNd, /* digit */
360 NULL, /* graph */
361 NULL, /* print */
362 NULL, /* punct */
363 string_pXps, /* space */ /* NOTE: Xps is POSIX space */
364 string_pXwd, /* word */
365 NULL, /* xdigit */
366 /* Negated cases */
367 string_PL, /* ^alpha */
368 string_PLl, /* ^lower */
369 string_PLu, /* ^upper */
370 string_PXan, /* ^alnum */
371 NULL, /* ^ascii */
372 string_H, /* ^blank */
373 NULL, /* ^cntrl */
374 string_PNd, /* ^digit */
375 NULL, /* ^graph */
376 NULL, /* ^print */
377 NULL, /* ^punct */
378 string_PXps, /* ^space */ /* NOTE: Xps is POSIX space */
379 string_PXwd, /* ^word */
380 NULL /* ^xdigit */
381 };
382 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
383 #endif
384
385 #define STRING(a) # a
386 #define XSTRING(s) STRING(s)
387
388 /* The texts of compile-time error messages. These are "char *" because they
389 are passed to the outside world. Do not ever re-use any error number, because
390 they are documented. Always add a new error instead. Messages marked DEAD below
391 are no longer used. This used to be a table of strings, but in order to reduce
392 the number of relocations needed when a shared library is loaded dynamically,
393 it is now one long string. We cannot use a table of offsets, because the
394 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
395 simply count through to the one we want - this isn't a performance issue
396 because these strings are used only when there is a compilation error.
397
398 Each substring ends with \0 to insert a null character. This includes the final
399 substring, so that the whole string ends with \0\0, which can be detected when
400 counting through. */
401
402 static const char error_texts[] =
403 "no error\0"
404 "\\ at end of pattern\0"
405 "\\c at end of pattern\0"
406 "unrecognized character follows \\\0"
407 "numbers out of order in {} quantifier\0"
408 /* 5 */
409 "number too big in {} quantifier\0"
410 "missing terminating ] for character class\0"
411 "invalid escape sequence in character class\0"
412 "range out of order in character class\0"
413 "nothing to repeat\0"
414 /* 10 */
415 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
416 "internal error: unexpected repeat\0"
417 "unrecognized character after (? or (?-\0"
418 "POSIX named classes are supported only within a class\0"
419 "missing )\0"
420 /* 15 */
421 "reference to non-existent subpattern\0"
422 "erroffset passed as NULL\0"
423 "unknown option bit(s) set\0"
424 "missing ) after comment\0"
425 "parentheses nested too deeply\0" /** DEAD **/
426 /* 20 */
427 "regular expression is too large\0"
428 "failed to get memory\0"
429 "unmatched parentheses\0"
430 "internal error: code overflow\0"
431 "unrecognized character after (?<\0"
432 /* 25 */
433 "lookbehind assertion is not fixed length\0"
434 "malformed number or name after (?(\0"
435 "conditional group contains more than two branches\0"
436 "assertion expected after (?(\0"
437 "(?R or (?[+-]digits must be followed by )\0"
438 /* 30 */
439 "unknown POSIX class name\0"
440 "POSIX collating elements are not supported\0"
441 "this version of PCRE is not compiled with PCRE_UTF8 support\0"
442 "spare error\0" /** DEAD **/
443 "character value in \\x{...} sequence is too large\0"
444 /* 35 */
445 "invalid condition (?(0)\0"
446 "\\C not allowed in lookbehind assertion\0"
447 "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
448 "number after (?C is > 255\0"
449 "closing ) for (?C expected\0"
450 /* 40 */
451 "recursive call could loop indefinitely\0"
452 "unrecognized character after (?P\0"
453 "syntax error in subpattern name (missing terminator)\0"
454 "two named subpatterns have the same name\0"
455 "invalid UTF-8 string\0"
456 /* 45 */
457 "support for \\P, \\p, and \\X has not been compiled\0"
458 "malformed \\P or \\p sequence\0"
459 "unknown property name after \\P or \\p\0"
460 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
461 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
462 /* 50 */
463 "repeated subpattern is too long\0" /** DEAD **/
464 "octal value is greater than \\377 (not in UTF-8 mode)\0"
465 "internal error: overran compiling workspace\0"
466 "internal error: previously-checked referenced subpattern not found\0"
467 "DEFINE group contains more than one branch\0"
468 /* 55 */
469 "repeating a DEFINE group is not allowed\0" /** DEAD **/
470 "inconsistent NEWLINE options\0"
471 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
472 "a numbered reference must not be zero\0"
473 "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
474 /* 60 */
475 "(*VERB) not recognized\0"
476 "number is too big\0"
477 "subpattern name expected\0"
478 "digit expected after (?+\0"
479 "] is an invalid data character in JavaScript compatibility mode\0"
480 /* 65 */
481 "different names for subpatterns of the same number are not allowed\0"
482 "(*MARK) must have an argument\0"
483 "this version of PCRE is not compiled with PCRE_UCP support\0"
484 "\\c must be followed by an ASCII character\0"
485 "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
486 /* 70 */
487 "internal error: unknown opcode in find_fixedlength()\0"
488 "\\N is not supported in a class\0"
489 "too many forward references\0"
490 "disallowed UTF-8/16 code point (>= 0xd800 && <= 0xdfff)\0"
491 ;
492
493 /* Table to identify digits and hex digits. This is used when compiling
494 patterns. Note that the tables in chartables are dependent on the locale, and
495 may mark arbitrary characters as digits - but the PCRE compiling code expects
496 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
497 a private table here. It costs 256 bytes, but it is a lot faster than doing
498 character value tests (at least in some simple cases I timed), and in some
499 applications one wants PCRE to compile efficiently as well as match
500 efficiently.
501
502 For convenience, we use the same bit definitions as in chartables:
503
504 0x04 decimal digit
505 0x08 hexadecimal digit
506
507 Then we can use ctype_digit and ctype_xdigit in the code. */
508
509 /* Using a simple comparison for decimal numbers rather than a memory read
510 is much faster, and the resulting code is simpler (the compiler turns it
511 into a subtraction and unsigned comparison). */
512
513 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
514
515 #ifndef EBCDIC
516
517 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
518 UTF-8 mode. */
519
520 static const pcre_uint8 digitab[] =
521 {
522 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
523 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
524 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
525 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
526 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
527 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
528 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
529 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
530 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
531 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
532 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
533 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
534 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
535 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
536 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
537 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
538 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
539 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
540 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
541 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
542 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
543 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
544 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
545 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
546 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
547 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
548 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
549 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
550 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
551 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
552 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
553 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
554
555 #else
556
557 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
558
559 static const pcre_uint8 digitab[] =
560 {
561 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
562 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
563 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
564 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
565 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
566 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
567 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
568 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
569 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
570 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
571 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
572 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
573 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
574 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
575 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
576 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
577 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
578 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
579 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
580 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
581 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
582 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
583 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
584 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
585 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
586 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
587 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
588 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
589 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
590 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
591 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
592 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
593
594 static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
595 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
596 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
597 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
598 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
599 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
600 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
603 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
604 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
605 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
606 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
607 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
608 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
609 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
610 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
611 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
612 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
613 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
614 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
615 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
616 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
617 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
618 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
619 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
620 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
621 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
622 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
623 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
624 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
625 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
626 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
627 #endif
628
629
630 /* Definition to allow mutual recursion */
631
632 static BOOL
633 compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
634 int *, int *, branch_chain *, compile_data *, int *);
635
636
637
638 /*************************************************
639 * Find an error text *
640 *************************************************/
641
642 /* The error texts are now all in one long string, to save on relocations. As
643 some of the text is of unknown length, we can't use a table of offsets.
644 Instead, just count through the strings. This is not a performance issue
645 because it happens only when there has been a compilation error.
646
647 Argument: the error number
648 Returns: pointer to the error string
649 */
650
651 static const char *
652 find_error_text(int n)
653 {
654 const char *s = error_texts;
655 for (; n > 0; n--)
656 {
657 while (*s++ != 0) {};
658 if (*s == 0) return "Error text not found (please report)";
659 }
660 return s;
661 }
662
663
664 /*************************************************
665 * Expand the workspace *
666 *************************************************/
667
668 /* This function is called during the second compiling phase, if the number of
669 forward references fills the existing workspace, which is originally a block on
670 the stack. A larger block is obtained from malloc() unless the ultimate limit
671 has been reached or the increase will be rather small.
672
673 Argument: pointer to the compile data block
674 Returns: 0 if all went well, else an error number
675 */
676
677 static int
678 expand_workspace(compile_data *cd)
679 {
680 pcre_uchar *newspace;
681 int newsize = cd->workspace_size * 2;
682
683 if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
684 if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
685 newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
686 return ERR72;
687
688 newspace = (PUBL(malloc))(IN_UCHARS(newsize));
689 if (newspace == NULL) return ERR21;
690 memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
691 cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
692 if (cd->workspace_size > COMPILE_WORK_SIZE)
693 (PUBL(free))((void *)cd->start_workspace);
694 cd->start_workspace = newspace;
695 cd->workspace_size = newsize;
696 return 0;
697 }
698
699
700
701 /*************************************************
702 * Check for counted repeat *
703 *************************************************/
704
705 /* This function is called when a '{' is encountered in a place where it might
706 start a quantifier. It looks ahead to see if it really is a quantifier or not.
707 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
708 where the ddds are digits.
709
710 Arguments:
711 p pointer to the first char after '{'
712
713 Returns: TRUE or FALSE
714 */
715
716 static BOOL
717 is_counted_repeat(const pcre_uchar *p)
718 {
719 if (!IS_DIGIT(*p)) return FALSE;
720 p++;
721 while (IS_DIGIT(*p)) p++;
722 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
723
724 if (*p++ != CHAR_COMMA) return FALSE;
725 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
726
727 if (!IS_DIGIT(*p)) return FALSE;
728 p++;
729 while (IS_DIGIT(*p)) p++;
730
731 return (*p == CHAR_RIGHT_CURLY_BRACKET);
732 }
733
734
735
736 /*************************************************
737 * Handle escapes *
738 *************************************************/
739
740 /* This function is called when a \ has been encountered. It either returns a
741 positive value for a simple escape such as \n, or a negative value which
742 encodes one of the more complicated things such as \d. A backreference to group
743 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
744 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
745 ptr is pointing at the \. On exit, it is on the final character of the escape
746 sequence.
747
748 Arguments:
749 ptrptr points to the pattern position pointer
750 errorcodeptr points to the errorcode variable
751 bracount number of previous extracting brackets
752 options the options bits
753 isclass TRUE if inside a character class
754
755 Returns: zero or positive => a data character
756 negative => a special escape sequence
757 on error, errorcodeptr is set
758 */
759
760 static int
761 check_escape(const pcre_uchar **ptrptr, int *errorcodeptr, int bracount,
762 int options, BOOL isclass)
763 {
764 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
765 BOOL utf = (options & PCRE_UTF8) != 0;
766 const pcre_uchar *ptr = *ptrptr + 1;
767 pcre_int32 c;
768 int i;
769
770 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
771 ptr--; /* Set pointer back to the last byte */
772
773 /* If backslash is at the end of the pattern, it's an error. */
774
775 if (c == 0) *errorcodeptr = ERR1;
776
777 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
778 in a table. A non-zero result is something that can be returned immediately.
779 Otherwise further processing may be required. */
780
781 #ifndef EBCDIC /* ASCII/UTF-8 coding */
782 /* Not alphanumeric */
783 else if (c < CHAR_0 || c > CHAR_z) {}
784 else if ((i = escapes[c - CHAR_0]) != 0) c = i;
785
786 #else /* EBCDIC coding */
787 /* Not alphanumeric */
788 else if (c < 'a' || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
789 else if ((i = escapes[c - 0x48]) != 0) c = i;
790 #endif
791
792 /* Escapes that need further processing, or are illegal. */
793
794 else
795 {
796 const pcre_uchar *oldptr;
797 BOOL braced, negated;
798
799 switch (c)
800 {
801 /* A number of Perl escapes are not handled by PCRE. We give an explicit
802 error. */
803
804 case CHAR_l:
805 case CHAR_L:
806 *errorcodeptr = ERR37;
807 break;
808
809 case CHAR_u:
810 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
811 {
812 /* In JavaScript, \u must be followed by four hexadecimal numbers.
813 Otherwise it is a lowercase u letter. */
814 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
815 && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
816 && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
817 && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
818 {
819 c = 0;
820 for (i = 0; i < 4; ++i)
821 {
822 register int cc = *(++ptr);
823 #ifndef EBCDIC /* ASCII/UTF-8 coding */
824 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
825 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
826 #else /* EBCDIC coding */
827 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
828 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
829 #endif
830 }
831 }
832 }
833 else
834 *errorcodeptr = ERR37;
835 break;
836
837 case CHAR_U:
838 /* In JavaScript, \U is an uppercase U letter. */
839 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
840 break;
841
842 /* In a character class, \g is just a literal "g". Outside a character
843 class, \g must be followed by one of a number of specific things:
844
845 (1) A number, either plain or braced. If positive, it is an absolute
846 backreference. If negative, it is a relative backreference. This is a Perl
847 5.10 feature.
848
849 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
850 is part of Perl's movement towards a unified syntax for back references. As
851 this is synonymous with \k{name}, we fudge it up by pretending it really
852 was \k.
853
854 (3) For Oniguruma compatibility we also support \g followed by a name or a
855 number either in angle brackets or in single quotes. However, these are
856 (possibly recursive) subroutine calls, _not_ backreferences. Just return
857 the -ESC_g code (cf \k). */
858
859 case CHAR_g:
860 if (isclass) break;
861 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
862 {
863 c = -ESC_g;
864 break;
865 }
866
867 /* Handle the Perl-compatible cases */
868
869 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
870 {
871 const pcre_uchar *p;
872 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
873 if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
874 if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
875 {
876 c = -ESC_k;
877 break;
878 }
879 braced = TRUE;
880 ptr++;
881 }
882 else braced = FALSE;
883
884 if (ptr[1] == CHAR_MINUS)
885 {
886 negated = TRUE;
887 ptr++;
888 }
889 else negated = FALSE;
890
891 /* The integer range is limited by the machine's int representation. */
892 c = 0;
893 while (IS_DIGIT(ptr[1]))
894 {
895 if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
896 {
897 c = -1;
898 break;
899 }
900 c = c * 10 + *(++ptr) - CHAR_0;
901 }
902 if (((unsigned int)c) > INT_MAX) /* Integer overflow */
903 {
904 while (IS_DIGIT(ptr[1]))
905 ptr++;
906 *errorcodeptr = ERR61;
907 break;
908 }
909
910 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
911 {
912 *errorcodeptr = ERR57;
913 break;
914 }
915
916 if (c == 0)
917 {
918 *errorcodeptr = ERR58;
919 break;
920 }
921
922 if (negated)
923 {
924 if (c > bracount)
925 {
926 *errorcodeptr = ERR15;
927 break;
928 }
929 c = bracount - (c - 1);
930 }
931
932 c = -(ESC_REF + c);
933 break;
934
935 /* The handling of escape sequences consisting of a string of digits
936 starting with one that is not zero is not straightforward. By experiment,
937 the way Perl works seems to be as follows:
938
939 Outside a character class, the digits are read as a decimal number. If the
940 number is less than 10, or if there are that many previous extracting
941 left brackets, then it is a back reference. Otherwise, up to three octal
942 digits are read to form an escaped byte. Thus \123 is likely to be octal
943 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
944 value is greater than 377, the least significant 8 bits are taken. Inside a
945 character class, \ followed by a digit is always an octal number. */
946
947 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
948 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
949
950 if (!isclass)
951 {
952 oldptr = ptr;
953 /* The integer range is limited by the machine's int representation. */
954 c -= CHAR_0;
955 while (IS_DIGIT(ptr[1]))
956 {
957 if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
958 {
959 c = -1;
960 break;
961 }
962 c = c * 10 + *(++ptr) - CHAR_0;
963 }
964 if (((unsigned int)c) > INT_MAX) /* Integer overflow */
965 {
966 while (IS_DIGIT(ptr[1]))
967 ptr++;
968 *errorcodeptr = ERR61;
969 break;
970 }
971 if (c < 10 || c <= bracount)
972 {
973 c = -(ESC_REF + c);
974 break;
975 }
976 ptr = oldptr; /* Put the pointer back and fall through */
977 }
978
979 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
980 generates a binary zero byte and treats the digit as a following literal.
981 Thus we have to pull back the pointer by one. */
982
983 if ((c = *ptr) >= CHAR_8)
984 {
985 ptr--;
986 c = 0;
987 break;
988 }
989
990 /* \0 always starts an octal number, but we may drop through to here with a
991 larger first octal digit. The original code used just to take the least
992 significant 8 bits of octal numbers (I think this is what early Perls used
993 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
994 than 3 octal digits. */
995
996 case CHAR_0:
997 c -= CHAR_0;
998 while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
999 c = c * 8 + *(++ptr) - CHAR_0;
1000 if (!utf && c > 0xff) *errorcodeptr = ERR51;
1001 break;
1002
1003 /* \x is complicated. \x{ddd} is a character number which can be greater
1004 than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.
1005 If not, { is treated as a data character. */
1006
1007 case CHAR_x:
1008 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1009 {
1010 /* In JavaScript, \x must be followed by two hexadecimal numbers.
1011 Otherwise it is a lowercase x letter. */
1012 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1013 && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1014 {
1015 c = 0;
1016 for (i = 0; i < 2; ++i)
1017 {
1018 register int cc = *(++ptr);
1019 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1020 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1021 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1022 #else /* EBCDIC coding */
1023 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1024 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1025 #endif
1026 }
1027 }
1028 break;
1029 }
1030
1031 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1032 {
1033 const pcre_uchar *pt = ptr + 2;
1034
1035 c = 0;
1036 while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)
1037 {
1038 register int cc = *pt++;
1039 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1040
1041 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1042 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1043 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1044 #else /* EBCDIC coding */
1045 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1046 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1047 #endif
1048
1049 #ifdef COMPILE_PCRE8
1050 if (c > (utf ? 0x10ffff : 0xff)) { c = -1; break; }
1051 #else
1052 #ifdef COMPILE_PCRE16
1053 if (c > (utf ? 0x10ffff : 0xffff)) { c = -1; break; }
1054 #endif
1055 #endif
1056 }
1057
1058 if (c < 0)
1059 {
1060 while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++;
1061 *errorcodeptr = ERR34;
1062 }
1063
1064 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
1065 {
1066 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1067 ptr = pt;
1068 break;
1069 }
1070
1071 /* If the sequence of hex digits does not end with '}', then we don't
1072 recognize this construct; fall through to the normal \x handling. */
1073 }
1074
1075 /* Read just a single-byte hex-defined char */
1076
1077 c = 0;
1078 while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1079 {
1080 int cc; /* Some compilers don't like */
1081 cc = *(++ptr); /* ++ in initializers */
1082 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1083 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1084 c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1085 #else /* EBCDIC coding */
1086 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
1087 c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1088 #endif
1089 }
1090 break;
1091
1092 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1093 An error is given if the byte following \c is not an ASCII character. This
1094 coding is ASCII-specific, but then the whole concept of \cx is
1095 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1096
1097 case CHAR_c:
1098 c = *(++ptr);
1099 if (c == 0)
1100 {
1101 *errorcodeptr = ERR2;
1102 break;
1103 }
1104 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1105 if (c > 127) /* Excludes all non-ASCII in either mode */
1106 {
1107 *errorcodeptr = ERR68;
1108 break;
1109 }
1110 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1111 c ^= 0x40;
1112 #else /* EBCDIC coding */
1113 if (c >= CHAR_a && c <= CHAR_z) c += 64;
1114 c ^= 0xC0;
1115 #endif
1116 break;
1117
1118 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1119 other alphanumeric following \ is an error if PCRE_EXTRA was set;
1120 otherwise, for Perl compatibility, it is a literal. This code looks a bit
1121 odd, but there used to be some cases other than the default, and there may
1122 be again in future, so I haven't "optimized" it. */
1123
1124 default:
1125 if ((options & PCRE_EXTRA) != 0) switch(c)
1126 {
1127 default:
1128 *errorcodeptr = ERR3;
1129 break;
1130 }
1131 break;
1132 }
1133 }
1134
1135 /* Perl supports \N{name} for character names, as well as plain \N for "not
1136 newline". PCRE does not support \N{name}. However, it does support
1137 quantification such as \N{2,3}. */
1138
1139 if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1140 !is_counted_repeat(ptr+2))
1141 *errorcodeptr = ERR37;
1142
1143 /* If PCRE_UCP is set, we change the values for \d etc. */
1144
1145 if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
1146 c -= (ESC_DU - ESC_D);
1147
1148 /* Set the pointer to the final character before returning. */
1149
1150 *ptrptr = ptr;
1151 return c;
1152 }
1153
1154
1155
1156 #ifdef SUPPORT_UCP
1157 /*************************************************
1158 * Handle \P and \p *
1159 *************************************************/
1160
1161 /* This function is called after \P or \p has been encountered, provided that
1162 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1163 pointing at the P or p. On exit, it is pointing at the final character of the
1164 escape sequence.
1165
1166 Argument:
1167 ptrptr points to the pattern position pointer
1168 negptr points to a boolean that is set TRUE for negation else FALSE
1169 dptr points to an int that is set to the detailed property value
1170 errorcodeptr points to the error code variable
1171
1172 Returns: type value from ucp_type_table, or -1 for an invalid type
1173 */
1174
1175 static int
1176 get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
1177 {
1178 int c, i, bot, top;
1179 const pcre_uchar *ptr = *ptrptr;
1180 pcre_uchar name[32];
1181
1182 c = *(++ptr);
1183 if (c == 0) goto ERROR_RETURN;
1184
1185 *negptr = FALSE;
1186
1187 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1188 negation. */
1189
1190 if (c == CHAR_LEFT_CURLY_BRACKET)
1191 {
1192 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1193 {
1194 *negptr = TRUE;
1195 ptr++;
1196 }
1197 for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1198 {
1199 c = *(++ptr);
1200 if (c == 0) goto ERROR_RETURN;
1201 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1202 name[i] = c;
1203 }
1204 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1205 name[i] = 0;
1206 }
1207
1208 /* Otherwise there is just one following character */
1209
1210 else
1211 {
1212 name[0] = c;
1213 name[1] = 0;
1214 }
1215
1216 *ptrptr = ptr;
1217
1218 /* Search for a recognized property name using binary chop */
1219
1220 bot = 0;
1221 top = PRIV(utt_size);
1222
1223 while (bot < top)
1224 {
1225 i = (bot + top) >> 1;
1226 c = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1227 if (c == 0)
1228 {
1229 *dptr = PRIV(utt)[i].value;
1230 return PRIV(utt)[i].type;
1231 }
1232 if (c > 0) bot = i + 1; else top = i;
1233 }
1234
1235 *errorcodeptr = ERR47;
1236 *ptrptr = ptr;
1237 return -1;
1238
1239 ERROR_RETURN:
1240 *errorcodeptr = ERR46;
1241 *ptrptr = ptr;
1242 return -1;
1243 }
1244 #endif
1245
1246
1247
1248
1249 /*************************************************
1250 * Read repeat counts *
1251 *************************************************/
1252
1253 /* Read an item of the form {n,m} and return the values. This is called only
1254 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1255 so the syntax is guaranteed to be correct, but we need to check the values.
1256
1257 Arguments:
1258 p pointer to first char after '{'
1259 minp pointer to int for min
1260 maxp pointer to int for max
1261 returned as -1 if no max
1262 errorcodeptr points to error code variable
1263
1264 Returns: pointer to '}' on success;
1265 current ptr on error, with errorcodeptr set non-zero
1266 */
1267
1268 static const pcre_uchar *
1269 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1270 {
1271 int min = 0;
1272 int max = -1;
1273
1274 /* Read the minimum value and do a paranoid check: a negative value indicates
1275 an integer overflow. */
1276
1277 while (IS_DIGIT(*p)) min = min * 10 + *p++ - CHAR_0;
1278 if (min < 0 || min > 65535)
1279 {
1280 *errorcodeptr = ERR5;
1281 return p;
1282 }
1283
1284 /* Read the maximum value if there is one, and again do a paranoid on its size.
1285 Also, max must not be less than min. */
1286
1287 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1288 {
1289 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1290 {
1291 max = 0;
1292 while(IS_DIGIT(*p)) max = max * 10 + *p++ - CHAR_0;
1293 if (max < 0 || max > 65535)
1294 {
1295 *errorcodeptr = ERR5;
1296 return p;
1297 }
1298 if (max < min)
1299 {
1300 *errorcodeptr = ERR4;
1301 return p;
1302 }
1303 }
1304 }
1305
1306 /* Fill in the required variables, and pass back the pointer to the terminating
1307 '}'. */
1308
1309 *minp = min;
1310 *maxp = max;
1311 return p;
1312 }
1313
1314
1315
1316 /*************************************************
1317 * Subroutine for finding forward reference *
1318 *************************************************/
1319
1320 /* This recursive function is called only from find_parens() below. The
1321 top-level call starts at the beginning of the pattern. All other calls must
1322 start at a parenthesis. It scans along a pattern's text looking for capturing
1323 subpatterns, and counting them. If it finds a named pattern that matches the
1324 name it is given, it returns its number. Alternatively, if the name is NULL, it
1325 returns when it reaches a given numbered subpattern. Recursion is used to keep
1326 track of subpatterns that reset the capturing group numbers - the (?| feature.
1327
1328 This function was originally called only from the second pass, in which we know
1329 that if (?< or (?' or (?P< is encountered, the name will be correctly
1330 terminated because that is checked in the first pass. There is now one call to
1331 this function in the first pass, to check for a recursive back reference by
1332 name (so that we can make the whole group atomic). In this case, we need check
1333 only up to the current position in the pattern, and that is still OK because
1334 and previous occurrences will have been checked. To make this work, the test
1335 for "end of pattern" is a check against cd->end_pattern in the main loop,
1336 instead of looking for a binary zero. This means that the special first-pass
1337 call can adjust cd->end_pattern temporarily. (Checks for binary zero while
1338 processing items within the loop are OK, because afterwards the main loop will
1339 terminate.)
1340
1341 Arguments:
1342 ptrptr address of the current character pointer (updated)
1343 cd compile background data
1344 name name to seek, or NULL if seeking a numbered subpattern
1345 lorn name length, or subpattern number if name is NULL
1346 xmode TRUE if we are in /x mode
1347 utf TRUE if we are in UTF-8 / UTF-16 mode
1348 count pointer to the current capturing subpattern number (updated)
1349
1350 Returns: the number of the named subpattern, or -1 if not found
1351 */
1352
1353 static int
1354 find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn,
1355 BOOL xmode, BOOL utf, int *count)
1356 {
1357 pcre_uchar *ptr = *ptrptr;
1358 int start_count = *count;
1359 int hwm_count = start_count;
1360 BOOL dup_parens = FALSE;
1361
1362 /* If the first character is a parenthesis, check on the type of group we are
1363 dealing with. The very first call may not start with a parenthesis. */
1364
1365 if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1366 {
1367 /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1368
1369 if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1370
1371 /* Handle a normal, unnamed capturing parenthesis. */
1372
1373 else if (ptr[1] != CHAR_QUESTION_MARK)
1374 {
1375 *count += 1;
1376 if (name == NULL && *count == lorn) return *count;
1377 ptr++;
1378 }
1379
1380 /* All cases now have (? at the start. Remember when we are in a group
1381 where the parenthesis numbers are duplicated. */
1382
1383 else if (ptr[2] == CHAR_VERTICAL_LINE)
1384 {
1385 ptr += 3;
1386 dup_parens = TRUE;
1387 }
1388
1389 /* Handle comments; all characters are allowed until a ket is reached. */
1390
1391 else if (ptr[2] == CHAR_NUMBER_SIGN)
1392 {
1393 for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1394 goto FAIL_EXIT;
1395 }
1396
1397 /* Handle a condition. If it is an assertion, just carry on so that it
1398 is processed as normal. If not, skip to the closing parenthesis of the
1399 condition (there can't be any nested parens). */
1400
1401 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1402 {
1403 ptr += 2;
1404 if (ptr[1] != CHAR_QUESTION_MARK)
1405 {
1406 while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1407 if (*ptr != 0) ptr++;
1408 }
1409 }
1410
1411 /* Start with (? but not a condition. */
1412
1413 else
1414 {
1415 ptr += 2;
1416 if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1417
1418 /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1419
1420 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1421 ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1422 {
1423 int term;
1424 const pcre_uchar *thisname;
1425 *count += 1;
1426 if (name == NULL && *count == lorn) return *count;
1427 term = *ptr++;
1428 if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1429 thisname = ptr;
1430 while (*ptr != term) ptr++;
1431 if (name != NULL && lorn == ptr - thisname &&
1432 STRNCMP_UC_UC(name, thisname, lorn) == 0)
1433 return *count;
1434 term++;
1435 }
1436 }
1437 }
1438
1439 /* Past any initial parenthesis handling, scan for parentheses or vertical
1440 bars. Stop if we get to cd->end_pattern. Note that this is important for the
1441 first-pass call when this value is temporarily adjusted to stop at the current
1442 position. So DO NOT change this to a test for binary zero. */
1443
1444 for (; ptr < cd->end_pattern; ptr++)
1445 {
1446 /* Skip over backslashed characters and also entire \Q...\E */
1447
1448 if (*ptr == CHAR_BACKSLASH)
1449 {
1450 if (*(++ptr) == 0) goto FAIL_EXIT;
1451 if (*ptr == CHAR_Q) for (;;)
1452 {
1453 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1454 if (*ptr == 0) goto FAIL_EXIT;
1455 if (*(++ptr) == CHAR_E) break;
1456 }
1457 continue;
1458 }
1459
1460 /* Skip over character classes; this logic must be similar to the way they
1461 are handled for real. If the first character is '^', skip it. Also, if the
1462 first few characters (either before or after ^) are \Q\E or \E we skip them
1463 too. This makes for compatibility with Perl. Note the use of STR macros to
1464 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1465
1466 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1467 {
1468 BOOL negate_class = FALSE;
1469 for (;;)
1470 {
1471 if (ptr[1] == CHAR_BACKSLASH)
1472 {
1473 if (ptr[2] == CHAR_E)
1474 ptr+= 2;
1475 else if (STRNCMP_UC_C8(ptr + 2,
1476 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1477 ptr += 4;
1478 else
1479 break;
1480 }
1481 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1482 {
1483 negate_class = TRUE;
1484 ptr++;
1485 }
1486 else break;
1487 }
1488
1489 /* If the next character is ']', it is a data character that must be
1490 skipped, except in JavaScript compatibility mode. */
1491
1492 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1493 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1494 ptr++;
1495
1496 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1497 {
1498 if (*ptr == 0) return -1;
1499 if (*ptr == CHAR_BACKSLASH)
1500 {
1501 if (*(++ptr) == 0) goto FAIL_EXIT;
1502 if (*ptr == CHAR_Q) for (;;)
1503 {
1504 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1505 if (*ptr == 0) goto FAIL_EXIT;
1506 if (*(++ptr) == CHAR_E) break;
1507 }
1508 continue;
1509 }
1510 }
1511 continue;
1512 }
1513
1514 /* Skip comments in /x mode */
1515
1516 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1517 {
1518 ptr++;
1519 while (*ptr != 0)
1520 {
1521 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1522 ptr++;
1523 #ifdef SUPPORT_UTF
1524 if (utf) FORWARDCHAR(ptr);
1525 #endif
1526 }
1527 if (*ptr == 0) goto FAIL_EXIT;
1528 continue;
1529 }
1530
1531 /* Check for the special metacharacters */
1532
1533 if (*ptr == CHAR_LEFT_PARENTHESIS)
1534 {
1535 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, count);
1536 if (rc > 0) return rc;
1537 if (*ptr == 0) goto FAIL_EXIT;
1538 }
1539
1540 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1541 {
1542 if (dup_parens && *count < hwm_count) *count = hwm_count;
1543 goto FAIL_EXIT;
1544 }
1545
1546 else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1547 {
1548 if (*count > hwm_count) hwm_count = *count;
1549 *count = start_count;
1550 }
1551 }
1552
1553 FAIL_EXIT:
1554 *ptrptr = ptr;
1555 return -1;
1556 }
1557
1558
1559
1560
1561 /*************************************************
1562 * Find forward referenced subpattern *
1563 *************************************************/
1564
1565 /* This function scans along a pattern's text looking for capturing
1566 subpatterns, and counting them. If it finds a named pattern that matches the
1567 name it is given, it returns its number. Alternatively, if the name is NULL, it
1568 returns when it reaches a given numbered subpattern. This is used for forward
1569 references to subpatterns. We used to be able to start this scan from the
1570 current compiling point, using the current count value from cd->bracount, and
1571 do it all in a single loop, but the addition of the possibility of duplicate
1572 subpattern numbers means that we have to scan from the very start, in order to
1573 take account of such duplicates, and to use a recursive function to keep track
1574 of the different types of group.
1575
1576 Arguments:
1577 cd compile background data
1578 name name to seek, or NULL if seeking a numbered subpattern
1579 lorn name length, or subpattern number if name is NULL
1580 xmode TRUE if we are in /x mode
1581 utf TRUE if we are in UTF-8 / UTF-16 mode
1582
1583 Returns: the number of the found subpattern, or -1 if not found
1584 */
1585
1586 static int
1587 find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode,
1588 BOOL utf)
1589 {
1590 pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern;
1591 int count = 0;
1592 int rc;
1593
1594 /* If the pattern does not start with an opening parenthesis, the first call
1595 to find_parens_sub() will scan right to the end (if necessary). However, if it
1596 does start with a parenthesis, find_parens_sub() will return when it hits the
1597 matching closing parens. That is why we have to have a loop. */
1598
1599 for (;;)
1600 {
1601 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, &count);
1602 if (rc > 0 || *ptr++ == 0) break;
1603 }
1604
1605 return rc;
1606 }
1607
1608
1609
1610
1611 /*************************************************
1612 * Find first significant op code *
1613 *************************************************/
1614
1615 /* This is called by several functions that scan a compiled expression looking
1616 for a fixed first character, or an anchoring op code etc. It skips over things
1617 that do not influence this. For some calls, it makes sense to skip negative
1618 forward and all backward assertions, and also the \b assertion; for others it
1619 does not.
1620
1621 Arguments:
1622 code pointer to the start of the group
1623 skipassert TRUE if certain assertions are to be skipped
1624
1625 Returns: pointer to the first significant opcode
1626 */
1627
1628 static const pcre_uchar*
1629 first_significant_code(const pcre_uchar *code, BOOL skipassert)
1630 {
1631 for (;;)
1632 {
1633 switch ((int)*code)
1634 {
1635 case OP_ASSERT_NOT:
1636 case OP_ASSERTBACK:
1637 case OP_ASSERTBACK_NOT:
1638 if (!skipassert) return code;
1639 do code += GET(code, 1); while (*code == OP_ALT);
1640 code += PRIV(OP_lengths)[*code];
1641 break;
1642
1643 case OP_WORD_BOUNDARY:
1644 case OP_NOT_WORD_BOUNDARY:
1645 if (!skipassert) return code;
1646 /* Fall through */
1647
1648 case OP_CALLOUT:
1649 case OP_CREF:
1650 case OP_NCREF:
1651 case OP_RREF:
1652 case OP_NRREF:
1653 case OP_DEF:
1654 code += PRIV(OP_lengths)[*code];
1655 break;
1656
1657 default:
1658 return code;
1659 }
1660 }
1661 /* Control never reaches here */
1662 }
1663
1664
1665
1666
1667 /*************************************************
1668 * Find the fixed length of a branch *
1669 *************************************************/
1670
1671 /* Scan a branch and compute the fixed length of subject that will match it,
1672 if the length is fixed. This is needed for dealing with backward assertions.
1673 In UTF8 mode, the result is in characters rather than bytes. The branch is
1674 temporarily terminated with OP_END when this function is called.
1675
1676 This function is called when a backward assertion is encountered, so that if it
1677 fails, the error message can point to the correct place in the pattern.
1678 However, we cannot do this when the assertion contains subroutine calls,
1679 because they can be forward references. We solve this by remembering this case
1680 and doing the check at the end; a flag specifies which mode we are running in.
1681
1682 Arguments:
1683 code points to the start of the pattern (the bracket)
1684 utf TRUE in UTF-8 / UTF-16 mode
1685 atend TRUE if called when the pattern is complete
1686 cd the "compile data" structure
1687
1688 Returns: the fixed length,
1689 or -1 if there is no fixed length,
1690 or -2 if \C was encountered (in UTF-8 mode only)
1691 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1692 or -4 if an unknown opcode was encountered (internal error)
1693 */
1694
1695 static int
1696 find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
1697 {
1698 int length = -1;
1699
1700 register int branchlength = 0;
1701 register pcre_uchar *cc = code + 1 + LINK_SIZE;
1702
1703 /* Scan along the opcodes for this branch. If we get to the end of the
1704 branch, check the length against that of the other branches. */
1705
1706 for (;;)
1707 {
1708 int d;
1709 pcre_uchar *ce, *cs;
1710 register int op = *cc;
1711
1712 switch (op)
1713 {
1714 /* We only need to continue for OP_CBRA (normal capturing bracket) and
1715 OP_BRA (normal non-capturing bracket) because the other variants of these
1716 opcodes are all concerned with unlimited repeated groups, which of course
1717 are not of fixed length. */
1718
1719 case OP_CBRA:
1720 case OP_BRA:
1721 case OP_ONCE:
1722 case OP_ONCE_NC:
1723 case OP_COND:
1724 d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
1725 if (d < 0) return d;
1726 branchlength += d;
1727 do cc += GET(cc, 1); while (*cc == OP_ALT);
1728 cc += 1 + LINK_SIZE;
1729 break;
1730
1731 /* Reached end of a branch; if it's a ket it is the end of a nested call.
1732 If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1733 an ALT. If it is END it's the end of the outer call. All can be handled by
1734 the same code. Note that we must not include the OP_KETRxxx opcodes here,
1735 because they all imply an unlimited repeat. */
1736
1737 case OP_ALT:
1738 case OP_KET:
1739 case OP_END:
1740 case OP_ACCEPT:
1741 case OP_ASSERT_ACCEPT:
1742 if (length < 0) length = branchlength;
1743 else if (length != branchlength) return -1;
1744 if (*cc != OP_ALT) return length;
1745 cc += 1 + LINK_SIZE;
1746 branchlength = 0;
1747 break;
1748
1749 /* A true recursion implies not fixed length, but a subroutine call may
1750 be OK. If the subroutine is a forward reference, we can't deal with
1751 it until the end of the pattern, so return -3. */
1752
1753 case OP_RECURSE:
1754 if (!atend) return -3;
1755 cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1756 do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1757 if (cc > cs && cc < ce) return -1; /* Recursion */
1758 d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
1759 if (d < 0) return d;
1760 branchlength += d;
1761 cc += 1 + LINK_SIZE;
1762 break;
1763
1764 /* Skip over assertive subpatterns */
1765
1766 case OP_ASSERT:
1767 case OP_ASSERT_NOT:
1768 case OP_ASSERTBACK:
1769 case OP_ASSERTBACK_NOT:
1770 do cc += GET(cc, 1); while (*cc == OP_ALT);
1771 cc += PRIV(OP_lengths)[*cc];
1772 break;
1773
1774 /* Skip over things that don't match chars */
1775
1776 case OP_MARK:
1777 case OP_PRUNE_ARG:
1778 case OP_SKIP_ARG:
1779 case OP_THEN_ARG:
1780 cc += cc[1] + PRIV(OP_lengths)[*cc];
1781 break;
1782
1783 case OP_CALLOUT:
1784 case OP_CIRC:
1785 case OP_CIRCM:
1786 case OP_CLOSE:
1787 case OP_COMMIT:
1788 case OP_CREF:
1789 case OP_DEF:
1790 case OP_DOLL:
1791 case OP_DOLLM:
1792 case OP_EOD:
1793 case OP_EODN:
1794 case OP_FAIL:
1795 case OP_NCREF:
1796 case OP_NRREF:
1797 case OP_NOT_WORD_BOUNDARY:
1798 case OP_PRUNE:
1799 case OP_REVERSE:
1800 case OP_RREF:
1801 case OP_SET_SOM:
1802 case OP_SKIP:
1803 case OP_SOD:
1804 case OP_SOM:
1805 case OP_THEN:
1806 case OP_WORD_BOUNDARY:
1807 cc += PRIV(OP_lengths)[*cc];
1808 break;
1809
1810 /* Handle literal characters */
1811
1812 case OP_CHAR:
1813 case OP_CHARI:
1814 case OP_NOT:
1815 case OP_NOTI:
1816 branchlength++;
1817 cc += 2;
1818 #ifdef SUPPORT_UTF
1819 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1820 #endif
1821 break;
1822
1823 /* Handle exact repetitions. The count is already in characters, but we
1824 need to skip over a multibyte character in UTF8 mode. */
1825
1826 case OP_EXACT:
1827 case OP_EXACTI:
1828 case OP_NOTEXACT:
1829 case OP_NOTEXACTI:
1830 branchlength += GET2(cc,1);
1831 cc += 2 + IMM2_SIZE;
1832 #ifdef SUPPORT_UTF
1833 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1834 #endif
1835 break;
1836
1837 case OP_TYPEEXACT:
1838 branchlength += GET2(cc,1);
1839 if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
1840 cc += 1 + IMM2_SIZE + 1;
1841 break;
1842
1843 /* Handle single-char matchers */
1844
1845 case OP_PROP:
1846 case OP_NOTPROP:
1847 cc += 2;
1848 /* Fall through */
1849
1850 case OP_HSPACE:
1851 case OP_VSPACE:
1852 case OP_NOT_HSPACE:
1853 case OP_NOT_VSPACE:
1854 case OP_NOT_DIGIT:
1855 case OP_DIGIT:
1856 case OP_NOT_WHITESPACE:
1857 case OP_WHITESPACE:
1858 case OP_NOT_WORDCHAR:
1859 case OP_WORDCHAR:
1860 case OP_ANY:
1861 case OP_ALLANY:
1862 branchlength++;
1863 cc++;
1864 break;
1865
1866 /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1867 otherwise \C is coded as OP_ALLANY. */
1868
1869 case OP_ANYBYTE:
1870 return -2;
1871
1872 /* Check a class for variable quantification */
1873
1874 #if defined SUPPORT_UTF || defined COMPILE_PCRE16
1875 case OP_XCLASS:
1876 cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
1877 /* Fall through */
1878 #endif
1879
1880 case OP_CLASS:
1881 case OP_NCLASS:
1882 cc += PRIV(OP_lengths)[OP_CLASS];
1883
1884 switch (*cc)
1885 {
1886 case OP_CRPLUS:
1887 case OP_CRMINPLUS:
1888 case OP_CRSTAR:
1889 case OP_CRMINSTAR:
1890 case OP_CRQUERY:
1891 case OP_CRMINQUERY:
1892 return -1;
1893
1894 case OP_CRRANGE:
1895 case OP_CRMINRANGE:
1896 if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1897 branchlength += GET2(cc,1);
1898 cc += 1 + 2 * IMM2_SIZE;
1899 break;
1900
1901 default:
1902 branchlength++;
1903 }
1904 break;
1905
1906 /* Anything else is variable length */
1907
1908 case OP_ANYNL:
1909 case OP_BRAMINZERO:
1910 case OP_BRAPOS:
1911 case OP_BRAPOSZERO:
1912 case OP_BRAZERO:
1913 case OP_CBRAPOS:
1914 case OP_EXTUNI:
1915 case OP_KETRMAX:
1916 case OP_KETRMIN:
1917 case OP_KETRPOS:
1918 case OP_MINPLUS:
1919 case OP_MINPLUSI:
1920 case OP_MINQUERY:
1921 case OP_MINQUERYI:
1922 case OP_MINSTAR:
1923 case OP_MINSTARI:
1924 case OP_MINUPTO:
1925 case OP_MINUPTOI:
1926 case OP_NOTMINPLUS:
1927 case OP_NOTMINPLUSI:
1928 case OP_NOTMINQUERY:
1929 case OP_NOTMINQUERYI:
1930 case OP_NOTMINSTAR:
1931 case OP_NOTMINSTARI:
1932 case OP_NOTMINUPTO:
1933 case OP_NOTMINUPTOI:
1934 case OP_NOTPLUS:
1935 case OP_NOTPLUSI:
1936 case OP_NOTPOSPLUS:
1937 case OP_NOTPOSPLUSI:
1938 case OP_NOTPOSQUERY:
1939 case OP_NOTPOSQUERYI:
1940 case OP_NOTPOSSTAR:
1941 case OP_NOTPOSSTARI:
1942 case OP_NOTPOSUPTO:
1943 case OP_NOTPOSUPTOI:
1944 case OP_NOTQUERY:
1945 case OP_NOTQUERYI:
1946 case OP_NOTSTAR:
1947 case OP_NOTSTARI:
1948 case OP_NOTUPTO:
1949 case OP_NOTUPTOI:
1950 case OP_PLUS:
1951 case OP_PLUSI:
1952 case OP_POSPLUS:
1953 case OP_POSPLUSI:
1954 case OP_POSQUERY:
1955 case OP_POSQUERYI:
1956 case OP_POSSTAR:
1957 case OP_POSSTARI:
1958 case OP_POSUPTO:
1959 case OP_POSUPTOI:
1960 case OP_QUERY:
1961 case OP_QUERYI:
1962 case OP_REF:
1963 case OP_REFI:
1964 case OP_SBRA:
1965 case OP_SBRAPOS:
1966 case OP_SCBRA:
1967 case OP_SCBRAPOS:
1968 case OP_SCOND:
1969 case OP_SKIPZERO:
1970 case OP_STAR:
1971 case OP_STARI:
1972 case OP_TYPEMINPLUS:
1973 case OP_TYPEMINQUERY:
1974 case OP_TYPEMINSTAR:
1975 case OP_TYPEMINUPTO:
1976 case OP_TYPEPLUS:
1977 case OP_TYPEPOSPLUS:
1978 case OP_TYPEPOSQUERY:
1979 case OP_TYPEPOSSTAR:
1980 case OP_TYPEPOSUPTO:
1981 case OP_TYPEQUERY:
1982 case OP_TYPESTAR:
1983 case OP_TYPEUPTO:
1984 case OP_UPTO:
1985 case OP_UPTOI:
1986 return -1;
1987
1988 /* Catch unrecognized opcodes so that when new ones are added they
1989 are not forgotten, as has happened in the past. */
1990
1991 default:
1992 return -4;
1993 }
1994 }
1995 /* Control never gets here */
1996 }
1997
1998
1999
2000
2001 /*************************************************
2002 * Scan compiled regex for specific bracket *
2003 *************************************************/
2004
2005 /* This little function scans through a compiled pattern until it finds a
2006 capturing bracket with the given number, or, if the number is negative, an
2007 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
2008 so that it can be called from pcre_study() when finding the minimum matching
2009 length.
2010
2011 Arguments:
2012 code points to start of expression
2013 utf TRUE in UTF-8 / UTF-16 mode
2014 number the required bracket number or negative to find a lookbehind
2015
2016 Returns: pointer to the opcode for the bracket, or NULL if not found
2017 */
2018
2019 const pcre_uchar *
2020 PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2021 {
2022 for (;;)
2023 {
2024 register int c = *code;
2025
2026 if (c == OP_END) return NULL;
2027
2028 /* XCLASS is used for classes that cannot be represented just by a bit
2029 map. This includes negated single high-valued characters. The length in
2030 the table is zero; the actual length is stored in the compiled code. */
2031
2032 if (c == OP_XCLASS) code += GET(code, 1);
2033
2034 /* Handle recursion */
2035
2036 else if (c == OP_REVERSE)
2037 {
2038 if (number < 0) return (pcre_uchar *)code;
2039 code += PRIV(OP_lengths)[c];
2040 }
2041
2042 /* Handle capturing bracket */
2043
2044 else if (c == OP_CBRA || c == OP_SCBRA ||
2045 c == OP_CBRAPOS || c == OP_SCBRAPOS)
2046 {
2047 int n = GET2(code, 1+LINK_SIZE);
2048 if (n == number) return (pcre_uchar *)code;
2049 code += PRIV(OP_lengths)[c];
2050 }
2051
2052 /* Otherwise, we can get the item's length from the table, except that for
2053 repeated character types, we have to test for \p and \P, which have an extra
2054 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2055 must add in its length. */
2056
2057 else
2058 {
2059 switch(c)
2060 {
2061 case OP_TYPESTAR:
2062 case OP_TYPEMINSTAR:
2063 case OP_TYPEPLUS:
2064 case OP_TYPEMINPLUS:
2065 case OP_TYPEQUERY:
2066 case OP_TYPEMINQUERY:
2067 case OP_TYPEPOSSTAR:
2068 case OP_TYPEPOSPLUS:
2069 case OP_TYPEPOSQUERY:
2070 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2071 break;
2072
2073 case OP_TYPEUPTO:
2074 case OP_TYPEMINUPTO:
2075 case OP_TYPEEXACT:
2076 case OP_TYPEPOSUPTO:
2077 if (code[1 + IMM2_SIZE] == OP_PROP
2078 || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2079 break;
2080
2081 case OP_MARK:
2082 case OP_PRUNE_ARG:
2083 case OP_SKIP_ARG:
2084 code += code[1];
2085 break;
2086
2087 case OP_THEN_ARG:
2088 code += code[1];
2089 break;
2090 }
2091
2092 /* Add in the fixed length from the table */
2093
2094 code += PRIV(OP_lengths)[c];
2095
2096 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2097 a multi-byte character. The length in the table is a minimum, so we have to
2098 arrange to skip the extra bytes. */
2099
2100 #ifdef SUPPORT_UTF
2101 if (utf) switch(c)
2102 {
2103 case OP_CHAR:
2104 case OP_CHARI:
2105 case OP_EXACT:
2106 case OP_EXACTI:
2107 case OP_UPTO:
2108 case OP_UPTOI:
2109 case OP_MINUPTO:
2110 case OP_MINUPTOI:
2111 case OP_POSUPTO:
2112 case OP_POSUPTOI:
2113 case OP_STAR:
2114 case OP_STARI:
2115 case OP_MINSTAR:
2116 case OP_MINSTARI:
2117 case OP_POSSTAR:
2118 case OP_POSSTARI:
2119 case OP_PLUS:
2120 case OP_PLUSI:
2121 case OP_MINPLUS:
2122 case OP_MINPLUSI:
2123 case OP_POSPLUS:
2124 case OP_POSPLUSI:
2125 case OP_QUERY:
2126 case OP_QUERYI:
2127 case OP_MINQUERY:
2128 case OP_MINQUERYI:
2129 case OP_POSQUERY:
2130 case OP_POSQUERYI:
2131 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2132 break;
2133 }
2134 #else
2135 (void)(utf); /* Keep compiler happy by referencing function argument */
2136 #endif
2137 }
2138 }
2139 }
2140
2141
2142
2143 /*************************************************
2144 * Scan compiled regex for recursion reference *
2145 *************************************************/
2146
2147 /* This little function scans through a compiled pattern until it finds an
2148 instance of OP_RECURSE.
2149
2150 Arguments:
2151 code points to start of expression
2152 utf TRUE in UTF-8 / UTF-16 mode
2153
2154 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
2155 */
2156
2157 static const pcre_uchar *
2158 find_recurse(const pcre_uchar *code, BOOL utf)
2159 {
2160 for (;;)
2161 {
2162 register int c = *code;
2163 if (c == OP_END) return NULL;
2164 if (c == OP_RECURSE) return code;
2165
2166 /* XCLASS is used for classes that cannot be represented just by a bit
2167 map. This includes negated single high-valued characters. The length in
2168 the table is zero; the actual length is stored in the compiled code. */
2169
2170 if (c == OP_XCLASS) code += GET(code, 1);
2171
2172 /* Otherwise, we can get the item's length from the table, except that for
2173 repeated character types, we have to test for \p and \P, which have an extra
2174 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2175 must add in its length. */
2176
2177 else
2178 {
2179 switch(c)
2180 {
2181 case OP_TYPESTAR:
2182 case OP_TYPEMINSTAR:
2183 case OP_TYPEPLUS:
2184 case OP_TYPEMINPLUS:
2185 case OP_TYPEQUERY:
2186 case OP_TYPEMINQUERY:
2187 case OP_TYPEPOSSTAR:
2188 case OP_TYPEPOSPLUS:
2189 case OP_TYPEPOSQUERY:
2190 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2191 break;
2192
2193 case OP_TYPEPOSUPTO:
2194 case OP_TYPEUPTO:
2195 case OP_TYPEMINUPTO:
2196 case OP_TYPEEXACT:
2197 if (code[1 + IMM2_SIZE] == OP_PROP
2198 || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2199 break;
2200
2201 case OP_MARK:
2202 case OP_PRUNE_ARG:
2203 case OP_SKIP_ARG:
2204 code += code[1];
2205 break;
2206
2207 case OP_THEN_ARG:
2208 code += code[1];
2209 break;
2210 }
2211
2212 /* Add in the fixed length from the table */
2213
2214 code += PRIV(OP_lengths)[c];
2215
2216 /* In UTF-8 mode, opcodes that are followed by a character may be followed
2217 by a multi-byte character. The length in the table is a minimum, so we have
2218 to arrange to skip the extra bytes. */
2219
2220 #ifdef SUPPORT_UTF
2221 if (utf) switch(c)
2222 {
2223 case OP_CHAR:
2224 case OP_CHARI:
2225 case OP_EXACT:
2226 case OP_EXACTI:
2227 case OP_UPTO:
2228 case OP_UPTOI:
2229 case OP_MINUPTO:
2230 case OP_MINUPTOI:
2231 case OP_POSUPTO:
2232 case OP_POSUPTOI:
2233 case OP_STAR:
2234 case OP_STARI:
2235 case OP_MINSTAR:
2236 case OP_MINSTARI:
2237 case OP_POSSTAR:
2238 case OP_POSSTARI:
2239 case OP_PLUS:
2240 case OP_PLUSI:
2241 case OP_MINPLUS:
2242 case OP_MINPLUSI:
2243 case OP_POSPLUS:
2244 case OP_POSPLUSI:
2245 case OP_QUERY:
2246 case OP_QUERYI:
2247 case OP_MINQUERY:
2248 case OP_MINQUERYI:
2249 case OP_POSQUERY:
2250 case OP_POSQUERYI:
2251 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2252 break;
2253 }
2254 #else
2255 (void)(utf); /* Keep compiler happy by referencing function argument */
2256 #endif
2257 }
2258 }
2259 }
2260
2261
2262
2263 /*************************************************
2264 * Scan compiled branch for non-emptiness *
2265 *************************************************/
2266
2267 /* This function scans through a branch of a compiled pattern to see whether it
2268 can match the empty string or not. It is called from could_be_empty()
2269 below and from compile_branch() when checking for an unlimited repeat of a
2270 group that can match nothing. Note that first_significant_code() skips over
2271 backward and negative forward assertions when its final argument is TRUE. If we
2272 hit an unclosed bracket, we return "empty" - this means we've struck an inner
2273 bracket whose current branch will already have been scanned.
2274
2275 Arguments:
2276 code points to start of search
2277 endcode points to where to stop
2278 utf TRUE if in UTF-8 / UTF-16 mode
2279 cd contains pointers to tables etc.
2280
2281 Returns: TRUE if what is matched could be empty
2282 */
2283
2284 static BOOL
2285 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2286 BOOL utf, compile_data *cd)
2287 {
2288 register int c;
2289 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2290 code < endcode;
2291 code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2292 {
2293 const pcre_uchar *ccode;
2294
2295 c = *code;
2296
2297 /* Skip over forward assertions; the other assertions are skipped by
2298 first_significant_code() with a TRUE final argument. */
2299
2300 if (c == OP_ASSERT)
2301 {
2302 do code += GET(code, 1); while (*code == OP_ALT);
2303 c = *code;
2304 continue;
2305 }
2306
2307 /* For a recursion/subroutine call, if its end has been reached, which
2308 implies a backward reference subroutine call, we can scan it. If it's a
2309 forward reference subroutine call, we can't. To detect forward reference
2310 we have to scan up the list that is kept in the workspace. This function is
2311 called only when doing the real compile, not during the pre-compile that
2312 measures the size of the compiled pattern. */
2313
2314 if (c == OP_RECURSE)
2315 {
2316 const pcre_uchar *scode;
2317 BOOL empty_branch;
2318
2319 /* Test for forward reference */
2320
2321 for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
2322 if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;
2323
2324 /* Not a forward reference, test for completed backward reference */
2325
2326 empty_branch = FALSE;
2327 scode = cd->start_code + GET(code, 1);
2328 if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
2329
2330 /* Completed backwards reference */
2331
2332 do
2333 {
2334 if (could_be_empty_branch(scode, endcode, utf, cd))
2335 {
2336 empty_branch = TRUE;
2337 break;
2338 }
2339 scode += GET(scode, 1);
2340 }
2341 while (*scode == OP_ALT);
2342
2343 if (!empty_branch) return FALSE; /* All branches are non-empty */
2344 continue;
2345 }
2346
2347 /* Groups with zero repeats can of course be empty; skip them. */
2348
2349 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2350 c == OP_BRAPOSZERO)
2351 {
2352 code += PRIV(OP_lengths)[c];
2353 do code += GET(code, 1); while (*code == OP_ALT);
2354 c = *code;
2355 continue;
2356 }
2357
2358 /* A nested group that is already marked as "could be empty" can just be
2359 skipped. */
2360
2361 if (c == OP_SBRA || c == OP_SBRAPOS ||
2362 c == OP_SCBRA || c == OP_SCBRAPOS)
2363 {
2364 do code += GET(code, 1); while (*code == OP_ALT);
2365 c = *code;
2366 continue;
2367 }
2368
2369 /* For other groups, scan the branches. */
2370
2371 if (c == OP_BRA || c == OP_BRAPOS ||
2372 c == OP_CBRA || c == OP_CBRAPOS ||
2373 c == OP_ONCE || c == OP_ONCE_NC ||
2374 c == OP_COND)
2375 {
2376 BOOL empty_branch;
2377 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
2378
2379 /* If a conditional group has only one branch, there is a second, implied,
2380 empty branch, so just skip over the conditional, because it could be empty.
2381 Otherwise, scan the individual branches of the group. */
2382
2383 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2384 code += GET(code, 1);
2385 else
2386 {
2387 empty_branch = FALSE;
2388 do
2389 {
2390 if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd))
2391 empty_branch = TRUE;
2392 code += GET(code, 1);
2393 }
2394 while (*code == OP_ALT);
2395 if (!empty_branch) return FALSE; /* All branches are non-empty */
2396 }
2397
2398 c = *code;
2399 continue;
2400 }
2401
2402 /* Handle the other opcodes */
2403
2404 switch (c)
2405 {
2406 /* Check for quantifiers after a class. XCLASS is used for classes that
2407 cannot be represented just by a bit map. This includes negated single
2408 high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2409 actual length is stored in the compiled code, so we must update "code"
2410 here. */
2411
2412 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2413 case OP_XCLASS:
2414 ccode = code += GET(code, 1);
2415 goto CHECK_CLASS_REPEAT;
2416 #endif
2417
2418 case OP_CLASS:
2419 case OP_NCLASS:
2420 ccode = code + PRIV(OP_lengths)[OP_CLASS];
2421
2422 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2423 CHECK_CLASS_REPEAT:
2424 #endif
2425
2426 switch (*ccode)
2427 {
2428 case OP_CRSTAR: /* These could be empty; continue */
2429 case OP_CRMINSTAR:
2430 case OP_CRQUERY:
2431 case OP_CRMINQUERY:
2432 break;
2433
2434 default: /* Non-repeat => class must match */
2435 case OP_CRPLUS: /* These repeats aren't empty */
2436 case OP_CRMINPLUS:
2437 return FALSE;
2438
2439 case OP_CRRANGE:
2440 case OP_CRMINRANGE:
2441 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2442 break;
2443 }
2444 break;
2445
2446 /* Opcodes that must match a character */
2447
2448 case OP_PROP:
2449 case OP_NOTPROP:
2450 case OP_EXTUNI:
2451 case OP_NOT_DIGIT:
2452 case OP_DIGIT:
2453 case OP_NOT_WHITESPACE:
2454 case OP_WHITESPACE:
2455 case OP_NOT_WORDCHAR:
2456 case OP_WORDCHAR:
2457 case OP_ANY:
2458 case OP_ALLANY:
2459 case OP_ANYBYTE:
2460 case OP_CHAR:
2461 case OP_CHARI:
2462 case OP_NOT:
2463 case OP_NOTI:
2464 case OP_PLUS:
2465 case OP_MINPLUS:
2466 case OP_POSPLUS:
2467 case OP_EXACT:
2468 case OP_NOTPLUS:
2469 case OP_NOTMINPLUS:
2470 case OP_NOTPOSPLUS:
2471 case OP_NOTEXACT:
2472 case OP_TYPEPLUS:
2473 case OP_TYPEMINPLUS:
2474 case OP_TYPEPOSPLUS:
2475 case OP_TYPEEXACT:
2476 return FALSE;
2477
2478 /* These are going to continue, as they may be empty, but we have to
2479 fudge the length for the \p and \P cases. */
2480
2481 case OP_TYPESTAR:
2482 case OP_TYPEMINSTAR:
2483 case OP_TYPEPOSSTAR:
2484 case OP_TYPEQUERY:
2485 case OP_TYPEMINQUERY:
2486 case OP_TYPEPOSQUERY:
2487 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2488 break;
2489
2490 /* Same for these */
2491
2492 case OP_TYPEUPTO:
2493 case OP_TYPEMINUPTO:
2494 case OP_TYPEPOSUPTO:
2495 if (code[1 + IMM2_SIZE] == OP_PROP
2496 || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2497 break;
2498
2499 /* End of branch */
2500
2501 case OP_KET:
2502 case OP_KETRMAX:
2503 case OP_KETRMIN:
2504 case OP_KETRPOS:
2505 case OP_ALT:
2506 return TRUE;
2507
2508 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2509 MINUPTO, and POSUPTO may be followed by a multibyte character */
2510
2511 #ifdef SUPPORT_UTF
2512 case OP_STAR:
2513 case OP_STARI:
2514 case OP_MINSTAR:
2515 case OP_MINSTARI:
2516 case OP_POSSTAR:
2517 case OP_POSSTARI:
2518 case OP_QUERY:
2519 case OP_QUERYI:
2520 case OP_MINQUERY:
2521 case OP_MINQUERYI:
2522 case OP_POSQUERY:
2523 case OP_POSQUERYI:
2524 if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2525 break;
2526
2527 case OP_UPTO:
2528 case OP_UPTOI:
2529 case OP_MINUPTO:
2530 case OP_MINUPTOI:
2531 case OP_POSUPTO:
2532 case OP_POSUPTOI:
2533 if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2534 break;
2535 #endif
2536
2537 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2538 string. */
2539
2540 case OP_MARK:
2541 case OP_PRUNE_ARG:
2542 case OP_SKIP_ARG:
2543 code += code[1];
2544 break;
2545
2546 case OP_THEN_ARG:
2547 code += code[1];
2548 break;
2549
2550 /* None of the remaining opcodes are required to match a character. */
2551
2552 default:
2553 break;
2554 }
2555 }
2556
2557 return TRUE;
2558 }
2559
2560
2561
2562 /*************************************************
2563 * Scan compiled regex for non-emptiness *
2564 *************************************************/
2565
2566 /* This function is called to check for left recursive calls. We want to check
2567 the current branch of the current pattern to see if it could match the empty
2568 string. If it could, we must look outwards for branches at other levels,
2569 stopping when we pass beyond the bracket which is the subject of the recursion.
2570 This function is called only during the real compile, not during the
2571 pre-compile.
2572
2573 Arguments:
2574 code points to start of the recursion
2575 endcode points to where to stop (current RECURSE item)
2576 bcptr points to the chain of current (unclosed) branch starts
2577 utf TRUE if in UTF-8 / UTF-16 mode
2578 cd pointers to tables etc
2579
2580 Returns: TRUE if what is matched could be empty
2581 */
2582
2583 static BOOL
2584 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2585 branch_chain *bcptr, BOOL utf, compile_data *cd)
2586 {
2587 while (bcptr != NULL && bcptr->current_branch >= code)
2588 {
2589 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd))
2590 return FALSE;
2591 bcptr = bcptr->outer;
2592 }
2593 return TRUE;
2594 }
2595
2596
2597
2598 /*************************************************
2599 * Check for POSIX class syntax *
2600 *************************************************/
2601
2602 /* This function is called when the sequence "[:" or "[." or "[=" is
2603 encountered in a character class. It checks whether this is followed by a
2604 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2605 reach an unescaped ']' without the special preceding character, return FALSE.
2606
2607 Originally, this function only recognized a sequence of letters between the
2608 terminators, but it seems that Perl recognizes any sequence of characters,
2609 though of course unknown POSIX names are subsequently rejected. Perl gives an
2610 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2611 didn't consider this to be a POSIX class. Likewise for [:1234:].
2612
2613 The problem in trying to be exactly like Perl is in the handling of escapes. We
2614 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2615 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2616 below handles the special case of \], but does not try to do any other escape
2617 processing. This makes it different from Perl for cases such as [:l\ower:]
2618 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2619 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
2620 I think.
2621
2622 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2623 It seems that the appearance of a nested POSIX class supersedes an apparent
2624 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2625 a digit.
2626
2627 In Perl, unescaped square brackets may also appear as part of class names. For
2628 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2629 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2630 seem right at all. PCRE does not allow closing square brackets in POSIX class
2631 names.
2632
2633 Arguments:
2634 ptr pointer to the initial [
2635 endptr where to return the end pointer
2636
2637 Returns: TRUE or FALSE
2638 */
2639
2640 static BOOL
2641 check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
2642 {
2643 int terminator; /* Don't combine these lines; the Solaris cc */
2644 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2645 for (++ptr; *ptr != 0; ptr++)
2646 {
2647 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2648 ptr++;
2649 else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2650 else
2651 {
2652 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2653 {
2654 *endptr = ptr;
2655 return TRUE;
2656 }
2657 if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
2658 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2659 ptr[1] == CHAR_EQUALS_SIGN) &&
2660 check_posix_syntax(ptr, endptr))
2661 return FALSE;
2662 }
2663 }
2664 return FALSE;
2665 }
2666
2667
2668
2669
2670 /*************************************************
2671 * Check POSIX class name *
2672 *************************************************/
2673
2674 /* This function is called to check the name given in a POSIX-style class entry
2675 such as [:alnum:].
2676
2677 Arguments:
2678 ptr points to the first letter
2679 len the length of the name
2680
2681 Returns: a value representing the name, or -1 if unknown
2682 */
2683
2684 static int
2685 check_posix_name(const pcre_uchar *ptr, int len)
2686 {
2687 const char *pn = posix_names;
2688 register int yield = 0;
2689 while (posix_name_lengths[yield] != 0)
2690 {
2691 if (len == posix_name_lengths[yield] &&
2692 STRNCMP_UC_C8(ptr, pn, len) == 0) return yield;
2693 pn += posix_name_lengths[yield] + 1;
2694 yield++;
2695 }
2696 return -1;
2697 }
2698
2699
2700 /*************************************************
2701 * Adjust OP_RECURSE items in repeated group *
2702 *************************************************/
2703
2704 /* OP_RECURSE items contain an offset from the start of the regex to the group
2705 that is referenced. This means that groups can be replicated for fixed
2706 repetition simply by copying (because the recursion is allowed to refer to
2707 earlier groups that are outside the current group). However, when a group is
2708 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2709 inserted before it, after it has been compiled. This means that any OP_RECURSE
2710 items within it that refer to the group itself or any contained groups have to
2711 have their offsets adjusted. That one of the jobs of this function. Before it
2712 is called, the partially compiled regex must be temporarily terminated with
2713 OP_END.
2714
2715 This function has been extended with the possibility of forward references for
2716 recursions and subroutine calls. It must also check the list of such references
2717 for the group we are dealing with. If it finds that one of the recursions in
2718 the current group is on this list, it adjusts the offset in the list, not the
2719 value in the reference (which is a group number).
2720
2721 Arguments:
2722 group points to the start of the group
2723 adjust the amount by which the group is to be moved
2724 utf TRUE in UTF-8 / UTF-16 mode
2725 cd contains pointers to tables etc.
2726 save_hwm the hwm forward reference pointer at the start of the group
2727
2728 Returns: nothing
2729 */
2730
2731 static void
2732 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
2733 pcre_uchar *save_hwm)
2734 {
2735 pcre_uchar *ptr = group;
2736
2737 while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
2738 {
2739 int offset;
2740 pcre_uchar *hc;
2741
2742 /* See if this recursion is on the forward reference list. If so, adjust the
2743 reference. */
2744
2745 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2746 {
2747 offset = GET(hc, 0);
2748 if (cd->start_code + offset == ptr + 1)
2749 {
2750 PUT(hc, 0, offset + adjust);
2751 break;
2752 }
2753 }
2754
2755 /* Otherwise, adjust the recursion offset if it's after the start of this
2756 group. */
2757
2758 if (hc >= cd->hwm)
2759 {
2760 offset = GET(ptr, 1);
2761 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2762 }
2763
2764 ptr += 1 + LINK_SIZE;
2765 }
2766 }
2767
2768
2769
2770 /*************************************************
2771 * Insert an automatic callout point *
2772 *************************************************/
2773
2774 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2775 callout points before each pattern item.
2776
2777 Arguments:
2778 code current code pointer
2779 ptr current pattern pointer
2780 cd pointers to tables etc
2781
2782 Returns: new code pointer
2783 */
2784
2785 static pcre_uchar *
2786 auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
2787 {
2788 *code++ = OP_CALLOUT;
2789 *code++ = 255;
2790 PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
2791 PUT(code, LINK_SIZE, 0); /* Default length */
2792 return code + 2 * LINK_SIZE;
2793 }
2794
2795
2796
2797 /*************************************************
2798 * Complete a callout item *
2799 *************************************************/
2800
2801 /* A callout item contains the length of the next item in the pattern, which
2802 we can't fill in till after we have reached the relevant point. This is used
2803 for both automatic and manual callouts.
2804
2805 Arguments:
2806 previous_callout points to previous callout item
2807 ptr current pattern pointer
2808 cd pointers to tables etc
2809
2810 Returns: nothing
2811 */
2812
2813 static void
2814 complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
2815 {
2816 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2817 PUT(previous_callout, 2 + LINK_SIZE, length);
2818 }
2819
2820
2821
2822 #ifdef SUPPORT_UCP
2823 /*************************************************
2824 * Get othercase range *
2825 *************************************************/
2826
2827 /* This function is passed the start and end of a class range, in UTF-8 mode
2828 with UCP support. It searches up the characters, looking for internal ranges of
2829 characters in the "other" case. Each call returns the next one, updating the
2830 start address.
2831
2832 Arguments:
2833 cptr points to starting character value; updated
2834 d end value
2835 ocptr where to put start of othercase range
2836 odptr where to put end of othercase range
2837
2838 Yield: TRUE when range returned; FALSE when no more
2839 */
2840
2841 static BOOL
2842 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2843 unsigned int *odptr)
2844 {
2845 unsigned int c, othercase, next;
2846
2847 for (c = *cptr; c <= d; c++)
2848 { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2849
2850 if (c > d) return FALSE;
2851
2852 *ocptr = othercase;
2853 next = othercase + 1;
2854
2855 for (++c; c <= d; c++)
2856 {
2857 if (UCD_OTHERCASE(c) != next) break;
2858 next++;
2859 }
2860
2861 *odptr = next - 1;
2862 *cptr = c;
2863
2864 return TRUE;
2865 }
2866
2867
2868
2869 /*************************************************
2870 * Check a character and a property *
2871 *************************************************/
2872
2873 /* This function is called by check_auto_possessive() when a property item
2874 is adjacent to a fixed character.
2875
2876 Arguments:
2877 c the character
2878 ptype the property type
2879 pdata the data for the type
2880 negated TRUE if it's a negated property (\P or \p{^)
2881
2882 Returns: TRUE if auto-possessifying is OK
2883 */
2884
2885 static BOOL
2886 check_char_prop(int c, int ptype, int pdata, BOOL negated)
2887 {
2888 const ucd_record *prop = GET_UCD(c);
2889 switch(ptype)
2890 {
2891 case PT_LAMP:
2892 return (prop->chartype == ucp_Lu ||
2893 prop->chartype == ucp_Ll ||
2894 prop->chartype == ucp_Lt) == negated;
2895
2896 case PT_GC:
2897 return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2898
2899 case PT_PC:
2900 return (pdata == prop->chartype) == negated;
2901
2902 case PT_SC:
2903 return (pdata == prop->script) == negated;
2904
2905 /* These are specials */
2906
2907 case PT_ALNUM:
2908 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2909 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2910
2911 case PT_SPACE: /* Perl space */
2912 return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2913 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2914 == negated;
2915
2916 case PT_PXSPACE: /* POSIX space */
2917 return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2918 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2919 c == CHAR_FF || c == CHAR_CR)
2920 == negated;
2921
2922 case PT_WORD:
2923 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2924 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2925 c == CHAR_UNDERSCORE) == negated;
2926 }
2927 return FALSE;
2928 }
2929 #endif /* SUPPORT_UCP */
2930
2931
2932
2933 /*************************************************
2934 * Check if auto-possessifying is possible *
2935 *************************************************/
2936
2937 /* This function is called for unlimited repeats of certain items, to see
2938 whether the next thing could possibly match the repeated item. If not, it makes
2939 sense to automatically possessify the repeated item.
2940
2941 Arguments:
2942 previous pointer to the repeated opcode
2943 utf TRUE in UTF-8 / UTF-16 mode
2944 ptr next character in pattern
2945 options options bits
2946 cd contains pointers to tables etc.
2947
2948 Returns: TRUE if possessifying is wanted
2949 */
2950
2951 static BOOL
2952 check_auto_possessive(const pcre_uchar *previous, BOOL utf,
2953 const pcre_uchar *ptr, int options, compile_data *cd)
2954 {
2955 pcre_int32 c, next;
2956 int op_code = *previous++;
2957
2958 /* Skip whitespace and comments in extended mode */
2959
2960 if ((options & PCRE_EXTENDED) != 0)
2961 {
2962 for (;;)
2963 {
2964 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2965 if (*ptr == CHAR_NUMBER_SIGN)
2966 {
2967 ptr++;
2968 while (*ptr != 0)
2969 {
2970 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2971 ptr++;
2972 #ifdef SUPPORT_UTF
2973 if (utf) FORWARDCHAR(ptr);
2974 #endif
2975 }
2976 }
2977 else break;
2978 }
2979 }
2980
2981 /* If the next item is one that we can handle, get its value. A non-negative
2982 value is a character, a negative value is an escape value. */
2983
2984 if (*ptr == CHAR_BACKSLASH)
2985 {
2986 int temperrorcode = 0;
2987 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2988 if (temperrorcode != 0) return FALSE;
2989 ptr++; /* Point after the escape sequence */
2990 }
2991 else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0)
2992 {
2993 #ifdef SUPPORT_UTF
2994 if (utf) { GETCHARINC(next, ptr); } else
2995 #endif
2996 next = *ptr++;
2997 }
2998 else return FALSE;
2999
3000 /* Skip whitespace and comments in extended mode */
3001
3002 if ((options & PCRE_EXTENDED) != 0)
3003 {
3004 for (;;)
3005 {
3006 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
3007 if (*ptr == CHAR_NUMBER_SIGN)
3008 {
3009 ptr++;
3010 while (*ptr != 0)
3011 {
3012 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
3013 ptr++;
3014 #ifdef SUPPORT_UTF
3015 if (utf) FORWARDCHAR(ptr);
3016 #endif
3017 }
3018 }
3019 else break;
3020 }
3021 }
3022
3023 /* If the next thing is itself optional, we have to give up. */
3024
3025 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3026 STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3027 return FALSE;
3028
3029 /* Now compare the next item with the previous opcode. First, handle cases when
3030 the next item is a character. */
3031
3032 if (next >= 0) switch(op_code)
3033 {
3034 case OP_CHAR:
3035 #ifdef SUPPORT_UTF
3036 GETCHARTEST(c, previous);
3037 #else
3038 c = *previous;
3039 #endif
3040 return c != next;
3041
3042 /* For CHARI (caseless character) we must check the other case. If we have
3043 Unicode property support, we can use it to test the other case of
3044 high-valued characters. */
3045
3046 case OP_CHARI:
3047 #ifdef SUPPORT_UTF
3048 GETCHARTEST(c, previous);
3049 #else
3050 c = *previous;
3051 #endif
3052 if (c == next) return FALSE;
3053 #ifdef SUPPORT_UTF
3054 if (utf)
3055 {
3056 unsigned int othercase;
3057 if (next < 128) othercase = cd->fcc[next]; else
3058 #ifdef SUPPORT_UCP
3059 othercase = UCD_OTHERCASE((unsigned int)next);
3060 #else
3061 othercase = NOTACHAR;
3062 #endif
3063 return (unsigned int)c != othercase;
3064 }
3065 else
3066 #endif /* SUPPORT_UTF */
3067 return (c != TABLE_GET(next, cd->fcc, next)); /* Non-UTF-8 mode */
3068
3069 /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
3070 opcodes are not used for multi-byte characters, because they are coded using
3071 an XCLASS instead. */
3072
3073 case OP_NOT:
3074 return (c = *previous) == next;
3075
3076 case OP_NOTI:
3077 if ((c = *previous) == next) return TRUE;
3078 #ifdef SUPPORT_UTF
3079 if (utf)
3080 {
3081 unsigned int othercase;
3082 if (next < 128) othercase = cd->fcc[next]; else
3083 #ifdef SUPPORT_UCP
3084 othercase = UCD_OTHERCASE(next);
3085 #else
3086 othercase = NOTACHAR;
3087 #endif
3088 return (unsigned int)c == othercase;
3089 }
3090 else
3091 #endif /* SUPPORT_UTF */
3092 return (c == TABLE_GET(next, cd->fcc, next)); /* Non-UTF-8 mode */
3093
3094 /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
3095 When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3096
3097 case OP_DIGIT:
3098 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
3099
3100 case OP_NOT_DIGIT:
3101 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
3102
3103 case OP_WHITESPACE:
3104 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
3105
3106 case OP_NOT_WHITESPACE:
3107 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
3108
3109 case OP_WORDCHAR:
3110 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
3111
3112 case OP_NOT_WORDCHAR:
3113 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
3114
3115 case OP_HSPACE:
3116 case OP_NOT_HSPACE:
3117 switch(next)
3118 {
3119 case 0x09:
3120 case 0x20:
3121 case 0xa0:
3122 case 0x1680:
3123 case 0x180e:
3124 case 0x2000:
3125 case 0x2001:
3126 case 0x2002:
3127 case 0x2003:
3128 case 0x2004:
3129 case 0x2005:
3130 case 0x2006:
3131 case 0x2007:
3132 case 0x2008:
3133 case 0x2009:
3134 case 0x200A:
3135 case 0x202f:
3136 case 0x205f:
3137 case 0x3000:
3138 return op_code == OP_NOT_HSPACE;
3139 default:
3140 return op_code != OP_NOT_HSPACE;
3141 }
3142
3143 case OP_ANYNL:
3144 case OP_VSPACE:
3145 case OP_NOT_VSPACE:
3146 switch(next)
3147 {
3148 case 0x0a:
3149 case 0x0b:
3150 case 0x0c:
3151 case 0x0d:
3152 case 0x85:
3153 case 0x2028:
3154 case 0x2029:
3155 return op_code == OP_NOT_VSPACE;
3156 default:
3157 return op_code != OP_NOT_VSPACE;
3158 }
3159
3160 #ifdef SUPPORT_UCP
3161 case OP_PROP:
3162 return check_char_prop(next, previous[0], previous[1], FALSE);
3163
3164 case OP_NOTPROP:
3165 return check_char_prop(next, previous[0], previous[1], TRUE);
3166 #endif
3167
3168 default:
3169 return FALSE;
3170 }
3171
3172
3173 /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
3174 is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
3175 generated only when PCRE_UCP is *not* set, that is, when only ASCII
3176 characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
3177 replaced by OP_PROP codes when PCRE_UCP is set. */
3178
3179 switch(op_code)
3180 {
3181 case OP_CHAR:
3182 case OP_CHARI:
3183 #ifdef SUPPORT_UTF
3184 GETCHARTEST(c, previous);
3185 #else
3186 c = *previous;
3187 #endif
3188 switch(-next)
3189 {
3190 case ESC_d:
3191 return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
3192
3193 case ESC_D:
3194 return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
3195
3196 case ESC_s:
3197 return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
3198
3199 case ESC_S:
3200 return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
3201
3202 case ESC_w:
3203 return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
3204
3205 case ESC_W:
3206 return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
3207
3208 case ESC_h:
3209 case ESC_H:
3210 switch(c)
3211 {
3212 case 0x09:
3213 case 0x20:
3214 case 0xa0:
3215 case 0x1680:
3216 case 0x180e:
3217 case 0x2000:
3218 case 0x2001:
3219 case 0x2002:
3220 case 0x2003:
3221 case 0x2004:
3222 case 0x2005:
3223 case 0x2006:
3224 case 0x2007:
3225 case 0x2008:
3226 case 0x2009:
3227 case 0x200A:
3228 case 0x202f:
3229 case 0x205f:
3230 case 0x3000:
3231 return -next != ESC_h;
3232 default:
3233 return -next == ESC_h;
3234 }
3235
3236 case ESC_v:
3237 case ESC_V:
3238 switch(c)
3239 {
3240 case 0x0a:
3241 case 0x0b:
3242 case 0x0c:
3243 case 0x0d:
3244 case 0x85:
3245 case 0x2028:
3246 case 0x2029:
3247 return -next != ESC_v;
3248 default:
3249 return -next == ESC_v;
3250 }
3251
3252 /* When PCRE_UCP is set, these values get generated for \d etc. Find
3253 their substitutions and process them. The result will always be either
3254 -ESC_p or -ESC_P. Then fall through to process those values. */
3255
3256 #ifdef SUPPORT_UCP
3257 case ESC_du:
3258 case ESC_DU:
3259 case ESC_wu:
3260 case ESC_WU:
3261 case ESC_su:
3262 case ESC_SU:
3263 {
3264 int temperrorcode = 0;
3265 ptr = substitutes[-next - ESC_DU];
3266 next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
3267 if (temperrorcode != 0) return FALSE;
3268 ptr++; /* For compatibility */
3269 }
3270 /* Fall through */
3271
3272 case ESC_p:
3273 case ESC_P:
3274 {
3275 int ptype, pdata, errorcodeptr;
3276 BOOL negated;
3277
3278 ptr--; /* Make ptr point at the p or P */
3279 ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
3280 if (ptype < 0) return FALSE;
3281 ptr++; /* Point past the final curly ket */
3282
3283 /* If the property item is optional, we have to give up. (When generated
3284 from \d etc by PCRE_UCP, this test will have been applied much earlier,
3285 to the original \d etc. At this point, ptr will point to a zero byte. */
3286
3287 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3288 STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3289 return FALSE;
3290
3291 /* Do the property check. */
3292
3293 return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
3294 }
3295 #endif
3296
3297 default:
3298 return FALSE;
3299 }
3300
3301 /* In principle, support for Unicode properties should be integrated here as
3302 well. It means re-organizing the above code so as to get hold of the property
3303 values before switching on the op-code. However, I wonder how many patterns
3304 combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
3305 these op-codes are never generated.) */
3306
3307 case OP_DIGIT:
3308 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
3309 next == -ESC_h || next == -ESC_v || next == -ESC_R;
3310
3311 case OP_NOT_DIGIT:
3312 return next == -ESC_d;
3313
3314 case OP_WHITESPACE:
3315 return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
3316
3317 case OP_NOT_WHITESPACE:
3318 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
3319
3320 case OP_HSPACE:
3321 return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
3322 next == -ESC_w || next == -ESC_v || next == -ESC_R;
3323
3324 case OP_NOT_HSPACE:
3325 return next == -ESC_h;
3326
3327 /* Can't have \S in here because VT matches \S (Perl anomaly) */
3328 case OP_ANYNL:
3329 case OP_VSPACE:
3330 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
3331
3332 case OP_NOT_VSPACE:
3333 return next == -ESC_v || next == -ESC_R;
3334
3335 case OP_WORDCHAR:
3336 return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
3337 next == -ESC_v || next == -ESC_R;
3338
3339 case OP_NOT_WORDCHAR:
3340 return next == -ESC_w || next == -ESC_d;
3341
3342 default:
3343 return FALSE;
3344 }
3345
3346 /* Control does not reach here */
3347 }
3348
3349
3350
3351 /*************************************************
3352 * Compile one branch *
3353 *************************************************/
3354
2355 /* Scan the pattern, compiling it into a vector. If the options are
3356 changed during the branch, the pointer is used to change the external options
3357 bits. This function is used during the pre-compile phase when we are trying
3358 to find out the amount of memory needed, as well as during the real compile
3359 phase. The value of lengthptr distinguishes the two phases.
3360
3361 Arguments:
3362 optionsptr pointer to the option bits
3363 codeptr points to the pointer to the current code point
3364 ptrptr points to the current pattern pointer
3365 errorcodeptr points to error code variable
3366 firstcharptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
3367 reqcharptr set to the last literal character required, else < 0
3368 bcptr points to current branch chain
3369 cond_depth conditional nesting depth
3370 cd contains pointers to tables etc.
3371 lengthptr NULL during the real compile phase
3372 points to length accumulator during pre-compile phase
3373
3374 Returns: TRUE on success
3375 FALSE, with *errorcodeptr set non-zero on error
3376 */
3377
3378 static BOOL
3379 compile_branch(int *optionsptr, pcre_uchar **codeptr,
3380 const pcre_uchar **ptrptr, int *errorcodeptr, pcre_int32 *firstcharptr,
3381 pcre_int32 *reqcharptr, branch_chain *bcptr, int cond_depth,
3382 compile_data *cd, int *lengthptr)
3383 {
3384 int repeat_type, op_type;
3385 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
3386 int bravalue = 0;
3387 int greedy_default, greedy_non_default;
3388 pcre_int32 firstchar, reqchar;
3389 pcre_int32 zeroreqchar, zerofirstchar;
3390 pcre_int32 req_caseopt, reqvary, tempreqvary;
3391 int options = *optionsptr; /* May change dynamically */
3392 int after_manual_callout = 0;
3393 int length_prevgroup = 0;
3394 register int c;
3395 register pcre_uchar *code = *codeptr;
3396 pcre_uchar *last_code = code;
3397 pcre_uchar *orig_code = code;
3398 pcre_uchar *tempcode;
3399 BOOL inescq = FALSE;
3400 BOOL groupsetfirstchar = FALSE;
3401 const pcre_uchar *ptr = *ptrptr;
3402 const pcre_uchar *tempptr;
3403 const pcre_uchar *nestptr = NULL;
3404 pcre_uchar *previous = NULL;
3405 pcre_uchar *previous_callout = NULL;
3406 pcre_uchar *save_hwm = NULL;
3407 pcre_uint8 classbits[32];
3408
3409 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3410 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3411 dynamically as we process the pattern. */
3412
3413 #ifdef SUPPORT_UTF
3414 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3415 BOOL utf = (options & PCRE_UTF8) != 0;
3416 pcre_uchar utf_chars[6];
3417 #else
3418 BOOL utf = FALSE;
3419 #endif
3420
3421 /* Helper variables for OP_XCLASS opcode (for characters > 255). */
3422
3423 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3424 BOOL xclass;
3425 pcre_uchar *class_uchardata;
3426 pcre_uchar *class_uchardata_base;
3427 #endif
3428
3429 #ifdef PCRE_DEBUG
3430 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
3431 #endif
3432
3433 /* Set up the default and non-default settings for greediness */
3434
3435 greedy_default = ((options & PCRE_UNGREEDY) != 0);
3436 greedy_non_default = greedy_default ^ 1;
3437
3438 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
3439 matching encountered yet". It gets changed to REQ_NONE if we hit something that
3440 matches a non-fixed char first char; reqchar just remains unset if we never
3441 find one.
3442
3443 When we hit a repeat whose minimum is zero, we may have to adjust these values
3444 to take the zero repeat into account. This is implemented by setting them to
3445 zerofirstbyte and zeroreqchar when such a repeat is encountered. The individual
3446 item types that can be repeated set these backoff variables appropriately. */
3447
3448 firstchar = reqchar = zerofirstchar = zeroreqchar = REQ_UNSET;
3449
3450 /* The variable req_caseopt contains either the REQ_CASELESS value
3451 or zero, according to the current setting of the caseless flag. The
3452 REQ_CASELESS leaves the lower 28 bit empty. It is added into the
3453 firstchar or reqchar variables to record the case status of the
3454 value. This is used only for ASCII characters. */
3455
3456 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
3457
3458 /* Switch on next character until the end of the branch */
3459
3460 for (;; ptr++)
3461 {
3462 BOOL negate_class;
3463 BOOL should_flip_negation;
3464 BOOL possessive_quantifier;
3465 BOOL is_quantifier;
3466 BOOL is_recurse;
3467 BOOL reset_bracount;
3468 int class_has_8bitchar;
3469 int class_single_char;
3470 int newoptions;
3471 int recno;
3472 int refsign;
3473 int skipbytes;
3474 int subreqchar;
3475 int subfirstchar;
3476 int terminator;
3477 int mclength;
3478 int tempbracount;
3479 pcre_uchar mcbuffer[8];
3480
3481 /* Get next character in the pattern */
3482
3483 c = *ptr;
3484
3485 /* If we are at the end of a nested substitution, revert to the outer level
3486 string. Nesting only happens one level deep. */
3487
3488 if (c == 0 && nestptr != NULL)
3489 {
3490 ptr = nestptr;
3491 nestptr = NULL;
3492 c = *ptr;
3493 }
3494
3495 /* If we are in the pre-compile phase, accumulate the length used for the
3496 previous cycle of this loop. */
3497
3498 if (lengthptr != NULL)
3499 {
3500 #ifdef PCRE_DEBUG
3501 if (code > cd->hwm) cd->hwm = code; /* High water info */
3502 #endif
3503 if (code > cd->start_workspace + cd->workspace_size -
3504 WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
3505 {
3506 *errorcodeptr = ERR52;
3507 goto FAILED;
3508 }
3509
3510 /* There is at least one situation where code goes backwards: this is the
3511 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
3512 the class is simply eliminated. However, it is created first, so we have to
3513 allow memory for it. Therefore, don't ever reduce the length at this point.
3514 */
3515
3516 if (code < last_code) code = last_code;
3517
3518 /* Paranoid check for integer overflow */
3519
3520 if (OFLOW_MAX - *lengthptr < code - last_code)
3521 {
3522 *errorcodeptr = ERR20;
3523 goto FAILED;
3524 }
3525
3526 *lengthptr += (int)(code - last_code);
3527 DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
3528 (int)(code - last_code), c, c));
3529
3530 /* If "previous" is set and it is not at the start of the work space, move
3531 it back to there, in order to avoid filling up the work space. Otherwise,
3532 if "previous" is NULL, reset the current code pointer to the start. */
3533
3534 if (previous != NULL)
3535 {
3536 if (previous > orig_code)
3537 {
3538 memmove(orig_code, previous, IN_UCHARS(code - previous));
3539 code -= previous - orig_code;
3540 previous = orig_code;
3541 }
3542 }
3543 else code = orig_code;
3544
3545 /* Remember where this code item starts so we can pick up the length
3546 next time round. */
3547
3548 last_code = code;
3549 }
3550
3551 /* In the real compile phase, just check the workspace used by the forward
3552 reference list. */
3553
3554 else if (cd->hwm > cd->start_workspace + cd->workspace_size -
3555 WORK_SIZE_SAFETY_MARGIN)
3556 {
3557 *errorcodeptr = ERR52;
3558 goto FAILED;
3559 }
3560
3561 /* If in \Q...\E, check for the end; if not, we have a literal */
3562
3563 if (inescq && c != 0)
3564 {
3565 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3566 {
3567 inescq = FALSE;
3568 ptr++;
3569 continue;
3570 }
3571 else
3572 {
3573 if (previous_callout != NULL)
3574 {
3575 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3576 complete_callout(previous_callout, ptr, cd);
3577 previous_callout = NULL;
3578 }
3579 if ((options & PCRE_AUTO_CALLOUT) != 0)
3580 {
3581 previous_callout = code;
3582 code = auto_callout(code, ptr, cd);
3583 }
3584 goto NORMAL_CHAR;
3585 }
3586 }
3587
3588 /* Fill in length of a previous callout, except when the next thing is
3589 a quantifier. */
3590
3591 is_quantifier =
3592 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
3593 (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
3594
3595 if (!is_quantifier && previous_callout != NULL &&
3596 after_manual_callout-- <= 0)
3597 {
3598 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3599 complete_callout(previous_callout, ptr, cd);
3600 previous_callout = NULL;
3601 }
3602
3603 /* In extended mode, skip white space and comments. */
3604
3605 if ((options & PCRE_EXTENDED) != 0)
3606 {
3607 if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;
3608 if (c == CHAR_NUMBER_SIGN)
3609 {
3610 ptr++;
3611 while (*ptr != 0)
3612 {
3613 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3614 ptr++;
3615 #ifdef SUPPORT_UTF
3616 if (utf) FORWARDCHAR(ptr);
3617 #endif
3618 }
3619 if (*ptr != 0) continue;
3620
3621 /* Else fall through to handle end of string */
3622 c = 0;
3623 }
3624 }
3625
3626 /* No auto callout for quantifiers. */
3627
3628 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
3629 {
3630 previous_callout = code;
3631 code = auto_callout(code, ptr, cd);
3632 }
3633
3634 switch(c)
3635 {
3636 /* ===================================================================*/
3637 case 0: /* The branch terminates at string end */
3638 case CHAR_VERTICAL_LINE: /* or | or ) */
3639 case CHAR_RIGHT_PARENTHESIS:
3640 *firstcharptr = firstchar;
3641 *reqcharptr = reqchar;
3642 *codeptr = code;
3643 *ptrptr = ptr;
3644 if (lengthptr != NULL)
3645 {
3646 if (OFLOW_MAX - *lengthptr < code - last_code)
3647 {
3648 *errorcodeptr = ERR20;
3649 goto FAILED;
3650 }
3651 *lengthptr += (int)(code - last_code); /* To include callout length */
3652 DPRINTF((">> end branch\n"));
3653 }
3654 return TRUE;
3655
3656
3657 /* ===================================================================*/
3658 /* Handle single-character metacharacters. In multiline mode, ^ disables
3659 the setting of any following char as a first character. */
3660
3661 case CHAR_CIRCUMFLEX_ACCENT:
3662 previous = NULL;
3663 if ((options & PCRE_MULTILINE) != 0)
3664 {
3665 if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3666 *code++ = OP_CIRCM;
3667 }
3668 else *code++ = OP_CIRC;
3669 break;
3670
3671 case CHAR_DOLLAR_SIGN:
3672 previous = NULL;
3673 *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
3674 break;
3675
3676 /* There can never be a first char if '.' is first, whatever happens about
3677 repeats. The value of reqchar doesn't change either. */
3678
3679 case CHAR_DOT:
3680 if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3681 zerofirstchar = firstchar;
3682 zeroreqchar = reqchar;
3683 previous = code;
3684 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3685 break;
3686
3687
3688 /* ===================================================================*/
3689 /* Character classes. If the included characters are all < 256, we build a
3690 32-byte bitmap of the permitted characters, except in the special case
3691 where there is only one such character. For negated classes, we build the
3692 map as usual, then invert it at the end. However, we use a different opcode
3693 so that data characters > 255 can be handled correctly.
3694
3695 If the class contains characters outside the 0-255 range, a different
3696 opcode is compiled. It may optionally have a bit map for characters < 256,
3697 but those above are are explicitly listed afterwards. A flag byte tells
3698 whether the bitmap is present, and whether this is a negated class or not.
3699
3700 In JavaScript compatibility mode, an isolated ']' causes an error. In
3701 default (Perl) mode, it is treated as a data character. */
3702
3703 case CHAR_RIGHT_SQUARE_BRACKET:
3704 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3705 {
3706 *errorcodeptr = ERR64;
3707 goto FAILED;
3708 }
3709 goto NORMAL_CHAR;
3710
3711 case CHAR_LEFT_SQUARE_BRACKET:
3712 previous = code;
3713
3714 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3715 they are encountered at the top level, so we'll do that too. */
3716
3717 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3718 ptr[1] == CHAR_EQUALS_SIGN) &&
3719 check_posix_syntax(ptr, &tempptr))
3720 {
3721 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3722 goto FAILED;
3723 }
3724
3725 /* If the first character is '^', set the negation flag and skip it. Also,
3726 if the first few characters (either before or after ^) are \Q\E or \E we
3727 skip them too. This makes for compatibility with Perl. */
3728
3729 negate_class = FALSE;
3730 for (;;)
3731 {
3732 c = *(++ptr);
3733 if (c == CHAR_BACKSLASH)
3734 {
3735 if (ptr[1] == CHAR_E)
3736 ptr++;
3737 else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3738 ptr += 3;
3739 else
3740 break;
3741 }
3742 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3743 negate_class = TRUE;
3744 else break;
3745 }
3746
3747 /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3748 an initial ']' is taken as a data character -- the code below handles
3749 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3750 [^] must match any character, so generate OP_ALLANY. */
3751
3752 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3753 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3754 {
3755 *code++ = negate_class? OP_ALLANY : OP_FAIL;
3756 if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3757 zerofirstchar = firstchar;
3758 break;
3759 }
3760
3761 /* If a class contains a negative special such as \S, we need to flip the
3762 negation flag at the end, so that support for characters > 255 works
3763 correctly (they are all included in the class). */
3764
3765 should_flip_negation = FALSE;
3766
3767 /* For optimization purposes, we track some properties of the class.
3768 class_has_8bitchar will be non-zero, if the class contains at least one
3769 < 256 character. class_single_char will be 1 if the class contains only
3770 a single character. */
3771
3772 class_has_8bitchar = 0;
3773 class_single_char = 0;
3774
3775 /* Initialize the 32-char bit map to all zeros. We build the map in a
3776 temporary bit of memory, in case the class contains only 1 character (less
3777 than 256), because in that case the compiled code doesn't use the bit map.
3778 */
3779
3780 memset(classbits, 0, 32 * sizeof(pcre_uint8));
3781
3782 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3783 xclass = FALSE; /* No chars >= 256 */
3784 class_uchardata = code + LINK_SIZE + 2; /* For UTF-8 items */
3785 class_uchardata_base = class_uchardata; /* For resetting in pass 1 */
3786 #endif
3787
3788 /* Process characters until ] is reached. By writing this as a "do" it
3789 means that an initial ] is taken as a data character. At the start of the
3790 loop, c contains the first byte of the character. */
3791
3792 if (c != 0) do
3793 {
3794 const pcre_uchar *oldptr;
3795
3796 #ifdef SUPPORT_UTF
3797 if (utf && HAS_EXTRALEN(c))
3798 { /* Braces are required because the */
3799 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3800 }
3801 #endif
3802
3803 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3804 /* In the pre-compile phase, accumulate the length of any extra
3805 data and reset the pointer. This is so that very large classes that
3806 contain a zillion > 255 characters no longer overwrite the work space
3807 (which is on the stack). */
3808
3809 if (lengthptr != NULL)
3810 {
3811 *lengthptr += class_uchardata - class_uchardata_base;
3812 class_uchardata = class_uchardata_base;
3813 }
3814 #endif
3815
3816 /* Inside \Q...\E everything is literal except \E */
3817
3818 if (inescq)
3819 {
3820 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3821 {
3822 inescq = FALSE; /* Reset literal state */
3823 ptr++; /* Skip the 'E' */
3824 continue; /* Carry on with next */
3825 }
3826 goto CHECK_RANGE; /* Could be range if \E follows */
3827 }
3828
3829 /* Handle POSIX class names. Perl allows a negation extension of the
3830 form [:^name:]. A square bracket that doesn't match the syntax is
3831 treated as a literal. We also recognize the POSIX constructions
3832 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3833 5.6 and 5.8 do. */
3834
3835 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3836 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3837 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3838 {
3839 BOOL local_negate = FALSE;
3840 int posix_class, taboffset, tabopt;
3841 register const pcre_uint8 *cbits = cd->cbits;
3842 pcre_uint8 pbits[32];
3843
3844 if (ptr[1] != CHAR_COLON)
3845 {
3846 *errorcodeptr = ERR31;
3847 goto FAILED;
3848 }
3849
3850 ptr += 2;
3851 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3852 {
3853 local_negate = TRUE;
3854 should_flip_negation = TRUE; /* Note negative special */
3855 ptr++;
3856 }
3857
3858 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3859 if (posix_class < 0)
3860 {
3861 *errorcodeptr = ERR30;
3862 goto FAILED;
3863 }
3864
3865 /* If matching is caseless, upper and lower are converted to
3866 alpha. This relies on the fact that the class table starts with
3867 alpha, lower, upper as the first 3 entries. */
3868
3869 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3870 posix_class = 0;
3871
3872 /* When PCRE_UCP is set, some of the POSIX classes are converted to
3873 different escape sequences that use Unicode properties. */
3874
3875 #ifdef SUPPORT_UCP
3876 if ((options & PCRE_UCP) != 0)
3877 {
3878 int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
3879 if (posix_substitutes[pc] != NULL)
3880 {
3881 nestptr = tempptr + 1;
3882 ptr = posix_substitutes[pc] - 1;
3883 continue;
3884 }
3885 }
3886 #endif
3887 /* In the non-UCP case, we build the bit map for the POSIX class in a
3888 chunk of local store because we may be adding and subtracting from it,
3889 and we don't want to subtract bits that may be in the main map already.
3890 At the end we or the result into the bit map that is being built. */
3891
3892 posix_class *= 3;
3893
3894 /* Copy in the first table (always present) */
3895
3896 memcpy(pbits, cbits + posix_class_maps[posix_class],
3897 32 * sizeof(pcre_uint8));
3898
3899 /* If there is a second table, add or remove it as required. */
3900
3901 taboffset = posix_class_maps[posix_class + 1];
3902 tabopt = posix_class_maps[posix_class + 2];
3903
3904 if (taboffset >= 0)
3905 {
3906 if (tabopt >= 0)
3907 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3908 else
3909 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3910 }
3911
3912 /* Not see if we need to remove any special characters. An option
3913 value of 1 removes vertical space and 2 removes underscore. */
3914
3915 if (tabopt < 0) tabopt = -tabopt;
3916 if (tabopt == 1) pbits[1] &= ~0x3c;
3917 else if (tabopt == 2) pbits[11] &= 0x7f;
3918
3919 /* Add the POSIX table or its complement into the main table that is
3920 being built and we are done. */
3921
3922 if (local_negate)
3923 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3924 else
3925 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3926
3927 ptr = tempptr + 1;
3928 /* Every class contains at least one < 256 characters. */
3929 class_has_8bitchar = 1;
3930 /* Every class contains at least two characters. */
3931 class_single_char = 2;
3932 continue; /* End of POSIX syntax handling */
3933 }
3934
3935 /* Backslash may introduce a single character, or it may introduce one
3936 of the specials, which just set a flag. The sequence \b is a special
3937 case. Inside a class (and only there) it is treated as backspace. We
3938 assume that other escapes have more than one character in them, so
3939 speculatively set both class_has_8bitchar and class_single_char bigger
3940 than one. Unrecognized escapes fall through and are either treated
3941 as literal characters (by default), or are faulted if
3942 PCRE_EXTRA is set. */
3943
3944 if (c == CHAR_BACKSLASH)
3945 {
3946 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3947 if (*errorcodeptr != 0) goto FAILED;
3948
3949 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3950 else if (-c == ESC_N) /* \N is not supported in a class */
3951 {
3952 *errorcodeptr = ERR71;
3953 goto FAILED;
3954 }
3955 else if (-c == ESC_Q) /* Handle start of quoted string */
3956 {
3957 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3958 {
3959 ptr += 2; /* avoid empty string */
3960 }
3961 else inescq = TRUE;
3962 continue;
3963 }
3964 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3965
3966 if (c < 0)
3967 {
3968 register const pcre_uint8 *cbits = cd->cbits;
3969 /* Every class contains at least two < 256 characters. */
3970 class_has_8bitchar++;
3971 /* Every class contains at least two characters. */
3972 class_single_char += 2;
3973
3974 switch (-c)
3975 {
3976 #ifdef SUPPORT_UCP
3977 case ESC_du: /* These are the values given for \d etc */
3978 case ESC_DU: /* when PCRE_UCP is set. We replace the */
3979 case ESC_wu: /* escape sequence with an appropriate \p */
3980 case ESC_WU: /* or \P to test Unicode properties instead */
3981 case ESC_su: /* of the default ASCII testing. */
3982 case ESC_SU:
3983 nestptr = ptr;
3984 ptr = substitutes[-c - ESC_DU] - 1; /* Just before substitute */
3985 class_has_8bitchar--; /* Undo! */
3986 continue;
3987 #endif
3988 case ESC_d:
3989 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3990 continue;
3991
3992 case ESC_D:
3993 should_flip_negation = TRUE;
3994 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3995 continue;
3996
3997 case ESC_w:
3998 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3999 continue;
4000
4001 case ESC_W:
4002 should_flip_negation = TRUE;
4003 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
4004 continue;
4005
4006 /* Perl 5.004 onwards omits VT from \s, but we must preserve it
4007 if it was previously set by something earlier in the character
4008 class. */
4009
4010 case ESC_s:
4011 classbits[0] |= cbits[cbit_space];
4012 classbits[1] |= cbits[cbit_space+1] & ~0x08;
4013 for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
4014 continue;
4015
4016 case ESC_S:
4017 should_flip_negation = TRUE;
4018 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
4019 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
4020 continue;
4021
4022 case ESC_h:
4023 SETBIT(classbits, 0x09); /* VT */
4024 SETBIT(classbits, 0x20); /* SPACE */
4025 SETBIT(classbits, 0xa0); /* NSBP */
4026 #ifndef COMPILE_PCRE8
4027 xclass = TRUE;
4028 *class_uchardata++ = XCL_SINGLE;
4029 *class_uchardata++ = 0x1680;
4030 *class_uchardata++ = XCL_SINGLE;
4031 *class_uchardata++ = 0x180e;
4032 *class_uchardata++ = XCL_RANGE;
4033 *class_uchardata++ = 0x2000;
4034 *class_uchardata++ = 0x200a;
4035 *class_uchardata++ = XCL_SINGLE;
4036 *class_uchardata++ = 0x202f;
4037 *class_uchardata++ = XCL_SINGLE;
4038 *class_uchardata++ = 0x205f;
4039 *class_uchardata++ = XCL_SINGLE;
4040 *class_uchardata++ = 0x3000;
4041 #elif defined SUPPORT_UTF
4042 if (utf)
4043 {
4044 xclass = TRUE;
4045 *class_uchardata++ = XCL_SINGLE;
4046 class_uchardata += PRIV(ord2utf)(0x1680, class_uchardata);
4047 *class_uchardata++ = XCL_SINGLE;
4048 class_uchardata += PRIV(ord2utf)(0x180e, class_uchardata);
4049 *class_uchardata++ = XCL_RANGE;
4050 class_uchardata += PRIV(ord2utf)(0x2000, class_uchardata);
4051 class_uchardata += PRIV(ord2utf)(0x200a, class_uchardata);
4052 *class_uchardata++ = XCL_SINGLE;
4053 class_uchardata += PRIV(ord2utf)(0x202f, class_uchardata);
4054 *class_uchardata++ = XCL_SINGLE;
4055 class_uchardata += PRIV(ord2utf)(0x205f, class_uchardata);
4056 *class_uchardata++ = XCL_SINGLE;
4057 class_uchardata += PRIV(ord2utf)(0x3000, class_uchardata);
4058 }
4059 #endif
4060 continue;
4061
4062 case ESC_H:
4063 for (c = 0; c < 32; c++)
4064 {
4065 int x = 0xff;
4066 switch (c)
4067 {
4068 case 0x09/8: x ^= 1 << (0x09%8); break;
4069 case 0x20/8: x ^= 1 << (0x20%8); break;
4070 case 0xa0/8: x ^= 1 << (0xa0%8); break;
4071 default: break;
4072 }
4073 classbits[c] |= x;
4074 }
4075 #ifndef COMPILE_PCRE8
4076 xclass = TRUE;
4077 *class_uchardata++ = XCL_RANGE;
4078 *class_uchardata++ = 0x0100;
4079 *class_uchardata++ = 0x167f;
4080 *class_uchardata++ = XCL_RANGE;
4081 *class_uchardata++ = 0x1681;
4082 *class_uchardata++ = 0x180d;
4083 *class_uchardata++ = XCL_RANGE;
4084 *class_uchardata++ = 0x180f;
4085 *class_uchardata++ = 0x1fff;
4086 *class_uchardata++ = XCL_RANGE;
4087 *class_uchardata++ = 0x200b;
4088 *class_uchardata++ = 0x202e;
4089 *class_uchardata++ = XCL_RANGE;
4090 *class_uchardata++ = 0x2030;
4091 *class_uchardata++ = 0x205e;
4092 *class_uchardata++ = XCL_RANGE;
4093 *class_uchardata++ = 0x2060;
4094 *class_uchardata++ = 0x2fff;
4095 *class_uchardata++ = XCL_RANGE;
4096 *class_uchardata++ = 0x3001;
4097 #ifdef SUPPORT_UTF
4098 if (utf)
4099 class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4100 else
4101 #endif
4102 *class_uchardata++ = 0xffff;
4103 #elif defined SUPPORT_UTF
4104 if (utf)
4105 {
4106 xclass = TRUE;
4107 *class_uchardata++ = XCL_RANGE;
4108 class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
4109 class_uchardata += PRIV(ord2utf)(0x167f, class_uchardata);
4110 *class_uchardata++ = XCL_RANGE;
4111 class_uchardata += PRIV(ord2utf)(0x1681, class_uchardata);
4112 class_uchardata += PRIV(ord2utf)(0x180d, class_uchardata);
4113 *class_uchardata++ = XCL_RANGE;
4114 class_uchardata += PRIV(ord2utf)(0x180f, class_uchardata);
4115 class_uchardata += PRIV(ord2utf)(0x1fff, class_uchardata);
4116 *class_uchardata++ = XCL_RANGE;
4117 class_uchardata += PRIV(ord2utf)(0x200b, class_uchardata);
4118 class_uchardata += PRIV(ord2utf)(0x202e, class_uchardata);
4119 *class_uchardata++ = XCL_RANGE;
4120 class_uchardata += PRIV(ord2utf)(0x2030, class_uchardata);
4121 class_uchardata += PRIV(ord2utf)(0x205e, class_uchardata);
4122 *class_uchardata++ = XCL_RANGE;
4123 class_uchardata += PRIV(ord2utf)(0x2060, class_uchardata);
4124 class_uchardata += PRIV(ord2utf)(0x2fff, class_uchardata);
4125 *class_uchardata++ = XCL_RANGE;
4126 class_uchardata += PRIV(ord2utf)(0x3001, class_uchardata);
4127 class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4128 }
4129 #endif
4130 continue;
4131
4132 case ESC_v:
4133 SETBIT(classbits, 0x0a); /* LF */
4134 SETBIT(classbits, 0x0b); /* VT */
4135 SETBIT(classbits, 0x0c); /* FF */
4136 SETBIT(classbits, 0x0d); /* CR */
4137 SETBIT(classbits, 0x85); /* NEL */
4138 #ifndef COMPILE_PCRE8
4139 xclass = TRUE;
4140 *class_uchardata++ = XCL_RANGE;
4141 *class_uchardata++ = 0x2028;
4142 *class_uchardata++ = 0x2029;
4143 #elif defined SUPPORT_UTF
4144 if (utf)
4145 {
4146 xclass = TRUE;
4147 *class_uchardata++ = XCL_RANGE;
4148 class_uchardata += PRIV(ord2utf)(0x2028, class_uchardata);
4149 class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata);
4150 }
4151 #endif
4152 continue;
4153
4154 case ESC_V:
4155 for (c = 0; c < 32; c++)
4156 {
4157 int x = 0xff;
4158 switch (c)
4159 {
4160 case 0x0a/8: x ^= 1 << (0x0a%8);
4161 x ^= 1 << (0x0b%8);
4162 x ^= 1 << (0x0c%8);
4163 x ^= 1 << (0x0d%8);
4164 break;
4165 case 0x85/8: x ^= 1 << (0x85%8); break;
4166 default: break;
4167 }
4168 classbits[c] |= x;
4169 }
4170
4171 #ifndef COMPILE_PCRE8
4172 xclass = TRUE;
4173 *class_uchardata++ = XCL_RANGE;
4174 *class_uchardata++ = 0x0100;
4175 *class_uchardata++ = 0x2027;
4176 *class_uchardata++ = XCL_RANGE;
4177 *class_uchardata++ = 0x202a;
4178 #ifdef SUPPORT_UTF
4179 if (utf)
4180 class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4181 else
4182 #endif
4183 *class_uchardata++ = 0xffff;
4184 #elif defined SUPPORT_UTF
4185 if (utf)
4186 {
4187 xclass = TRUE;
4188 *class_uchardata++ = XCL_RANGE;
4189 class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
4190 class_uchardata += PRIV(ord2utf)(0x2027, class_uchardata);
4191 *class_uchardata++ = XCL_RANGE;
4192 class_uchardata += PRIV(ord2utf)(0x202a, class_uchardata);
4193 class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4194 }
4195 #endif
4196 continue;
4197
4198 #ifdef SUPPORT_UCP
4199 case ESC_p:
4200 case ESC_P:
4201 {
4202 BOOL negated;
4203 int pdata;
4204 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4205 if (ptype < 0) goto FAILED;
4206 xclass = TRUE;
4207 *class_uchardata++ = ((-c == ESC_p) != negated)?
4208 XCL_PROP : XCL_NOTPROP;
4209 *class_uchardata++ = ptype;
4210 *class_uchardata++ = pdata;
4211 class_has_8bitchar--; /* Undo! */
4212 continue;
4213 }
4214 #endif
4215 /* Unrecognized escapes are faulted if PCRE is running in its
4216 strict mode. By default, for compatibility with Perl, they are
4217 treated as literals. */
4218
4219 default:
4220 if ((options & PCRE_EXTRA) != 0)
4221 {
4222 *errorcodeptr = ERR7;
4223 goto FAILED;
4224 }
4225 class_has_8bitchar--; /* Undo the speculative increase. */
4226 class_single_char -= 2; /* Undo the speculative increase. */
4227 c = *ptr; /* Get the final character and fall through */
4228 break;
4229 }
4230 }
4231
4232 /* Fall through if we have a single character (c >= 0). This may be
4233 greater than 256. */
4234
4235 } /* End of backslash handling */
4236
4237 /* A single character may be followed by '-' to form a range. However,
4238 Perl does not permit ']' to be the end of the range. A '-' character
4239 at the end is treated as a literal. Perl ignores orphaned \E sequences
4240 entirely. The code for handling \Q and \E is messy. */
4241
4242 CHECK_RANGE:
4243 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4244 {
4245 inescq = FALSE;
4246 ptr += 2;
4247 }
4248
4249 oldptr = ptr;
4250
4251 /* Remember \r or \n */
4252
4253 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4254
4255 /* Check for range */
4256
4257 if (!inescq && ptr[1] == CHAR_MINUS)
4258 {
4259 int d;
4260 ptr += 2;
4261 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
4262
4263 /* If we hit \Q (not followed by \E) at this point, go into escaped
4264 mode. */
4265
4266 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
4267 {
4268 ptr += 2;
4269 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4270 { ptr += 2; continue; }
4271 inescq = TRUE;
4272 break;
4273 }
4274
4275 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
4276 {
4277 ptr = oldptr;
4278 goto LONE_SINGLE_CHARACTER;
4279 }
4280
4281 #ifdef SUPPORT_UTF
4282 if (utf)
4283 { /* Braces are required because the */
4284 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
4285 }
4286 else
4287 #endif
4288 d = *ptr; /* Not UTF-8 mode */
4289
4290 /* The second part of a range can be a single-character escape, but
4291 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
4292 in such circumstances. */
4293
4294 if (!inescq && d == CHAR_BACKSLASH)
4295 {
4296 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
4297 if (*errorcodeptr != 0) goto FAILED;
4298
4299 /* \b is backspace; any other special means the '-' was literal */
4300
4301 if (d < 0)
4302 {
4303 if (d == -ESC_b) d = CHAR_BS; else
4304 {
4305 ptr = oldptr;
4306 goto LONE_SINGLE_CHARACTER; /* A few lines below */
4307 }
4308 }
4309 }
4310
4311 /* Check that the two values are in the correct order. Optimize
4312 one-character ranges */
4313
4314 if (d < c)
4315 {
4316 *errorcodeptr = ERR8;
4317 goto FAILED;
4318 }
4319
4320 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
4321
4322 /* Remember \r or \n */
4323
4324 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4325
4326 /* Since we found a character range, single character optimizations
4327 cannot be done anymore. */
4328 class_single_char = 2;
4329
4330 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
4331 matching, we have to use an XCLASS with extra data items. Caseless
4332 matching for characters > 127 is available only if UCP support is
4333 available. */
4334
4335 #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4336 if ((d > 255) || (utf && ((options & PCRE_CASELESS) != 0 && d > 127)))
4337 #elif defined SUPPORT_UTF
4338 if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4339 #elif !(defined COMPILE_PCRE8)
4340 if (d > 255)
4341 #endif
4342 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4343 {
4344 xclass = TRUE;
4345
4346 /* With UCP support, we can find the other case equivalents of
4347 the relevant characters. There may be several ranges. Optimize how
4348 they fit with the basic range. */
4349
4350 #ifdef SUPPORT_UCP
4351 #ifndef COMPILE_PCRE8
4352 if (utf && (options & PCRE_CASELESS) != 0)
4353 #else
4354 if ((options & PCRE_CASELESS) != 0)
4355 #endif
4356 {
4357 unsigned int occ, ocd;
4358 unsigned int cc = c;
4359 unsigned int origd = d;
4360 while (get_othercase_range(&cc, origd, &occ, &ocd))
4361 {
4362 if (occ >= (unsigned int)c &&
4363 ocd <= (unsigned int)d)
4364 continue; /* Skip embedded ranges */
4365
4366 if (occ < (unsigned int)c &&
4367 ocd >= (unsigned int)c - 1) /* Extend the basic range */
4368 { /* if there is overlap, */
4369 c = occ; /* noting that if occ < c */
4370 continue; /* we can't have ocd > d */
4371 } /* because a subrange is */
4372 if (ocd > (unsigned int)d &&
4373 occ <= (unsigned int)d + 1) /* always shorter than */
4374 { /* the basic range. */
4375 d = ocd;
4376 continue;
4377 }
4378
4379 if (occ == ocd)
4380 {
4381 *class_uchardata++ = XCL_SINGLE;
4382 }
4383 else
4384 {
4385 *class_uchardata++ = XCL_RANGE;
4386 class_uchardata += PRIV(ord2utf)(occ, class_uchardata);
4387 }
4388 class_uchardata += PRIV(ord2utf)(ocd, class_uchardata);
4389 }
4390 }
4391 #endif /* SUPPORT_UCP */
4392
4393 /* Now record the original range, possibly modified for UCP caseless
4394 overlapping ranges. */
4395
4396 *class_uchardata++ = XCL_RANGE;
4397 #ifdef SUPPORT_UTF
4398 #ifndef COMPILE_PCRE8
4399 if (utf)
4400 {
4401 class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4402 class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4403 }
4404 else
4405 {
4406 *class_uchardata++ = c;
4407 *class_uchardata++ = d;
4408 }
4409 #else
4410 class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4411 class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4412 #endif
4413 #else /* SUPPORT_UTF */
4414 *class_uchardata++ = c;
4415 *class_uchardata++ = d;
4416 #endif /* SUPPORT_UTF */
4417
4418 /* With UCP support, we are done. Without UCP support, there is no
4419 caseless matching for UTF characters > 127; we can use the bit map
4420 for the smaller ones. As for 16 bit characters without UTF, we
4421 can still use the bit map for those whose values are below 256. */
4422
4423 #ifdef SUPPORT_UCP
4424 #ifndef COMPILE_PCRE8
4425 if (utf)
4426 #endif
4427 continue; /* With next character in the class */
4428 #endif /* SUPPORT_UCP */
4429
4430 #if defined SUPPORT_UTF && !defined(SUPPORT_UCP) && !(defined COMPILE_PCRE8)
4431 if (utf)
4432 {
4433 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4434 /* Adjust upper limit and fall through to set up the map */
4435 d = 127;
4436 }
4437 else
4438 {
4439 if (c > 255) continue;
4440 /* Adjust upper limit and fall through to set up the map */
4441 d = 255;
4442 }
4443 #elif defined SUPPORT_UTF && !defined(SUPPORT_UCP)
4444 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4445 /* Adjust upper limit and fall through to set up the map */
4446 d = 127;
4447 #else
4448 if (c > 255) continue;
4449 /* Adjust upper limit and fall through to set up the map */
4450 d = 255;
4451 #endif /* SUPPORT_UTF && !SUPPORT_UCP && !COMPILE_PCRE8 */
4452 }
4453 #endif /* SUPPORT_UTF || !COMPILE_PCRE8 */
4454
4455 /* We use the bit map for 8 bit mode, or when the characters fall
4456 partially or entirely to [0-255] ([0-127] for UCP) ranges. */
4457
4458 class_has_8bitchar = 1;
4459
4460 /* We can save a bit of time by skipping this in the pre-compile. */
4461
4462 if (lengthptr == NULL) for (; c <= d; c++)
4463 {
4464 classbits[c/8] |= (1 << (c&7));
4465 if ((options & PCRE_CASELESS) != 0)
4466 {
4467 int uc = cd->fcc[c]; /* flip case */
4468 classbits[uc/8] |= (1 << (uc&7));
4469 }
4470 }
4471
4472 continue; /* Go get the next char in the class */
4473 }
4474
4475 /* Handle a lone single character - we can get here for a normal
4476 non-escape char, or after \ that introduces a single character or for an
4477 apparent range that isn't. */
4478
4479 LONE_SINGLE_CHARACTER:
4480
4481 /* Only the value of 1 matters for class_single_char. */
4482 if (class_single_char < 2) class_single_char++;
4483
4484 /* If class_single_char is 1, we saw precisely one character. As long as
4485 there were no negated characters >= 128 and there was no use of \p or \P,
4486 in other words, no use of any XCLASS features, we can optimize.
4487
4488 In UTF-8 mode, we can optimize the negative case only if there were no
4489 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
4490 operate on single-byte characters only. This is an historical hangover.
4491 Maybe one day we can tidy these opcodes to handle multi-byte characters.
4492
4493 The optimization throws away the bit map. We turn the item into a
4494 1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
4495 Note that OP_NOT[I] does not support multibyte characters. In the positive
4496 case, it can cause firstchar to be set. Otherwise, there can be no first
4497 char if this item is first, whatever repeat count may follow. In the case
4498 of reqchar, save the previous value for reinstating. */
4499
4500 #ifdef SUPPORT_UTF
4501 if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET
4502 && (!utf || !negate_class || c < (MAX_VALUE_FOR_SINGLE_CHAR + 1)))
4503 #else
4504 if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
4505 #endif
4506 {
4507 ptr++;
4508 zeroreqchar = reqchar;
4509
4510 /* The OP_NOT[I] opcodes work on single characters only. */
4511
4512 if (negate_class)
4513 {
4514 if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4515 zerofirstchar = firstchar;
4516 *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4517 *code++ = c;
4518 goto NOT_CHAR;
4519 }
4520
4521 /* For a single, positive character, get the value into mcbuffer, and
4522 then we can handle this with the normal one-character code. */
4523
4524 #ifdef SUPPORT_UTF
4525 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4526 mclength = PRIV(ord2utf)(c, mcbuffer);
4527 else
4528 #endif
4529 {
4530 mcbuffer[0] = c;
4531 mclength = 1;
4532 }
4533 goto ONE_CHAR;
4534 } /* End of 1-char optimization */
4535
4536 /* Handle a character that cannot go in the bit map. */
4537
4538 #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4539 if ((c > 255) || (utf && ((options & PCRE_CASELESS) != 0 && c > 127)))
4540 #elif defined SUPPORT_UTF
4541 if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4542 #elif !(defined COMPILE_PCRE8)
4543 if (c > 255)
4544 #endif
4545
4546 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4547 {
4548 xclass = TRUE;
4549 *class_uchardata++ = XCL_SINGLE;
4550 #ifdef SUPPORT_UTF
4551 #ifndef COMPILE_PCRE8
4552 /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
4553 if (!utf)
4554 *class_uchardata++ = c;
4555 else
4556 #endif
4557 class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4558 #else /* SUPPORT_UTF */
4559 *class_uchardata++ = c;
4560 #endif /* SUPPORT_UTF */
4561
4562 #ifdef SUPPORT_UCP
4563 #ifdef COMPILE_PCRE8
4564 if ((options & PCRE_CASELESS) != 0)
4565 #else
4566 /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
4567 if (utf && (options & PCRE_CASELESS) != 0)
4568 #endif
4569 {
4570 unsigned int othercase;
4571 if ((othercase = UCD_OTHERCASE(c)) != c)
4572 {
4573 *class_uchardata++ = XCL_SINGLE;
4574 class_uchardata += PRIV(ord2utf)(othercase, class_uchardata);
4575 }
4576 }
4577 #endif /* SUPPORT_UCP */
4578
4579 }
4580 else
4581 #endif /* SUPPORT_UTF || COMPILE_PCRE16 */
4582
4583 /* Handle a single-byte character */
4584 {
4585 class_has_8bitchar = 1;
4586 classbits[c/8] |= (1 << (c&7));
4587 if ((options & PCRE_CASELESS) != 0)
4588 {
4589 c = cd->fcc[c]; /* flip case */
4590 classbits[c/8] |= (1 << (c&7));
4591 }
4592 }
4593 }
4594
4595 /* Loop until ']' reached. This "while" is the end of the "do" far above.
4596 If we are at the end of an internal nested string, revert to the outer
4597 string. */
4598
4599 while (((c = *(++ptr)) != 0 ||
4600 (nestptr != NULL &&
4601 (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != 0)) &&
4602 (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
4603
4604 /* Check for missing terminating ']' */
4605
4606 if (c == 0)
4607 {
4608 *errorcodeptr = ERR6;
4609 goto FAILED;
4610 }
4611
4612 /* If this is the first thing in the branch, there can be no first char
4613 setting, whatever the repeat count. Any reqchar setting must remain
4614 unchanged after any kind of repeat. */
4615
4616 if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4617 zerofirstchar = firstchar;
4618 zeroreqchar = reqchar;
4619
4620 /* If there are characters with values > 255, we have to compile an
4621 extended class, with its own opcode, unless there was a negated special
4622 such as \S in the class, and PCRE_UCP is not set, because in that case all
4623 characters > 255 are in the class, so any that were explicitly given as
4624 well can be ignored. If (when there are explicit characters > 255 that must
4625 be listed) there are no characters < 256, we can omit the bitmap in the
4626 actual compiled code. */
4627
4628 #ifdef SUPPORT_UTF
4629 if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
4630 #elif !defined COMPILE_PCRE8
4631 if (xclass && !should_flip_negation)
4632 #endif
4633 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4634 {
4635 *class_uchardata++ = XCL_END; /* Marks the end of extra data */
4636 *code++ = OP_XCLASS;
4637 code += LINK_SIZE;
4638 *code = negate_class? XCL_NOT:0;
4639
4640 /* If the map is required, move up the extra data to make room for it;
4641 otherwise just move the code pointer to the end of the extra data. */
4642
4643 if (class_has_8bitchar > 0)
4644 {
4645 *code++ |= XCL_MAP;
4646 memmove(code + (32 / sizeof(pcre_uchar)), code,
4647 IN_UCHARS(class_uchardata - code));
4648 memcpy(code, classbits, 32);
4649 code = class_uchardata + (32 / sizeof(pcre_uchar));
4650 }
4651 else code = class_uchardata;
4652
4653 /* Now fill in the complete length of the item */
4654
4655 PUT(previous, 1, (int)(code - previous));
4656 break; /* End of class handling */
4657 }
4658 #endif
4659
4660 /* If there are no characters > 255, or they are all to be included or
4661 excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
4662 whole class was negated and whether there were negative specials such as \S
4663 (non-UCP) in the class. Then copy the 32-byte map into the code vector,
4664 negating it if necessary. */
4665
4666 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
4667 if (lengthptr == NULL) /* Save time in the pre-compile phase */
4668 {
4669 if (negate_class)
4670 for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
4671 memcpy(code, classbits, 32);
4672 }
4673 code += 32 / sizeof(pcre_uchar);
4674 NOT_CHAR:
4675 break;
4676
4677
4678 /* ===================================================================*/
4679 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
4680 has been tested above. */
4681
4682 case CHAR_LEFT_CURLY_BRACKET:
4683 if (!is_quantifier) goto NORMAL_CHAR;
4684 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
4685 if (*errorcodeptr != 0) goto FAILED;
4686 goto REPEAT;
4687
4688 case CHAR_ASTERISK:
4689 repeat_min = 0;
4690 repeat_max = -1;
4691 goto REPEAT;
4692
4693 case CHAR_PLUS:
4694 repeat_min = 1;
4695 repeat_max = -1;
4696 goto REPEAT;
4697
4698 case CHAR_QUESTION_MARK:
4699 repeat_min = 0;
4700 repeat_max = 1;
4701
4702 REPEAT:
4703 if (previous == NULL)
4704 {
4705 *errorcodeptr = ERR9;
4706 goto FAILED;
4707 }
4708
4709 if (repeat_min == 0)
4710 {
4711 firstchar = zerofirstchar; /* Adjust for zero repeat */
4712 reqchar = zeroreqchar; /* Ditto */
4713 }
4714
4715 /* Remember whether this is a variable length repeat */
4716
4717 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
4718
4719 op_type = 0; /* Default single-char op codes */
4720 possessive_quantifier = FALSE; /* Default not possessive quantifier */
4721
4722 /* Save start of previous item, in case we have to move it up in order to
4723 insert something before it. */
4724
4725 tempcode = previous;
4726
4727 /* If the next character is '+', we have a possessive quantifier. This
4728 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
4729 If the next character is '?' this is a minimizing repeat, by default,
4730 but if PCRE_UNGREEDY is set, it works the other way round. We change the
4731 repeat type to the non-default. */
4732
4733 if (ptr[1] == CHAR_PLUS)
4734 {
4735 repeat_type = 0; /* Force greedy */
4736 possessive_quantifier = TRUE;
4737 ptr++;
4738 }
4739 else if (ptr[1] == CHAR_QUESTION_MARK)
4740 {
4741 repeat_type = greedy_non_default;
4742 ptr++;
4743 }
4744 else repeat_type = greedy_default;
4745
4746 /* If previous was a recursion call, wrap it in atomic brackets so that
4747 previous becomes the atomic group. All recursions were so wrapped in the
4748 past, but it no longer happens for non-repeated recursions. In fact, the
4749 repeated ones could be re-implemented independently so as not to need this,
4750 but for the moment we rely on the code for repeating groups. */
4751
4752 if (*previous == OP_RECURSE)
4753 {
4754 memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
4755 *previous = OP_ONCE;
4756 PUT(previous, 1, 2 + 2*LINK_SIZE);
4757 previous[2 + 2*LINK_SIZE] = OP_KET;
4758 PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
4759 code += 2 + 2 * LINK_SIZE;
4760 length_prevgroup = 3 + 3*LINK_SIZE;
4761
4762 /* When actually compiling, we need to check whether this was a forward
4763 reference, and if so, adjust the offset. */
4764
4765 if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
4766 {
4767 int offset = GET(cd->hwm, -LINK_SIZE);
4768 if (offset == previous + 1 - cd->start_code)
4769 PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
4770 }
4771 }
4772
4773 /* Now handle repetition for the different types of item. */
4774
4775 /* If previous was a character match, abolish the item and generate a
4776 repeat item instead. If a char item has a minimum of more than one, ensure
4777 that it is set in reqchar - it might not be if a sequence such as x{3} is
4778 the first thing in a branch because the x will have gone into firstchar
4779 instead. */
4780
4781 if (*previous == OP_CHAR || *previous == OP_CHARI)
4782 {
4783 op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR;
4784
4785 /* Deal with UTF characters that take up more than one character. It's
4786 easier to write this out separately than try to macrify it. Use c to
4787 hold the length of the character in bytes, plus UTF_LENGTH to flag that
4788 it's a length rather than a small character. */
4789
4790 #ifdef SUPPORT_UTF
4791 if (utf && NOT_FIRSTCHAR(code[-1]))
4792 {
4793 pcre_uchar *lastchar = code - 1;
4794 BACKCHAR(lastchar);
4795 c = (int)(code - lastchar); /* Length of UTF-8 character */
4796 memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
4797 c |= UTF_LENGTH; /* Flag c as a length */
4798 }
4799 else
4800 #endif /* SUPPORT_UTF */
4801
4802 /* Handle the case of a single character - either with no UTF support, or
4803 with UTF disabled, or for a single character UTF character. */
4804 {
4805 c = code[-1];
4806 if (repeat_min > 1) reqchar = c | req_caseopt | cd->req_varyopt;
4807 }
4808
4809 /* If the repetition is unlimited, it pays to see if the next thing on
4810 the line is something that cannot possibly match this character. If so,
4811 automatically possessifying this item gains some performance in the case
4812 where the match fails. */
4813
4814 if (!possessive_quantifier &&
4815 repeat_max < 0 &&
4816 check_auto_possessive(previous, utf, ptr + 1, options, cd))
4817 {
4818 repeat_type = 0; /* Force greedy */
4819 possessive_quantifier = TRUE;
4820 }
4821
4822 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
4823 }
4824
4825 /* If previous was a single negated character ([^a] or similar), we use
4826 one of the special opcodes, replacing it. The code is shared with single-
4827 character repeats by setting op_type to add a suitable offset into
4828 repeat_type. We can also test for auto-possessification. OP_NOT and OP_NOTI
4829 are currently used only for single-byte chars. */
4830
4831 else if (*previous == OP_NOT || *previous == OP_NOTI)
4832 {
4833 op_type = ((*previous == OP_NOT)? OP_NOTSTAR : OP_NOTSTARI) - OP_STAR;
4834 c = previous[1];
4835 if (!possessive_quantifier &&
4836 repeat_max < 0 &&
4837 check_auto_possessive(previous, utf, ptr + 1, options, cd))
4838 {
4839 repeat_type = 0; /* Force greedy */
4840 possessive_quantifier = TRUE;
4841 }
4842 goto OUTPUT_SINGLE_REPEAT;
4843 }
4844
4845 /* If previous was a character type match (\d or similar), abolish it and
4846 create a suitable repeat item. The code is shared with single-character
4847 repeats by setting op_type to add a suitable offset into repeat_type. Note
4848 that the Unicode property types will be present only when SUPPORT_UCP is
4849 defined, but we don't wrap the little bits of code here because it just
4850 makes it horribly messy. */
4851
4852 else if (*previous < OP_EODN)
4853 {
4854 pcre_uchar *oldcode;
4855 int prop_type, prop_value;
4856 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
4857 c = *previous;
4858
4859 if (!possessive_quantifier &&
4860 repeat_max < 0 &&
4861 check_auto_possessive(previous, utf, ptr + 1, options, cd))
4862 {
4863 repeat_type = 0; /* Force greedy */
4864 possessive_quantifier = TRUE;
4865 }
4866
4867 OUTPUT_SINGLE_REPEAT:
4868 if (*previous == OP_PROP || *previous == OP_NOTPROP)
4869 {
4870 prop_type = previous[1];
4871 prop_value = previous[2];
4872 }
4873 else prop_type = prop_value = -1;
4874
4875 oldcode = code;
4876 code = previous; /* Usually overwrite previous item */
4877
4878 /* If the maximum is zero then the minimum must also be zero; Perl allows
4879 this case, so we do too - by simply omitting the item altogether. */
4880
4881 if (repeat_max == 0) goto END_REPEAT;
4882
4883 /*--------------------------------------------------------------------*/
4884 /* This code is obsolete from release 8.00; the restriction was finally
4885 removed: */
4886
4887 /* All real repeats make it impossible to handle partial matching (maybe
4888 one day we will be able to remove this restriction). */
4889
4890 /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4891 /*--------------------------------------------------------------------*/
4892
4893 /* Combine the op_type with the repeat_type */
4894
4895 repeat_type += op_type;
4896
4897 /* A minimum of zero is handled either as the special case * or ?, or as
4898 an UPTO, with the maximum given. */
4899
4900 if (repeat_min == 0)
4901 {
4902 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
4903 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
4904 else
4905 {
4906 *code++ = OP_UPTO + repeat_type;
4907 PUT2INC(code, 0, repeat_max);
4908 }
4909 }
4910
4911 /* A repeat minimum of 1 is optimized into some special cases. If the
4912 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
4913 left in place and, if the maximum is greater than 1, we use OP_UPTO with
4914 one less than the maximum. */
4915
4916 else if (repeat_min == 1)
4917 {
4918 if (repeat_max == -1)
4919 *code++ = OP_PLUS + repeat_type;
4920 else
4921 {
4922 code = oldcode; /* leave previous item in place */
4923 if (repeat_max == 1) goto END_REPEAT;
4924 *code++ = OP_UPTO + repeat_type;
4925 PUT2INC(code, 0, repeat_max - 1);
4926 }
4927 }
4928
4929 /* The case {n,n} is just an EXACT, while the general case {n,m} is
4930 handled as an EXACT followed by an UPTO. */
4931
4932 else
4933 {
4934 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
4935 PUT2INC(code, 0, repeat_min);
4936
4937 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
4938 we have to insert the character for the previous code. For a repeated
4939 Unicode property match, there are two extra bytes that define the
4940 required property. In UTF-8 mode, long characters have their length in
4941 c, with the UTF_LENGTH bit as a flag. */
4942
4943 if (repeat_max < 0)
4944 {
4945 #ifdef SUPPORT_UTF
4946 if (utf && (c & UTF_LENGTH) != 0)
4947 {
4948 memcpy(code, utf_chars, IN_UCHARS(c & 7));
4949 code += c & 7;
4950 }
4951 else
4952 #endif
4953 {
4954 *code++ = c;
4955 if (prop_type >= 0)
4956 {
4957 *code++ = prop_type;
4958 *code++ = prop_value;
4959 }
4960 }
4961 *code++ = OP_STAR + repeat_type;
4962 }
4963
4964 /* Else insert an UPTO if the max is greater than the min, again
4965 preceded by the character, for the previously inserted code. If the
4966 UPTO is just for 1 instance, we can use QUERY instead. */
4967
4968 else if (repeat_max != repeat_min)
4969 {
4970 #ifdef SUPPORT_UTF
4971 if (utf && (c & UTF_LENGTH) != 0)
4972 {
4973 memcpy(code, utf_chars, IN_UCHARS(c & 7));
4974 code += c & 7;
4975 }
4976 else
4977 #endif
4978 *code++ = c;
4979 if (prop_type >= 0)
4980 {
4981 *code++ = prop_type;
4982 *code++ = prop_value;
4983 }
4984 repeat_max -= repeat_min;
4985
4986 if (repeat_max == 1)
4987 {
4988 *code++ = OP_QUERY + repeat_type;
4989 }
4990 else
4991 {
4992 *code++ = OP_UPTO + repeat_type;
4993 PUT2INC(code, 0, repeat_max);
4994 }
4995 }
4996 }
4997
4998 /* The character or character type itself comes last in all cases. */
4999
5000 #ifdef SUPPORT_UTF
5001 if (utf && (c & UTF_LENGTH) != 0)
5002 {
5003 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5004 code += c & 7;
5005 }
5006 else
5007 #endif
5008 *code++ = c;
5009
5010 /* For a repeated Unicode property match, there are two extra bytes that
5011 define the required property. */
5012
5013 #ifdef SUPPORT_UCP
5014 if (prop_type >= 0)
5015 {
5016 *code++ = prop_type;
5017 *code++ = prop_value;
5018 }
5019 #endif
5020 }
5021
5022 /* If previous was a character class or a back reference, we put the repeat
5023 stuff after it, but just skip the item if the repeat was {0,0}. */
5024
5025 else if (*previous == OP_CLASS ||
5026 *previous == OP_NCLASS ||
5027 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5028 *previous == OP_XCLASS ||
5029 #endif
5030 *previous == OP_REF ||
5031 *previous == OP_REFI)
5032 {
5033 if (repeat_max == 0)
5034 {
5035 code = previous;
5036 goto END_REPEAT;
5037 }
5038
5039 /*--------------------------------------------------------------------*/
5040 /* This code is obsolete from release 8.00; the restriction was finally
5041 removed: */
5042
5043 /* All real repeats make it impossible to handle partial matching (maybe
5044 one day we will be able to remove this restriction). */
5045
5046 /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
5047 /*--------------------------------------------------------------------*/
5048
5049 if (repeat_min == 0 && repeat_max == -1)
5050 *code++ = OP_CRSTAR + repeat_type;
5051 else if (repeat_min == 1 && repeat_max == -1)
5052 *code++ = OP_CRPLUS + repeat_type;
5053 else if (repeat_min == 0 && repeat_max == 1)
5054 *code++ = OP_CRQUERY + repeat_type;
5055 else
5056 {
5057 *code++ = OP_CRRANGE + repeat_type;
5058 PUT2INC(code, 0, repeat_min);
5059 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
5060 PUT2INC(code, 0, repeat_max);
5061 }
5062 }
5063
5064 /* If previous was a bracket group, we may have to replicate it in certain
5065 cases. Note that at this point we can encounter only the "basic" bracket
5066 opcodes such as BRA and CBRA, as this is the place where they get converted
5067 into the more special varieties such as BRAPOS and SBRA. A test for >=
5068 OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5069 ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow
5070 repetition of assertions, but now it does, for Perl compatibility. */
5071
5072 else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5073 {
5074 register int i;
5075 int len = (int)(code - previous);
5076 pcre_uchar *bralink = NULL;
5077 pcre_uchar *brazeroptr = NULL;
5078
5079 /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
5080 we just ignore the repeat. */
5081
5082 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
5083 goto END_REPEAT;
5084
5085 /* There is no sense in actually repeating assertions. The only potential
5086 use of repetition is in cases when the assertion is optional. Therefore,
5087 if the minimum is greater than zero, just ignore the repeat. If the
5088 maximum is not zero or one, set it to 1. */
5089
5090 if (*previous < OP_ONCE) /* Assertion */
5091 {
5092 if (repeat_min > 0) goto END_REPEAT;
5093 if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
5094 }
5095
5096 /* The case of a zero minimum is special because of the need to stick
5097 OP_BRAZERO in front of it, and because the group appears once in the
5098 data, whereas in other cases it appears the minimum number of times. For
5099 this reason, it is simplest to treat this case separately, as otherwise
5100 the code gets far too messy. There are several special subcases when the
5101 minimum is zero. */
5102
5103 if (repeat_min == 0)
5104 {
5105 /* If the maximum is also zero, we used to just omit the group from the
5106 output altogether, like this:
5107
5108 ** if (repeat_max == 0)
5109 ** {
5110 ** code = previous;
5111 ** goto END_REPEAT;
5112 ** }
5113
5114 However, that fails when a group or a subgroup within it is referenced
5115 as a subroutine from elsewhere in the pattern, so now we stick in
5116 OP_SKIPZERO in front of it so that it is skipped on execution. As we
5117 don't have a list of which groups are referenced, we cannot do this
5118 selectively.
5119
5120 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
5121 and do no more at this point. However, we do need to adjust any
5122 OP_RECURSE calls inside the group that refer to the group itself or any
5123 internal or forward referenced group, because the offset is from the
5124 start of the whole regex. Temporarily terminate the pattern while doing
5125 this. */
5126
5127 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
5128 {
5129 *code = OP_END;
5130 adjust_recurse(previous, 1, utf, cd, save_hwm);
5131 memmove(previous + 1, previous, IN_UCHARS(len));
5132 code++;
5133 if (repeat_max == 0)
5134 {
5135 *previous++ = OP_SKIPZERO;
5136 goto END_REPEAT;
5137 }
5138 brazeroptr = previous; /* Save for possessive optimizing */
5139 *previous++ = OP_BRAZERO + repeat_type;
5140 }
5141
5142 /* If the maximum is greater than 1 and limited, we have to replicate
5143 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
5144 The first one has to be handled carefully because it's the original
5145 copy, which has to be moved up. The remainder can be handled by code
5146 that is common with the non-zero minimum case below. We have to
5147 adjust the value of repeat_max, since one less copy is required. Once
5148 again, we may have to adjust any OP_RECURSE calls inside the group. */
5149
5150 else
5151 {
5152 int offset;
5153 *code = OP_END;
5154 adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);
5155 memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
5156 code += 2 + LINK_SIZE;
5157 *previous++ = OP_BRAZERO + repeat_type;
5158 *previous++ = OP_BRA;
5159
5160 /* We chain together the bracket offset fields that have to be
5161 filled in later when the ends of the brackets are reached. */
5162
5163 offset = (bralink == NULL)? 0 : (int)(previous - bralink);
5164 bralink = previous;
5165 PUTINC(previous, 0, offset);
5166 }
5167
5168 repeat_max--;
5169 }
5170
5171 /* If the minimum is greater than zero, replicate the group as many
5172 times as necessary, and adjust the maximum to the number of subsequent
5173 copies that we need. If we set a first char from the group, and didn't
5174 set a required char, copy the latter from the former. If there are any
5175 forward reference subroutine calls in the group, there will be entries on
5176 the workspace list; replicate these with an appropriate increment. */
5177
5178 else
5179 {
5180 if (repeat_min > 1)
5181 {
5182 /* In the pre-compile phase, we don't actually do the replication. We
5183 just adjust the length as if we had. Do some paranoid checks for
5184 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
5185 integer type when available, otherwise double. */
5186
5187 if (lengthptr != NULL)
5188 {
5189 int delta = (repeat_min - 1)*length_prevgroup;
5190 if ((INT64_OR_DOUBLE)(repeat_min - 1)*
5191 (INT64_OR_DOUBLE)length_prevgroup >
5192 (INT64_OR_DOUBLE)INT_MAX ||
5193 OFLOW_MAX - *lengthptr < delta)
5194 {
5195 *errorcodeptr = ERR20;
5196 goto FAILED;
5197 }
5198 *lengthptr += delta;
5199 }
5200
5201 /* This is compiling for real. If there is a set first byte for
5202 the group, and we have not yet set a "required byte", set it. Make
5203 sure there is enough workspace for copying forward references before
5204 doing the copy. */
5205
5206 else
5207 {
5208 if (groupsetfirstchar && reqchar < 0) reqchar = firstchar;
5209
5210 for (i = 1; i < repeat_min; i++)
5211 {
5212 pcre_uchar *hc;
5213 pcre_uchar *this_hwm = cd->hwm;
5214 memcpy(code, previous, IN_UCHARS(len));
5215
5216 while (cd->hwm > cd->start_workspace + cd->workspace_size -
5217 WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5218 {
5219 int save_offset = save_hwm - cd->start_workspace;
5220 int this_offset = this_hwm - cd->start_workspace;
5221 *errorcodeptr = expand_workspace(cd);
5222 if (*errorcodeptr != 0) goto FAILED;
5223 save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5224 this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5225 }
5226
5227 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5228 {
5229 PUT(cd->hwm, 0, GET(hc, 0) + len);
5230 cd->hwm += LINK_SIZE;
5231 }
5232 save_hwm = this_hwm;
5233 code += len;
5234 }
5235 }
5236 }
5237
5238 if (repeat_max > 0) repeat_max -= repeat_min;
5239 }
5240
5241 /* This code is common to both the zero and non-zero minimum cases. If
5242 the maximum is limited, it replicates the group in a nested fashion,
5243 remembering the bracket starts on a stack. In the case of a zero minimum,
5244 the first one was set up above. In all cases the repeat_max now specifies
5245 the number of additional copies needed. Again, we must remember to
5246 replicate entries on the forward reference list. */
5247
5248 if (repeat_max >= 0)
5249 {
5250 /* In the pre-compile phase, we don't actually do the replication. We
5251 just adjust the length as if we had. For each repetition we must add 1
5252 to the length for BRAZERO and for all but the last repetition we must
5253 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
5254 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
5255 a 64-bit integer type when available, otherwise double. */
5256
5257 if (lengthptr != NULL && repeat_max > 0)
5258 {
5259 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
5260 2 - 2*LINK_SIZE; /* Last one doesn't nest */
5261 if ((INT64_OR_DOUBLE)repeat_max *
5262 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
5263 > (INT64_OR_DOUBLE)INT_MAX ||
5264 OFLOW_MAX - *lengthptr < delta)
5265 {
5266 *errorcodeptr = ERR20;
5267 goto FAILED;
5268 }
5269 *lengthptr += delta;
5270 }
5271
5272 /* This is compiling for real */
5273
5274 else for (i = repeat_max - 1; i >= 0; i--)
5275 {
5276 pcre_uchar *hc;
5277 pcre_uchar *this_hwm = cd->hwm;
5278
5279 *code++ = OP_BRAZERO + repeat_type;
5280
5281 /* All but the final copy start a new nesting, maintaining the
5282 chain of brackets outstanding. */
5283
5284 if (i != 0)
5285 {
5286 int offset;
5287 *code++ = OP_BRA;
5288 offset = (bralink == NULL)? 0 : (int)(code - bralink);
5289 bralink = code;
5290 PUTINC(code, 0, offset);
5291 }
5292
5293 memcpy(code, previous, IN_UCHARS(len));
5294
5295 /* Ensure there is enough workspace for forward references before
5296 copying them. */
5297
5298 while (cd->hwm > cd->start_workspace + cd->workspace_size -
5299 WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5300 {
5301 int save_offset = save_hwm - cd->start_workspace;
5302 int this_offset = this_hwm - cd->start_workspace;
5303 *errorcodeptr = expand_workspace(cd);
5304 if (*errorcodeptr != 0) goto FAILED;
5305 save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5306 this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5307 }
5308
5309 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5310 {
5311 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
5312 cd->hwm += LINK_SIZE;
5313 }
5314 save_hwm = this_hwm;
5315 code += len;
5316 }
5317
5318 /* Now chain through the pending brackets, and fill in their length
5319 fields (which are holding the chain links pro tem). */
5320
5321 while (bralink != NULL)
5322 {
5323 int oldlinkoffset;
5324 int offset = (int)(code - bralink + 1);
5325 pcre_uchar *bra = code - offset;
5326 oldlinkoffset = GET(bra, 1);
5327 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
5328 *code++ = OP_KET;
5329 PUTINC(code, 0, offset);
5330 PUT(bra, 1, offset);
5331 }
5332 }
5333
5334 /* If the maximum is unlimited, set a repeater in the final copy. For
5335 ONCE brackets, that's all we need to do. However, possessively repeated
5336 ONCE brackets can be converted into non-capturing brackets, as the
5337 behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
5338 deal with possessive ONCEs specially.
5339
5340 Otherwise, when we are doing the actual compile phase, check to see
5341 whether this group is one that could match an empty string. If so,
5342 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
5343 that runtime checking can be done. [This check is also applied to ONCE
5344 groups at runtime, but in a different way.]
5345
5346 Then, if the quantifier was possessive and the bracket is not a
5347 conditional, we convert the BRA code to the POS form, and the KET code to
5348 KETRPOS. (It turns out to be convenient at runtime to detect this kind of
5349 subpattern at both the start and at the end.) The use of special opcodes
5350 makes it possible to reduce greatly the stack usage in pcre_exec(). If
5351 the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
5352
5353 Then, if the minimum number of matches is 1 or 0, cancel the possessive
5354 flag so that the default action below, of wrapping everything inside
5355 atomic brackets, does not happen. When the minimum is greater than 1,
5356 there will be earlier copies of the group, and so we still have to wrap
5357 the whole thing. */
5358
5359 else
5360 {
5361 pcre_uchar *ketcode = code - 1 - LINK_SIZE;
5362 pcre_uchar *bracode = ketcode - GET(ketcode, 1);
5363
5364 /* Convert possessive ONCE brackets to non-capturing */
5365
5366 if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
5367 possessive_quantifier) *bracode = OP_BRA;
5368
5369 /* For non-possessive ONCE brackets, all we need to do is to
5370 set the KET. */
5371
5372 if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
5373 *ketcode = OP_KETRMAX + repeat_type;
5374
5375 /* Handle non-ONCE brackets and possessive ONCEs (which have been
5376 converted to non-capturing above). */
5377
5378 else
5379 {
5380 /* In the compile phase, check for empty string matching. */
5381
5382 if (lengthptr == NULL)
5383 {
5384 pcre_uchar *scode = bracode;
5385 do
5386 {
5387 if (could_be_empty_branch(scode, ketcode, utf, cd))
5388 {
5389 *bracode += OP_SBRA - OP_BRA;
5390 break;
5391 }
5392 scode += GET(scode, 1);
5393 }
5394 while (*scode == OP_ALT);
5395 }
5396
5397 /* Handle possessive quantifiers. */
5398
5399 if (possessive_quantifier)
5400 {
5401 /* For COND brackets, we wrap the whole thing in a possessively
5402 repeated non-capturing bracket, because we have not invented POS
5403 versions of the COND opcodes. Because we are moving code along, we
5404 must ensure that any pending recursive references are updated. */
5405
5406 if (*bracode == OP_COND || *bracode == OP_SCOND)
5407 {
5408 int nlen = (int)(code - bracode);
5409 *code = OP_END;
5410 adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm);
5411 memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
5412 code += 1 + LINK_SIZE;
5413 nlen += 1 + LINK_SIZE;
5414 *bracode = OP_BRAPOS;
5415 *code++ = OP_KETRPOS;
5416 PUTINC(code, 0, nlen);
5417 PUT(bracode, 1, nlen);
5418 }
5419
5420 /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
5421
5422 else
5423 {
5424 *bracode += 1; /* Switch to xxxPOS opcodes */
5425 *ketcode = OP_KETRPOS;
5426 }
5427
5428 /* If the minimum is zero, mark it as possessive, then unset the
5429 possessive flag when the minimum is 0 or 1. */
5430
5431 if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
5432 if (repeat_min < 2) possessive_quantifier = FALSE;
5433 }
5434
5435 /* Non-possessive quantifier */
5436
5437 else *ketcode = OP_KETRMAX + repeat_type;
5438 }
5439 }
5440 }
5441
5442 /* If previous is OP_FAIL, it was generated by an empty class [] in
5443 JavaScript mode. The other ways in which OP_FAIL can be generated, that is
5444 by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
5445 error above. We can just ignore the repeat in JS case. */
5446
5447 else if (*previous == OP_FAIL) goto END_REPEAT;
5448
5449 /* Else there's some kind of shambles */
5450
5451 else
5452 {
5453 *errorcodeptr = ERR11;
5454 goto FAILED;
5455 }
5456
5457 /* If the character following a repeat is '+', or if certain optimization
5458 tests above succeeded, possessive_quantifier is TRUE. For some opcodes,
5459 there are special alternative opcodes for this case. For anything else, we
5460 wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'
5461 notation is just syntactic sugar, taken from Sun's Java package, but the
5462 special opcodes can optimize it.
5463
5464 Some (but not all) possessively repeated subpatterns have already been
5465 completely handled in the code just above. For them, possessive_quantifier
5466 is always FALSE at this stage.
5467
5468 Note that the repeated item starts at tempcode, not at previous, which
5469 might be the first part of a string whose (former) last char we repeated.
5470
5471 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
5472 an 'upto' may follow. We skip over an 'exact' item, and then test the
5473 length of what remains before proceeding. */
5474
5475 if (possessive_quantifier)
5476 {
5477 int len;
5478
5479 if (*tempcode == OP_TYPEEXACT)
5480 tempcode += PRIV(OP_lengths)[*tempcode] +
5481 ((tempcode[1 + IMM2_SIZE] == OP_PROP
5482 || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
5483
5484 else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
5485 {
5486 tempcode += PRIV(OP_lengths)[*tempcode];
5487 #ifdef SUPPORT_UTF
5488 if (utf && HAS_EXTRALEN(tempcode[-1]))
5489 tempcode += GET_EXTRALEN(tempcode[-1]);
5490 #endif
5491 }
5492
5493 len = (int)(code - tempcode);
5494 if (len > 0) switch (*tempcode)
5495 {
5496 case OP_STAR: *tempcode = OP_POSSTAR; break;
5497 case OP_PLUS: *tempcode = OP_POSPLUS; break;
5498 case OP_QUERY: *tempcode = OP_POSQUERY; break;
5499 case OP_UPTO: *tempcode = OP_POSUPTO; break;
5500
5501 case OP_STARI: *tempcode = OP_POSSTARI; break;
5502 case OP_PLUSI: *tempcode = OP_POSPLUSI; break;
5503 case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
5504 case OP_UPTOI: *tempcode = OP_POSUPTOI; break;
5505
5506 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
5507 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
5508 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
5509 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
5510
5511 case OP_NOTSTARI: *tempcode = OP_NOTPOSSTARI; break;
5512 case OP_NOTPLUSI: *tempcode = OP_NOTPOSPLUSI; break;
5513 case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
5514 case OP_NOTUPTOI: *tempcode = OP_NOTPOSUPTOI; break;
5515
5516 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
5517 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
5518 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
5519 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
5520
5521 /* Because we are moving code along, we must ensure that any
5522 pending recursive references are updated. */
5523
5524 default:
5525 *code = OP_END;
5526 adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
5527 memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
5528 code += 1 + LINK_SIZE;
5529 len += 1 + LINK_SIZE;
5530 tempcode[0] = OP_ONCE;
5531 *code++ = OP_KET;
5532 PUTINC(code, 0, len);
5533 PUT(tempcode, 1, len);
5534 break;
5535 }
5536 }
5537
5538 /* In all cases we no longer have a previous item. We also set the
5539 "follows varying string" flag for subsequently encountered reqchars if
5540 it isn't already set and we have just passed a varying length item. */
5541
5542 END_REPEAT:
5543 previous = NULL;
5544 cd->req_varyopt |= reqvary;
5545 break;
5546
5547
5548 /* ===================================================================*/
5549 /* Start of nested parenthesized sub-expression, or comment or lookahead or
5550 lookbehind or option setting or condition or all the other extended
5551 parenthesis forms. */
5552
5553 case CHAR_LEFT_PARENTHESIS:
5554 newoptions = options;
5555 skipbytes = 0;
5556 bravalue = OP_CBRA;
5557 save_hwm = cd->hwm;
5558 reset_bracount = FALSE;
5559
5560 /* First deal with various "verbs" that can be introduced by '*'. */
5561
5562 ptr++;
5563 if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
5564 || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
5565 {
5566 int i, namelen;
5567 int arglen = 0;
5568 const char *vn = verbnames;
5569 const pcre_uchar *name = ptr + 1;
5570 const pcre_uchar *arg = NULL;
5571 previous = NULL;
5572 ptr++;
5573 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
5574 namelen = (int)(ptr - name);
5575
5576 /* It appears that Perl allows any characters whatsoever, other than
5577 a closing parenthesis, to appear in arguments, so we no longer insist on
5578 letters, digits, and underscores. */
5579
5580 if (*ptr == CHAR_COLON)
5581 {
5582 arg = ++ptr;
5583 while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
5584 arglen = (int)(ptr - arg);
5585 }
5586
5587 if (*ptr != CHAR_RIGHT_PARENTHESIS)
5588 {
5589 *errorcodeptr = ERR60;
5590 goto FAILED;
5591 }
5592
5593 /* Scan the table of verb names */
5594
5595 for (i = 0; i < verbcount; i++)
5596 {
5597 if (namelen == verbs[i].len &&
5598 STRNCMP_UC_C8(name, vn, namelen) == 0)
5599 {
5600 /* Check for open captures before ACCEPT and convert it to
5601 ASSERT_ACCEPT if in an assertion. */
5602
5603 if (verbs[i].op == OP_ACCEPT)
5604 {
5605 open_capitem *oc;
5606 if (arglen != 0)
5607 {
5608 *errorcodeptr = ERR59;
5609 goto FAILED;
5610 }
5611 cd->had_accept = TRUE;
5612 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
5613 {
5614 *code++ = OP_CLOSE;
5615 PUT2INC(code, 0, oc->number);
5616 }
5617 *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
5618
5619 /* Do not set firstchar after *ACCEPT */
5620 if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
5621 }
5622
5623 /* Handle other cases with/without an argument */
5624
5625 else if (arglen == 0)
5626 {
5627 if (verbs[i].op < 0) /* Argument is mandatory */
5628 {
5629 *errorcodeptr = ERR66;
5630 goto FAILED;
5631 }
5632 *code = verbs[i].op;
5633 if (*code++ == OP_THEN) cd->external_flags |= PCRE_HASTHEN;
5634 }
5635
5636 else
5637 {
5638 if (verbs[i].op_arg < 0) /* Argument is forbidden */
5639 {
5640 *errorcodeptr = ERR59;
5641 goto FAILED;
5642 }
5643 *code = verbs[i].op_arg;
5644 if (*code++ == OP_THEN_ARG) cd->external_flags |= PCRE_HASTHEN;
5645 *code++ = arglen;
5646 memcpy(code, arg, IN_UCHARS(arglen));
5647 code += arglen;
5648 *code++ = 0;
5649 }
5650
5651 break; /* Found verb, exit loop */
5652 }
5653
5654 vn += verbs[i].len + 1;
5655 }
5656
5657 if (i < verbcount) continue; /* Successfully handled a verb */
5658 *errorcodeptr = ERR60; /* Verb not recognized */
5659 goto FAILED;
5660 }
5661
5662 /* Deal with the extended parentheses; all are introduced by '?', and the
5663 appearance of any of them means that this is not a capturing group. */
5664
5665 else if (*ptr == CHAR_QUESTION_MARK)
5666 {
5667 int i, set, unset, namelen;
5668 int *optset;
5669 const pcre_uchar *name;
5670 pcre_uchar *slot;
5671
5672 switch (*(++ptr))
5673 {
5674 case CHAR_NUMBER_SIGN: /* Comment; skip to ket */
5675 ptr++;
5676 while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
5677 if (*ptr == 0)
5678 {
5679 *errorcodeptr = ERR18;
5680 goto FAILED;
5681 }
5682 continue;
5683
5684
5685 /* ------------------------------------------------------------ */
5686 case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
5687 reset_bracount = TRUE;
5688 /* Fall through */
5689
5690 /* ------------------------------------------------------------ */
5691 case CHAR_COLON: /* Non-capturing bracket */
5692 bravalue = OP_BRA;
5693 ptr++;
5694 break;
5695
5696
5697 /* ------------------------------------------------------------ */
5698 case CHAR_LEFT_PARENTHESIS:
5699 bravalue = OP_COND; /* Conditional group */
5700
5701 /* A condition can be an assertion, a number (referring to a numbered
5702 group), a name (referring to a named group), or 'R', referring to
5703 recursion. R<digits> and R&name are also permitted for recursion tests.
5704
5705 There are several syntaxes for testing a named group: (?(name)) is used
5706 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
5707
5708 There are two unfortunate ambiguities, caused by history. (a) 'R' can
5709 be the recursive thing or the name 'R' (and similarly for 'R' followed
5710 by digits), and (b) a number could be a name that consists of digits.
5711 In both cases, we look for a name first; if not found, we try the other
5712 cases. */
5713
5714 /* For conditions that are assertions, check the syntax, and then exit
5715 the switch. This will take control down to where bracketed groups,
5716 including assertions, are processed. */
5717
5718 if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
5719 ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
5720 break;
5721
5722 /* Most other conditions use OP_CREF (a couple change to OP_RREF
5723 below), and all need to skip 1+IMM2_SIZE bytes at the start of the group. */
5724
5725 code[1+LINK_SIZE] = OP_CREF;
5726 skipbytes = 1+IMM2_SIZE;
5727 refsign = -1;
5728
5729 /* Check for a test for recursion in a named group. */
5730
5731 if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
5732 {
5733 terminator = -1;
5734 ptr += 2;
5735 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
5736 }
5737
5738 /* Check for a test for a named group's having been set, using the Perl
5739 syntax (?(<name>) or (?('name') */
5740
5741 else if (ptr[1] == CHAR_LESS_THAN_SIGN)
5742 {
5743 terminator = CHAR_GREATER_THAN_SIGN;
5744 ptr++;
5745 }
5746 else if (ptr[1] == CHAR_APOSTROPHE)
5747 {
5748 terminator = CHAR_APOSTROPHE;
5749 ptr++;
5750 }
5751 else
5752 {
5753 terminator = 0;
5754 if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
5755 }
5756
5757 /* We now expect to read a name; anything else is an error */
5758
5759 if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)
5760 {
5761 ptr += 1; /* To get the right offset */
5762 *errorcodeptr = ERR28;
5763 goto FAILED;
5764 }
5765
5766 /* Read the name, but also get it as a number if it's all digits */
5767
5768 recno = 0;
5769 name = ++ptr;
5770 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
5771 {
5772 if (recno >= 0)
5773 recno = (IS_DIGIT(*ptr))? recno * 10 + *ptr - CHAR_0 : -1;
5774 ptr++;
5775 }
5776 namelen = (int)(ptr - name);
5777
5778 if ((terminator > 0 && *ptr++ != terminator) ||
5779 *ptr++ != CHAR_RIGHT_PARENTHESIS)
5780 {
5781 ptr--; /* Error offset */
5782 *errorcodeptr = ERR26;
5783 goto FAILED;
5784 }
5785
5786 /* Do no further checking in the pre-compile phase. */
5787
5788 if (lengthptr != NULL) break;
5789
5790 /* In the real compile we do the work of looking for the actual
5791 reference. If the string started with "+" or "-" we require the rest to
5792 be digits, in which case recno will be set. */
5793
5794 if (refsign > 0)
5795 {
5796 if (recno <= 0)
5797 {
5798 *errorcodeptr = ERR58;
5799 goto FAILED;
5800 }
5801 recno = (refsign == CHAR_MINUS)?
5802 cd->bracount - recno + 1 : recno +cd->bracount;
5803 if (recno <= 0 || recno > cd->final_bracount)
5804 {
5805 *errorcodeptr = ERR15;
5806 goto FAILED;
5807 }
5808 PUT2(code, 2+LINK_SIZE, recno);
5809 break;
5810 }
5811
5812 /* Otherwise (did not start with "+" or "-"), start by looking for the
5813 name. If we find a name, add one to the opcode to change OP_CREF or
5814 OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,
5815 except they record that the reference was originally to a name. The
5816 information is used to check duplicate names. */
5817
5818 slot = cd->name_table;
5819 for (i = 0; i < cd->names_found; i++)
5820 {
5821 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
5822 slot += cd->name_entry_size;
5823 }
5824
5825 /* Found a previous named subpattern */
5826
5827 if (i < cd->names_found)
5828 {
5829 recno = GET2(slot, 0);
5830 PUT2(code, 2+LINK_SIZE, recno);
5831 code[1+LINK_SIZE]++;
5832 }
5833
5834 /* Search the pattern for a forward reference */
5835
5836 else if ((i = find_parens(cd, name, namelen,
5837 (options & PCRE_EXTENDED) != 0, utf)) > 0)
5838 {
5839 PUT2(code, 2+LINK_SIZE, i);
5840 code[1+LINK_SIZE]++;
5841 }
5842
5843 /* If terminator == 0 it means that the name followed directly after
5844 the opening parenthesis [e.g. (?(abc)...] and in this case there are
5845 some further alternatives to try. For the cases where terminator != 0
5846 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
5847 now checked all the possibilities, so give an error. */
5848
5849 else if (terminator != 0)
5850 {
5851 *errorcodeptr = ERR15;
5852 goto FAILED;
5853 }
5854
5855 /* Check for (?(R) for recursion. Allow digits after R to specify a
5856 specific group number. */
5857
5858 else if (*name == CHAR_R)
5859 {
5860 recno = 0;
5861 for (i = 1; i < namelen; i++)
5862 {
5863 if (!IS_DIGIT(name[i]))
5864 {
5865 *errorcodeptr = ERR15;
5866 goto FAILED;
5867 }
5868 recno = recno * 10 + name[i] - CHAR_0;
5869 }
5870 if (recno == 0) recno = RREF_ANY;
5871 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
5872 PUT2(code, 2+LINK_SIZE, recno);
5873 }
5874
5875 /* Similarly, check for the (?(DEFINE) "condition", which is always
5876 false. */
5877
5878 else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
5879 {
5880 code[1+LINK_SIZE] = OP_DEF;
5881 skipbytes = 1;
5882 }
5883
5884 /* Check for the "name" actually being a subpattern number. We are
5885 in the second pass here, so final_bracount is set. */
5886
5887 else if (recno > 0 && recno <= cd->final_bracount)
5888 {
5889 PUT2(code, 2+LINK_SIZE, recno);
5890 }
5891
5892 /* Either an unidentified subpattern, or a reference to (?(0) */
5893
5894 else
5895 {
5896 *errorcodeptr = (recno == 0)? ERR35: ERR15;
5897 goto FAILED;
5898 }
5899 break;
5900
5901
5902 /* ------------------------------------------------------------ */
5903 case CHAR_EQUALS_SIGN: /* Positive lookahead */
5904 bravalue = OP_ASSERT;
5905 cd->assert_depth += 1;
5906 ptr++;
5907 break;
5908
5909
5910 /* ------------------------------------------------------------ */
5911 case CHAR_EXCLAMATION_MARK: /* Negative lookahead */
5912 ptr++;
5913 if (*ptr == CHAR_RIGHT_PARENTHESIS) /* Optimize (?!) */
5914 {
5915 *code++ = OP_FAIL;
5916 previous = NULL;
5917 continue;
5918 }
5919 bravalue = OP_ASSERT_NOT;
5920 cd->assert_depth += 1;
5921 break;
5922
5923
5924 /* ------------------------------------------------------------ */
5925 case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */
5926 switch (ptr[1])
5927 {
5928 case CHAR_EQUALS_SIGN: /* Positive lookbehind */
5929 bravalue = OP_ASSERTBACK;
5930 cd->assert_depth += 1;
5931 ptr += 2;
5932 break;
5933
5934 case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */
5935 bravalue = OP_ASSERTBACK_NOT;
5936 cd->assert_depth += 1;
5937 ptr += 2;
5938 break;
5939
5940 default: /* Could be name define, else bad */
5941 if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
5942 goto DEFINE_NAME;
5943 ptr++; /* Correct offset for error */
5944 *errorcodeptr = ERR24;
5945 goto FAILED;
5946 }
5947 break;
5948
5949
5950 /* ------------------------------------------------------------ */
5951 case CHAR_GREATER_THAN_SIGN: /* One-time brackets */
5952 bravalue = OP_ONCE;
5953 ptr++;
5954 break;
5955
5956
5957 /* ------------------------------------------------------------ */
5958 case CHAR_C: /* Callout - may be followed by digits; */
5959 previous_callout = code; /* Save for later completion */
5960 after_manual_callout = 1; /* Skip one item before completing */
5961 *code++ = OP_CALLOUT;
5962 {
5963 int n = 0;
5964 ptr++;
5965 while(IS_DIGIT(*ptr))
5966 n = n * 10 + *ptr++ - CHAR_0;
5967 if (*ptr != CHAR_RIGHT_PARENTHESIS)
5968 {
5969 *errorcodeptr = ERR39;
5970 goto FAILED;
5971 }
5972 if (n > 255)
5973 {
5974 *errorcodeptr = ERR38;
5975 goto FAILED;
5976 }
5977 *code++ = n;
5978 PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
5979 PUT(code, LINK_SIZE, 0); /* Default length */
5980 code += 2 * LINK_SIZE;
5981 }
5982 previous = NULL;
5983 continue;
5984
5985
5986 /* ------------------------------------------------------------ */
5987 case CHAR_P: /* Python-style named subpattern handling */
5988 if (*(++ptr) == CHAR_EQUALS_SIGN ||
5989 *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */
5990 {
5991 is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
5992 terminator = CHAR_RIGHT_PARENTHESIS;
5993 goto NAMED_REF_OR_RECURSE;
5994 }
5995 else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */
5996 {
5997 *errorcodeptr = ERR41;
5998 goto FAILED;
5999 }
6000 /* Fall through to handle (?P< as (?< is handled */
6001
6002
6003 /* ------------------------------------------------------------ */
6004 DEFINE_NAME: /* Come here from (?< handling */
6005 case CHAR_APOSTROPHE:
6006 {
6007 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
6008 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6009 name = ++ptr;
6010
6011 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6012 namelen = (int)(ptr - name);
6013
6014 /* In the pre-compile phase, just do a syntax check. */
6015
6016 if (lengthptr != NULL)
6017 {
6018 if (*ptr != terminator)
6019 {
6020 *errorcodeptr = ERR42;
6021 goto FAILED;
6022 }
6023 if (cd->names_found >= MAX_NAME_COUNT)
6024 {
6025 *errorcodeptr = ERR49;
6026 goto FAILED;
6027 }
6028 if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
6029 {
6030 cd->name_entry_size = namelen + IMM2_SIZE + 1;
6031 if (namelen > MAX_NAME_SIZE)
6032 {
6033 *errorcodeptr = ERR48;
6034 goto FAILED;
6035 }
6036 }
6037 }
6038
6039 /* In the real compile, create the entry in the table, maintaining
6040 alphabetical order. Duplicate names for different numbers are
6041 permitted only if PCRE_DUPNAMES is set. Duplicate names for the same
6042 number are always OK. (An existing number can be re-used if (?|
6043 appears in the pattern.) In either event, a duplicate name results in
6044 a duplicate entry in the table, even if the number is the same. This
6045 is because the number of names, and hence the table size, is computed
6046 in the pre-compile, and it affects various numbers and pointers which
6047 would all have to be modified, and the compiled code moved down, if
6048 duplicates with the same number were omitted from the table. This
6049 doesn't seem worth the hassle. However, *different* names for the
6050 same number are not permitted. */
6051
6052 else
6053 {
6054 BOOL dupname = FALSE;
6055 slot = cd->name_table;
6056
6057 for (i = 0; i < cd->names_found; i++)
6058 {
6059 int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(namelen));
6060 if (crc == 0)
6061 {
6062 if (slot[IMM2_SIZE+namelen] == 0)
6063 {
6064 if (GET2(slot, 0) != cd->bracount + 1 &&
6065 (options & PCRE_DUPNAMES) == 0)
6066 {
6067 *errorcodeptr = ERR43;
6068 goto FAILED;
6069 }
6070 else dupname = TRUE;
6071 }
6072 else crc = -1; /* Current name is a substring */
6073 }
6074
6075 /* Make space in the table and break the loop for an earlier
6076 name. For a duplicate or later name, carry on. We do this for
6077 duplicates so that in the simple case (when (?| is not used) they
6078 are in order of their numbers. */
6079
6080 if (crc < 0)
6081 {
6082 memmove(slot + cd->name_entry_size, slot,
6083 IN_UCHARS((cd->names_found - i) * cd->name_entry_size));
6084 break;
6085 }
6086
6087 /* Continue the loop for a later or duplicate name */
6088
6089 slot += cd->name_entry_size;
6090 }
6091
6092 /* For non-duplicate names, check for a duplicate number before
6093 adding the new name. */
6094
6095 if (!dupname)
6096 {
6097 pcre_uchar *cslot = cd->name_table;
6098 for (i = 0; i < cd->names_found; i++)
6099 {
6100 if (cslot != slot)
6101 {
6102 if (GET2(cslot, 0) == cd->bracount + 1)
6103 {
6104 *errorcodeptr = ERR65;
6105 goto FAILED;
6106 }
6107 }
6108 else i--;
6109 cslot += cd->name_entry_size;
6110 }
6111 }
6112
6113 PUT2(slot, 0, cd->bracount + 1);
6114 memcpy(slot + IMM2_SIZE, name, IN_UCHARS(namelen));
6115 slot[IMM2_SIZE + namelen] = 0;
6116 }
6117 }
6118
6119 /* In both pre-compile and compile, count the number of names we've
6120 encountered. */
6121
6122 cd->names_found++;
6123 ptr++; /* Move past > or ' */
6124 goto NUMBERED_GROUP;
6125
6126
6127 /* ------------------------------------------------------------ */
6128 case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */
6129 terminator = CHAR_RIGHT_PARENTHESIS;
6130 is_recurse = TRUE;
6131 /* Fall through */
6132
6133 /* We come here from the Python syntax above that handles both
6134 references (?P=name) and recursion (?P>name), as well as falling
6135 through from the Perl recursion syntax (?&name). We also come here from
6136 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
6137 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
6138
6139 NAMED_REF_OR_RECURSE:
6140 name = ++ptr;
6141 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6142 namelen = (int)(ptr - name);
6143
6144 /* In the pre-compile phase, do a syntax check. We used to just set
6145 a dummy reference number, because it was not used in the first pass.
6146 However, with the change of recursive back references to be atomic,
6147 we have to look for the number so that this state can be identified, as
6148 otherwise the incorrect length is computed. If it's not a backwards
6149 reference, the dummy number will do. */
6150
6151 if (lengthptr != NULL)
6152 {
6153 const pcre_uchar *temp;
6154
6155 if (namelen == 0)
6156 {
6157 *errorcodeptr = ERR62;
6158 goto FAILED;
6159 }
6160 if (*ptr != terminator)
6161 {
6162 *errorcodeptr = ERR42;
6163 goto FAILED;
6164 }
6165 if (namelen > MAX_NAME_SIZE)
6166 {
6167 *errorcodeptr = ERR48;
6168 goto FAILED;
6169 }
6170
6171 /* The name table does not exist in the first pass, so we cannot
6172 do a simple search as in the code below. Instead, we have to scan the
6173 pattern to find the number. It is important that we scan it only as
6174 far as we have got because the syntax of named subpatterns has not
6175 been checked for the rest of the pattern, and find_parens() assumes
6176 correct syntax. In any case, it's a waste of resources to scan
6177 further. We stop the scan at the current point by temporarily
6178 adjusting the value of cd->end_pattern. */
6179
6180 temp = cd->end_pattern;
6181 cd->end_pattern = ptr;
6182 recno = find_parens(cd, name, namelen,
6183 (options & PCRE_EXTENDED) != 0, utf);
6184 cd->end_pattern = temp;
6185 if (recno < 0) recno = 0; /* Forward ref; set dummy number */
6186 }
6187
6188 /* In the real compile, seek the name in the table. We check the name
6189 first, and then check that we have reached the end of the name in the
6190 table. That way, if the name is longer than any in the table,
6191 the comparison will fail without reading beyond the table entry. */
6192
6193 else
6194 {
6195 slot = cd->name_table;
6196 for (i = 0; i < cd->names_found; i++)
6197 {
6198 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
6199 slot[IMM2_SIZE+namelen] == 0)
6200 break;
6201 slot += cd->name_entry_size;
6202 }
6203
6204 if (i < cd->names_found) /* Back reference */
6205 {
6206 recno = GET2(slot, 0);
6207 }
6208 else if ((recno = /* Forward back reference */
6209 find_parens(cd, name, namelen,
6210 (options & PCRE_EXTENDED) != 0, utf)) <= 0)
6211 {
6212 *errorcodeptr = ERR15;
6213 goto FAILED;
6214 }
6215 }
6216
6217 /* In both phases, we can now go to the code that handles numerical
6218 recursion or backreferences. */
6219
6220 if (is_recurse) goto HANDLE_RECURSION;
6221 else goto HANDLE_REFERENCE;
6222
6223
6224 /* ------------------------------------------------------------ */
6225 case CHAR_R: /* Recursion */
6226 ptr++; /* Same as (?0) */
6227 /* Fall through */
6228
6229
6230 /* ------------------------------------------------------------ */
6231 case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */
6232 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
6233 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
6234 {
6235 const pcre_uchar *called;
6236 terminator = CHAR_RIGHT_PARENTHESIS;
6237
6238 /* Come here from the \g<...> and \g'...' code (Oniguruma
6239 compatibility). However, the syntax has been checked to ensure that
6240 the ... are a (signed) number, so that neither ERR63 nor ERR29 will
6241 be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
6242 ever be taken. */
6243
6244 HANDLE_NUMERICAL_RECURSION:
6245
6246 if ((refsign = *ptr) == CHAR_PLUS)
6247 {
6248 ptr++;
6249 if (!IS_DIGIT(*ptr))
6250 {
6251 *errorcodeptr = ERR63;
6252 goto FAILED;
6253 }
6254 }
6255 else if (refsign == CHAR_MINUS)
6256 {
6257 if (!IS_DIGIT(ptr[1]))
6258 goto OTHER_CHAR_AFTER_QUERY;
6259 ptr++;
6260 }
6261
6262 recno = 0;
6263 while(IS_DIGIT(*ptr))
6264 recno = recno * 10 + *ptr++ - CHAR_0;
6265
6266 if (*ptr != terminator)
6267 {
6268 *errorcodeptr = ERR29;
6269 goto FAILED;
6270 }
6271
6272 if (refsign == CHAR_MINUS)
6273 {
6274 if (recno == 0)
6275 {
6276 *errorcodeptr = ERR58;
6277 goto FAILED;
6278 }
6279 recno = cd->bracount - recno + 1;
6280 if (recno <= 0)
6281 {
6282 *errorcodeptr = ERR15;
6283 goto FAILED;
6284 }
6285 }
6286 else if (refsign == CHAR_PLUS)
6287 {
6288 if (recno == 0)
6289 {
6290 *errorcodeptr = ERR58;
6291 goto FAILED;
6292 }
6293 recno += cd->bracount;
6294 }
6295
6296 /* Come here from code above that handles a named recursion */
6297
6298 HANDLE_RECURSION:
6299
6300 previous = code;
6301 called = cd->start_code;
6302
6303 /* When we are actually compiling, find the bracket that is being
6304 referenced. Temporarily end the regex in case it doesn't exist before
6305 this point. If we end up with a forward reference, first check that
6306 the bracket does occur later so we can give the error (and position)
6307 now. Then remember this forward reference in the workspace so it can
6308 be filled in at the end. */
6309
6310 if (lengthptr == NULL)
6311 {
6312 *code = OP_END;
6313 if (recno != 0)
6314 called = PRIV(find_bracket)(cd->start_code, utf, recno);
6315
6316 /* Forward reference */
6317
6318 if (called == NULL)
6319 {
6320 if (find_parens(cd, NULL, recno,
6321 (options & PCRE_EXTENDED) != 0, utf) < 0)
6322 {
6323 *errorcodeptr = ERR15;
6324 goto FAILED;
6325 }
6326
6327 /* Fudge the value of "called" so that when it is inserted as an
6328 offset below, what is actually inserted is the reference number
6329 of the group. Then remember the forward reference. */
6330
6331 called = cd->start_code + recno;
6332 if (cd->hwm >= cd->start_workspace + cd->workspace_size -
6333 WORK_SIZE_SAFETY_MARGIN)
6334 {
6335 *errorcodeptr = expand_workspace(cd);
6336 if (*errorcodeptr != 0) goto FAILED;
6337 }
6338 PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
6339 }
6340
6341 /* If not a forward reference, and the subpattern is still open,
6342 this is a recursive call. We check to see if this is a left
6343 recursion that could loop for ever, and diagnose that case. We
6344 must not, however, do this check if we are in a conditional
6345 subpattern because the condition might be testing for recursion in
6346 a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid.
6347 Forever loops are also detected at runtime, so those that occur in
6348 conditional subpatterns will be picked up then. */
6349
6350 else if (GET(called, 1) == 0 && cond_depth <= 0 &&
6351 could_be_empty(called, code, bcptr, utf, cd))
6352 {
6353 *errorcodeptr = ERR40;
6354 goto FAILED;
6355 }
6356 }
6357
6358 /* Insert the recursion/subroutine item. It does not have a set first
6359 character (relevant if it is repeated, because it will then be
6360 wrapped with ONCE brackets). */
6361
6362 *code = OP_RECURSE;
6363 PUT(code, 1, (int)(called - cd->start_code));
6364 code += 1 + LINK_SIZE;
6365 groupsetfirstchar = FALSE;
6366 }
6367
6368 /* Can't determine a first byte now */
6369
6370 if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
6371 continue;
6372
6373
6374 /* ------------------------------------------------------------ */
6375 default: /* Other characters: check option setting */
6376 OTHER_CHAR_AFTER_QUERY:
6377 set = unset = 0;
6378 optset = &set;
6379
6380 while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
6381 {
6382 switch (*ptr++)
6383 {
6384 case CHAR_MINUS: optset = &unset; break;
6385
6386 case CHAR_J: /* Record that it changed in the external options */
6387 *optset |= PCRE_DUPNAMES;
6388 cd->external_flags |= PCRE_JCHANGED;
6389 break;
6390
6391 case CHAR_i: *optset |= PCRE_CASELESS; break;
6392 case CHAR_m: *optset |= PCRE_MULTILINE; break;
6393 case CHAR_s: *optset |= PCRE_DOTALL; break;
6394 case CHAR_x: *optset |= PCRE_EXTENDED; break;
6395 case CHAR_U: *optset |= PCRE_UNGREEDY; break;
6396 case CHAR_X: *optset |= PCRE_EXTRA; break;
6397
6398 default: *errorcodeptr = ERR12;
6399 ptr--; /* Correct the offset */
6400 goto FAILED;
6401 }
6402 }
6403
6404 /* Set up the changed option bits, but don't change anything yet. */
6405
6406 newoptions = (options | set) & (~unset);
6407
6408 /* If the options ended with ')' this is not the start of a nested
6409 group with option changes, so the options change at this level. If this
6410 item is right at the start of the pattern, the options can be
6411 abstracted and made external in the pre-compile phase, and ignored in
6412 the compile phase. This can be helpful when matching -- for instance in
6413 caseless checking of required bytes.
6414
6415 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
6416 definitely *not* at the start of the pattern because something has been
6417 compiled. In the pre-compile phase, however, the code pointer can have
6418 that value after the start, because it gets reset as code is discarded
6419 during the pre-compile. However, this can happen only at top level - if
6420 we are within parentheses, the starting BRA will still be present. At
6421 any parenthesis level, the length value can be used to test if anything
6422 has been compiled at that level. Thus, a test for both these conditions
6423 is necessary to ensure we correctly detect the start of the pattern in
6424 both phases.
6425
6426 If we are not at the pattern start, reset the greedy defaults and the
6427 case value for firstchar and reqchar. */
6428
6429 if (*ptr == CHAR_RIGHT_PARENTHESIS)
6430 {
6431 if (code == cd->start_code + 1 + LINK_SIZE &&
6432 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
6433 {
6434 cd->external_options = newoptions;
6435 }
6436 else
6437 {
6438 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
6439 greedy_non_default = greedy_default ^ 1;
6440 req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
6441 }
6442
6443 /* Change options at this level, and pass them back for use
6444 in subsequent branches. */
6445
6446 *optionsptr = options = newoptions;
6447 previous = NULL; /* This item can't be repeated */
6448 continue; /* It is complete */
6449 }
6450
6451 /* If the options ended with ':' we are heading into a nested group
6452 with possible change of options. Such groups are non-capturing and are
6453 not assertions of any kind. All we need to do is skip over the ':';
6454 the newoptions value is handled below. */
6455
6456 bravalue = OP_BRA;
6457 ptr++;
6458 } /* End of switch for character following (? */
6459 } /* End of (? handling */
6460
6461 /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
6462 is set, all unadorned brackets become non-capturing and behave like (?:...)
6463 brackets. */
6464
6465 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
6466 {
6467 bravalue = OP_BRA;
6468 }
6469
6470 /* Else we have a capturing group. */
6471
6472 else
6473 {
6474 NUMBERED_GROUP:
6475 cd->bracount += 1;
6476 PUT2(code, 1+LINK_SIZE, cd->bracount);
6477 skipbytes = IMM2_SIZE;
6478 }
6479
6480 /* Process nested bracketed regex. Assertions used not to be repeatable,
6481 but this was changed for Perl compatibility, so all kinds can now be
6482 repeated. We copy code into a non-register variable (tempcode) in order to
6483 be able to pass its address because some compilers complain otherwise. */
6484
6485 previous = code; /* For handling repetition */
6486 *code = bravalue;
6487 tempcode = code;
6488 tempreqvary = cd->req_varyopt; /* Save value before bracket */
6489 tempbracount = cd->bracount; /* Save value before bracket */
6490 length_prevgroup = 0; /* Initialize for pre-compile phase */
6491
6492 if (!compile_regex(
6493 newoptions, /* The complete new option state */
6494 &tempcode, /* Where to put code (updated) */
6495 &ptr, /* Input pointer (updated) */
6496 errorcodeptr, /* Where to put an error message */
6497 (bravalue == OP_ASSERTBACK ||
6498 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
6499 reset_bracount, /* True if (?| group */
6500 skipbytes, /* Skip over bracket number */
6501 cond_depth +
6502 ((bravalue == OP_COND)?1:0), /* Depth of condition subpatterns */
6503 &subfirstchar, /* For possible first char */
6504 &subreqchar, /* For possible last char */
6505 bcptr, /* Current branch chain */
6506 cd, /* Tables block */
6507 (lengthptr == NULL)? NULL : /* Actual compile phase */
6508 &length_prevgroup /* Pre-compile phase */
6509 ))
6510 goto FAILED;
6511
6512 /* If this was an atomic group and there are no capturing groups within it,
6513 generate OP_ONCE_NC instead of OP_ONCE. */
6514
6515 if (bravalue == OP_ONCE && cd->bracount <= tempbracount)
6516 *code = OP_ONCE_NC;
6517
6518 if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
6519 cd->assert_depth -= 1;
6520
6521 /* At the end of compiling, code is still pointing to the start of the
6522 group, while tempcode has been updated to point past the end of the group.
6523 The pattern pointer (ptr) is on the bracket.
6524
6525 If this is a conditional bracket, check that there are no more than
6526 two branches in the group, or just one if it's a DEFINE group. We do this
6527 in the real compile phase, not in the pre-pass, where the whole group may
6528 not be available. */
6529
6530 if (bravalue == OP_COND && lengthptr == NULL)
6531 {
6532 pcre_uchar *tc = code;
6533 int condcount = 0;
6534
6535 do {
6536 condcount++;
6537 tc += GET(tc,1);
6538 }
6539 while (*tc != OP_KET);
6540
6541 /* A DEFINE group is never obeyed inline (the "condition" is always
6542 false). It must have only one branch. */
6543
6544 if (code[LINK_SIZE+1] == OP_DEF)
6545 {
6546 if (condcount > 1)
6547 {
6548 *errorcodeptr = ERR54;
6549 goto FAILED;
6550 }
6551 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
6552 }
6553
6554 /* A "normal" conditional group. If there is just one branch, we must not
6555 make use of its firstchar or reqchar, because this is equivalent to an
6556 empty second branch. */
6557
6558 else
6559 {
6560 if (condcount > 2)
6561 {
6562 *errorcodeptr = ERR27;
6563 goto FAILED;
6564 }
6565 if (condcount == 1) subfirstchar = subreqchar = REQ_NONE;
6566 }
6567 }
6568
6569 /* Error if hit end of pattern */
6570
6571 if (*ptr != CHAR_RIGHT_PARENTHESIS)
6572 {
6573 *errorcodeptr = ERR14;
6574 goto FAILED;
6575 }
6576
6577 /* In the pre-compile phase, update the length by the length of the group,
6578 less the brackets at either end. Then reduce the compiled code to just a
6579 set of non-capturing brackets so that it doesn't use much memory if it is
6580 duplicated by a quantifier.*/
6581
6582 if (lengthptr != NULL)
6583 {
6584 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6585 {
6586 *errorcodeptr = ERR20;
6587 goto FAILED;
6588 }
6589 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6590 code++; /* This already contains bravalue */
6591 PUTINC(code, 0, 1 + LINK_SIZE);
6592 *code++ = OP_KET;
6593 PUTINC(code, 0, 1 + LINK_SIZE);
6594 break; /* No need to waste time with special character handling */
6595 }
6596
6597 /* Otherwise update the main code pointer to the end of the group. */
6598
6599 code = tempcode;
6600
6601 /* For a DEFINE group, required and first character settings are not
6602 relevant. */
6603
6604 if (bravalue == OP_DEF) break;
6605
6606 /* Handle updating of the required and first characters for other types of
6607 group. Update for normal brackets of all kinds, and conditions with two
6608 branches (see code above). If the bracket is followed by a quantifier with
6609 zero repeat, we have to back off. Hence the definition of zeroreqchar and
6610 zerofirstchar outside the main loop so that they can be accessed for the
6611 back off. */
6612
6613 zeroreqchar = reqchar;
6614 zerofirstchar = firstchar;
6615 groupsetfirstchar = FALSE;
6616
6617 if (bravalue >= OP_ONCE)
6618 {
6619 /* If we have not yet set a firstchar in this branch, take it from the
6620 subpattern, remembering that it was set here so that a repeat of more
6621 than one can replicate it as reqchar if necessary. If the subpattern has
6622 no firstchar, set "none" for the whole branch. In both cases, a zero
6623 repeat forces firstchar to "none". */
6624
6625 if (firstchar == REQ_UNSET)
6626 {
6627 if (subfirstchar >= 0)
6628 {
6629 firstchar = subfirstchar;
6630 groupsetfirstchar = TRUE;
6631 }
6632 else firstchar = REQ_NONE;
6633 zerofirstchar = REQ_NONE; <