/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1379 - (show annotations)
Mon Oct 14 13:54:07 2013 UTC (6 years, 2 months ago) by ph10
File MIME type: text/plain
File size: 296643 byte(s)
Error occurred while calculating annotation data.
More auto-possessification additions, using possessive class repeats. These are 
not yet used for explicit possessification.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2013 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
/* NOTE(review): all three are defined ahead of the pcre_internal.h include that
follows, so they are presumably consumed by macros in that header - confirm. */
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
57 is also used by pcretest. PCRE_DEBUG is not defined when building a production
58 library. We do not need to select pcre16_printint.c specially, because the
59 COMPILE_PCREx macro will already be appropriately set. */
60
61 #ifdef PCRE_DEBUG
62 /* pcre_printint.c should not include any headers */
63 #define PCRE_INCLUDED
64 #include "pcre_printint.c"
65 #undef PCRE_INCLUDED
66 #endif
67
68
/* Macro for setting individual bits in class bitmaps: sets bit number b in the
byte array a, eight bits per byte, least significant bit first. The array
argument and the whole expansion are parenthesized so that an expression
argument such as SETBIT(map + 32, c) indexes the intended object and the macro
can be used inside a larger expression. Note that b is evaluated twice, so it
must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)&7)))
72
73 /* Maximum length value to check against when making sure that the integer that
74 holds the compiled pattern length does not overflow. We make it a bit less than
75 INT_MAX to allow for adding in group terminating bytes, so that we don't have
76 to check them every time. */
77
/* The 20 subtracted here is the allowance for group terminating bytes.
NOTE(review): INT_MAX is presumably supplied through pcre_internal.h - confirm. */
78 #define OFLOW_MAX (INT_MAX - 20)
79
80 /* Definitions to allow mutual recursion */
81
/* Prototype only: the definition of add_list_to_class() is not in this part
of the file. */
82 static int
83 add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
84 const pcre_uint32 *, unsigned int);
85
/* Prototype only: the definition of compile_regex() is not in this part of
the file. */
86 static BOOL
87 compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
88 pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
89 compile_data *, int *);
90
91
92
93 /*************************************************
94 * Code parameters and static tables *
95 *************************************************/
96
97 /* This value specifies the size of stack workspace that is used during the
98 first pre-compile phase that determines how much memory is required. The regex
99 is partly compiled into this space, but the compiled parts are discarded as
100 soon as they can be, so that hopefully there will never be an overrun. The code
101 does, however, check for an overrun. The largest amount I've seen used is 218,
102 so this number is very generous.
103
104 The same workspace is used during the second, actual compile phase for
105 remembering forward references to groups so that they can be filled in at the
106 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
107 is 4 there is plenty of room for most patterns. However, the memory can get
108 filled up by repetitions of forward references, for example patterns like
109 /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
110 that the workspace is expanded using malloc() in this situation. The value
111 below is therefore a minimum, and we put a maximum on it for safety. The
112 minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
113 kicks in at the same number of forward references in all cases. */
114
115 #define COMPILE_WORK_SIZE (2048*LINK_SIZE) /* Minimum workspace size */
116 #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE) /* Safety maximum: 100 times the minimum */
117
118 /* This value determines the size of the initial vector that is used for
119 remembering named groups during the pre-compile. It is allocated on the stack,
120 but if it is too small, it is expanded using malloc(), in a similar way to the
121 workspace. The value is the number of slots in the list. */
122
123 #define NAMED_GROUP_LIST_SIZE 20 /* Initial number of slots */
124
125 /* The overrun tests check for a slightly smaller size so that they detect the
126 overrun before it actually does run off the end of the data block. */
127
128 #define WORK_SIZE_SAFETY_MARGIN (100) /* Units are not visible here - TODO confirm at the point of use */
129
130 /* Private flags added to firstchar and reqchar. */
131
132 #define REQ_CASELESS (1 << 0) /* Indicates caselessness */
133 #define REQ_VARY (1 << 1) /* Reqchar followed non-literal item */
134 /* Negative values for the firstchar and reqchar flags */
/* Being negative, these two values cannot be confused with any OR-ed
combination of the two single-bit flags above. */
135 #define REQ_UNSET (-2)
136 #define REQ_NONE (-1)
137
138 /* Repeated character flags. */
139
/* Single flag bit (bit 28) of type long. The suffix is written as an
upper-case 'L' because a lower-case 'l' is easily misread as the digit 1; the
value and type of the constant are unchanged. */
#define UTF_LENGTH 0x10000000L /* The char contains its length. */
141
142 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
143 are simple data values; negative values are for special things like \d and so
144 on. Zero means further processing is needed (for things like \x), or the escape
145 is invalid. */
146
147 #ifndef EBCDIC
148
149 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
150 in UTF-8 mode. */
151
/* Entries are laid out in character-code order starting at '0', two per line;
the trailing comment on each line names the two characters that line covers. */
152 static const short int escapes[] = {
153 0, 0, /* 0 1 */
154 0, 0, /* 2 3 */
155 0, 0, /* 4 5 */
156 0, 0, /* 6 7 */
157 0, 0, /* 8 9 */
158 CHAR_COLON, CHAR_SEMICOLON, /* : ; */
159 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, /* < = */
160 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK, /* > ? */
161 CHAR_COMMERCIAL_AT, -ESC_A, /* @ A */
162 -ESC_B, -ESC_C, /* B C */
163 -ESC_D, -ESC_E, /* D E */
164 0, -ESC_G, /* F G */
165 -ESC_H, 0, /* H I */
166 0, -ESC_K, /* J K */
167 0, 0, /* L M */
168 -ESC_N, 0, /* N O */
169 -ESC_P, -ESC_Q, /* P Q */
170 -ESC_R, -ESC_S, /* R S */
171 0, 0, /* T U */
172 -ESC_V, -ESC_W, /* V W */
173 -ESC_X, 0, /* X Y */
174 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, /* Z [ */
175 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, /* \ ] */
176 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, /* ^ _ */
177 CHAR_GRAVE_ACCENT, 7, /* ` a (7 is a literal data value) */
178 -ESC_b, 0, /* b c */
179 -ESC_d, ESC_e, /* d e */
180 ESC_f, 0, /* f g */
181 -ESC_h, 0, /* h i */
182 0, -ESC_k, /* j k */
183 0, 0, /* l m */
184 ESC_n, 0, /* n o */
185 -ESC_p, 0, /* p q */
186 ESC_r, -ESC_s, /* r s */
187 ESC_tee, 0, /* t u */
188 -ESC_v, -ESC_w, /* v w */
189 0, 0, /* x y */
190 -ESC_z /* z */
191 };
192
193 #else
194
195 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
196
/* Eight entries per row. NOTE(review): the hexadecimal comment that starts
each row is presumably the EBCDIC code of the row's first entry, which would
make 0x48 the first code covered - confirm against the code that indexes this
table. */
197 static const short int escapes[] = {
198 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
199 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
200 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
201 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
202 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
203 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
204 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
205 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
206 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
207 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
208 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
209 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
210 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
211 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
212 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
213 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
214 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
215 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
216 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
217 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
218 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
219 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
220 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
221 };
222 #endif
223
224
225 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
226 searched linearly. Put all the names into a single string, in order to reduce
227 the number of relocations when a shared library is dynamically linked. The
228 string is built from string macros so that it works in UTF-8 mode on EBCDIC
229 platforms. */
230
/* One entry of the verb table: the length of the verb's name plus the opcode
to use without an argument and the opcode to use with one. */
231 typedef struct verbitem {
232 int len; /* Length of verb name */
233 int op; /* Op when no arg, or -1 if arg mandatory */
234 int op_arg; /* Op when arg present, or -1 if not allowed */
235 } verbitem;
236
/* All verb names concatenated into one string, in the same order as the
entries of verbs[]. NOTE(review): the STRING_xxx0 macros presumably each end
with an embedded NUL, with the final name terminated by the literal's own NUL -
confirm in pcre_internal.h. */
237 static const char verbnames[] =
238 "\0" /* Empty name is a shorthand for MARK */
239 STRING_MARK0
240 STRING_ACCEPT0
241 STRING_COMMIT0
242 STRING_F0
243 STRING_FAIL0
244 STRING_PRUNE0
245 STRING_SKIP0
246 STRING_THEN;
247
/* One { len, op, op_arg } entry per name in verbnames, in the same order; the
trailing comments give the verb each entry corresponds to. */
248 static const verbitem verbs[] = {
249 { 0, -1, OP_MARK }, /* empty name (shorthand for MARK): argument mandatory */
250 { 4, -1, OP_MARK }, /* MARK: argument mandatory */
251 { 6, OP_ACCEPT, -1 }, /* ACCEPT: argument not allowed */
252 { 6, OP_COMMIT, -1 }, /* COMMIT: argument not allowed */
253 { 1, OP_FAIL, -1 }, /* F: argument not allowed */
254 { 4, OP_FAIL, -1 }, /* FAIL: argument not allowed */
255 { 5, OP_PRUNE, OP_PRUNE_ARG }, /* PRUNE: argument optional */
256 { 4, OP_SKIP, OP_SKIP_ARG }, /* SKIP: argument optional */
257 { 4, OP_THEN, OP_THEN_ARG } /* THEN: argument optional */
258 };
259
260 static const int verbcount = sizeof(verbs)/sizeof(verbitem); /* Number of entries in verbs[] */
261
262
263 /* Tables of names of POSIX character classes and their lengths. The names are
264 now all in a single string, to reduce the number of relocations when a shared
265 library is dynamically loaded. The list of lengths is terminated by a zero
266 length entry. The first three must be alpha, lower, upper, as this is assumed
267 for handling case independence. */
268
/* Fourteen class names in one string. NOTE(review): the STRING_xxx0 macros
presumably each end with an embedded NUL - confirm in pcre_internal.h. */
269 static const char posix_names[] =
270 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
271 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
272 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
273 STRING_word0 STRING_xdigit;
274
/* Lengths of the names in posix_names, in the same order: twelve five-letter
names (alpha ... space), then word (4), xdigit (6), and the terminating 0. */
275 static const pcre_uint8 posix_name_lengths[] = {
276 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
277
278 /* Table of class bit maps for each POSIX class. Each class is formed from a
279 base map, with an optional addition or removal of another map. Then, for some
280 classes, there is some additional tweaking: for [:blank:] the vertical space
281 characters are removed, and for [:alpha:] and [:alnum:] the underscore
282 character is removed. The triples in the table consist of the base map offset,
283 second map offset or -1 if no second map, and a non-negative value for map
284 addition or a negative value for map subtraction (if there are two maps). The
285 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
286 remove vertical space characters, 2 => remove underscore. */
287
/* Three consecutive ints per class (base map, second map or -1, add/subtract
and tweak code), with the classes in the same order as the names in
posix_names. */
288 static const int posix_class_maps[] = {
289 cbit_word, cbit_digit, -2, /* alpha - word minus digit, underscore removed */
290 cbit_lower, -1, 0, /* lower */
291 cbit_upper, -1, 0, /* upper */
292 cbit_word, -1, 2, /* alnum - word without underscore */
293 cbit_print, cbit_cntrl, 0, /* ascii - print plus cntrl */
294 cbit_space, -1, 1, /* blank - a GNU extension; space with vertical space removed */
295 cbit_cntrl, -1, 0, /* cntrl */
296 cbit_digit, -1, 0, /* digit */
297 cbit_graph, -1, 0, /* graph */
298 cbit_print, -1, 0, /* print */
299 cbit_punct, -1, 0, /* punct */
300 cbit_space, -1, 0, /* space */
301 cbit_word, -1, 0, /* word - a Perl extension */
302 cbit_xdigit,-1, 0 /* xdigit */
303 };
304
305 /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
306 substitutes must be in the order of the names, defined above, and there are
307 both positive and negative cases. NULL means no substitute. */
308
309 #ifdef SUPPORT_UCP
/* Pattern text "\P{Nd}" */
310 static const pcre_uchar string_PNd[] = {
311 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
312 CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* Pattern text "\p{Nd}" */
313 static const pcre_uchar string_pNd[] = {
314 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
315 CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* Pattern text "\P{Xsp}" */
316 static const pcre_uchar string_PXsp[] = {
317 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
318 CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* Pattern text "\p{Xsp}" */
319 static const pcre_uchar string_pXsp[] = {
320 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
321 CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* Pattern text "\P{Xwd}" */
322 static const pcre_uchar string_PXwd[] = {
323 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
324 CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* Pattern text "\p{Xwd}" */
325 static const pcre_uchar string_pXwd[] = {
326 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
327 CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
328
/* Replacement pattern text for each of the six escapes named in the trailing
comments, negated form first within each pair. */
329 static const pcre_uchar *substitutes[] = {
330 string_PNd, /* \D */
331 string_pNd, /* \d */
332 string_PXsp, /* \S */ /* NOTE: Xsp is Perl space */
333 string_pXsp, /* \s */
334 string_PXwd, /* \W */
335 string_pXwd /* \w */
336 };
337
/* Pattern text "\p{L}" */
338 static const pcre_uchar string_pL[] = {
339 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
340 CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* Pattern text "\p{Ll}" */
341 static const pcre_uchar string_pLl[] = {
342 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
343 CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* Pattern text "\p{Lu}" */
344 static const pcre_uchar string_pLu[] = {
345 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
346 CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* Pattern text "\p{Xan}" */
347 static const pcre_uchar string_pXan[] = {
348 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
349 CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* Pattern text "\h" */
350 static const pcre_uchar string_h[] = {
351 CHAR_BACKSLASH, CHAR_h, '\0' };
/* Pattern text "\p{Xps}" */
352 static const pcre_uchar string_pXps[] = {
353 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
354 CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* Pattern text "\P{L}" */
355 static const pcre_uchar string_PL[] = {
356 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
357 CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* Pattern text "\P{Ll}" */
358 static const pcre_uchar string_PLl[] = {
359 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
360 CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* Pattern text "\P{Lu}" */
361 static const pcre_uchar string_PLu[] = {
362 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
363 CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* Pattern text "\P{Xan}" */
364 static const pcre_uchar string_PXan[] = {
365 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
366 CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* Pattern text "\H" */
367 static const pcre_uchar string_H[] = {
368 CHAR_BACKSLASH, CHAR_H, '\0' };
/* Pattern text "\P{Xps}" */
369 static const pcre_uchar string_PXps[] = {
370 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
371 CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
372
/* Fourteen positive entries in the order of the POSIX class names, followed by
the fourteen negated entries in the same order; NULL marks a class that has no
substitute. */
373 static const pcre_uchar *posix_substitutes[] = {
374 string_pL, /* alpha */
375 string_pLl, /* lower */
376 string_pLu, /* upper */
377 string_pXan, /* alnum */
378 NULL, /* ascii */
379 string_h, /* blank */
380 NULL, /* cntrl */
381 string_pNd, /* digit */
382 NULL, /* graph */
383 NULL, /* print */
384 NULL, /* punct */
385 string_pXps, /* space */ /* NOTE: Xps is POSIX space */
386 string_pXwd, /* word */
387 NULL, /* xdigit */
388 /* Negated cases */
389 string_PL, /* ^alpha */
390 string_PLl, /* ^lower */
391 string_PLu, /* ^upper */
392 string_PXan, /* ^alnum */
393 NULL, /* ^ascii */
394 string_H, /* ^blank */
395 NULL, /* ^cntrl */
396 string_PNd, /* ^digit */
397 NULL, /* ^graph */
398 NULL, /* ^print */
399 NULL, /* ^punct */
400 string_PXps, /* ^space */ /* NOTE: Xps is POSIX space */
401 string_PXwd, /* ^word */
402 NULL /* ^xdigit */
403 };
/* Number of entries in posix_substitutes (positive plus negated cases). */
404 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
405 #endif
406
/* Two-level stringification: STRING(a) turns its argument into a string
literal as written, while XSTRING(s) lets a macro argument be expanded first,
so XSTRING applied to a macro name yields the text of that macro's value rather
than the name itself. */
407 #define STRING(a) # a
408 #define XSTRING(s) STRING(s)
409
410 /* The texts of compile-time error messages. These are "char *" because they
411 are passed to the outside world. Do not ever re-use any error number, because
412 they are documented. Always add a new error instead. Messages marked DEAD below
413 are no longer used. This used to be a table of strings, but in order to reduce
414 the number of relocations needed when a shared library is loaded dynamically,
415 it is now one long string. We cannot use a table of offsets, because the
416 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
417 simply count through to the one we want - this isn't a performance issue
418 because these strings are used only when there is a compilation error.
419
420 Each substring ends with \0 to insert a null character. This includes the final
421 substring, so that the whole string ends with \0\0, which can be detected when
422 counting through. */
423
/* One NUL-terminated message per error number, concatenated in numerical
order starting from 0; the numeric comments mark every fifth entry. */
424 static const char error_texts[] =
425 "no error\0"
426 "\\ at end of pattern\0"
427 "\\c at end of pattern\0"
428 "unrecognized character follows \\\0"
429 "numbers out of order in {} quantifier\0"
430 /* 5 */
431 "number too big in {} quantifier\0"
432 "missing terminating ] for character class\0"
433 "invalid escape sequence in character class\0"
434 "range out of order in character class\0"
435 "nothing to repeat\0"
436 /* 10 */
437 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
438 "internal error: unexpected repeat\0"
439 "unrecognized character after (? or (?-\0"
440 "POSIX named classes are supported only within a class\0"
441 "missing )\0"
442 /* 15 */
443 "reference to non-existent subpattern\0"
444 "erroffset passed as NULL\0"
445 "unknown option bit(s) set\0"
446 "missing ) after comment\0"
447 "parentheses nested too deeply\0" /** DEAD **/
448 /* 20 */
449 "regular expression is too large\0"
450 "failed to get memory\0"
451 "unmatched parentheses\0"
452 "internal error: code overflow\0"
453 "unrecognized character after (?<\0"
454 /* 25 */
455 "lookbehind assertion is not fixed length\0"
456 "malformed number or name after (?(\0"
457 "conditional group contains more than two branches\0"
458 "assertion expected after (?(\0"
459 "(?R or (?[+-]digits must be followed by )\0"
460 /* 30 */
461 "unknown POSIX class name\0"
462 "POSIX collating elements are not supported\0"
463 "this version of PCRE is compiled without UTF support\0"
464 "spare error\0" /** DEAD **/
465 "character value in \\x{} or \\o{} is too large\0"
466 /* 35 */
467 "invalid condition (?(0)\0"
468 "\\C not allowed in lookbehind assertion\0"
469 "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
470 "number after (?C is > 255\0"
471 "closing ) for (?C expected\0"
472 /* 40 */
473 "recursive call could loop indefinitely\0"
474 "unrecognized character after (?P\0"
475 "syntax error in subpattern name (missing terminator)\0"
476 "two named subpatterns have the same name\0"
477 "invalid UTF-8 string\0"
478 /* 45 */
479 "support for \\P, \\p, and \\X has not been compiled\0"
480 "malformed \\P or \\p sequence\0"
481 "unknown property name after \\P or \\p\0"
482 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
483 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
484 /* 50 */
485 "repeated subpattern is too long\0" /** DEAD **/
486 "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
487 "internal error: overran compiling workspace\0"
488 "internal error: previously-checked referenced subpattern not found\0"
489 "DEFINE group contains more than one branch\0"
490 /* 55 */
491 "repeating a DEFINE group is not allowed\0" /** DEAD **/
492 "inconsistent NEWLINE options\0"
493 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
494 "a numbered reference must not be zero\0"
495 "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
496 /* 60 */
497 "(*VERB) not recognized or malformed\0"
498 "number is too big\0"
499 "subpattern name expected\0"
500 "digit expected after (?+\0"
501 "] is an invalid data character in JavaScript compatibility mode\0"
502 /* 65 */
503 "different names for subpatterns of the same number are not allowed\0"
504 "(*MARK) must have an argument\0"
505 "this version of PCRE is not compiled with Unicode property support\0"
506 "\\c must be followed by an ASCII character\0"
507 "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
508 /* 70 */
509 "internal error: unknown opcode in find_fixedlength()\0"
510 "\\N is not supported in a class\0"
511 "too many forward references\0"
512 "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
513 "invalid UTF-16 string\0"
514 /* 75 */
515 "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
516 "character value in \\u.... sequence is too large\0"
517 "invalid UTF-32 string\0"
518 "setting UTF is disabled by the application\0"
519 "non-hex character in \\x{} (closing brace missing?)\0"
520 /* 80 */
521 "non-octal character in \\o{} (closing brace missing?)\0"
522 "missing opening brace after \\o\0"
/* The explicit \0 that ends the last message, together with the terminating
NUL that the compiler appends to the string literal, forms the \0\0 end marker. */
523 ;
524
525 /* Table to identify digits and hex digits. This is used when compiling
526 patterns. Note that the tables in chartables are dependent on the locale, and
527 may mark arbitrary characters as digits - but the PCRE compiling code expects
528 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
529 a private table here. It costs 256 bytes, but it is a lot faster than doing
530 character value tests (at least in some simple cases I timed), and in some
531 applications one wants PCRE to compile efficiently as well as match
532 efficiently.
533
534 For convenience, we use the same bit definitions as in chartables:
535
536 0x04 decimal digit
537 0x08 hexadecimal digit
538
539 Then we can use ctype_digit and ctype_xdigit in the code. */
540
541 /* Using a simple comparison for decimal numbers rather than a memory read
542 is much faster, and the resulting code is simpler (the compiler turns it
543 into a subtraction and unsigned comparison). */
544
/* Note that x appears twice in the expansion, so the argument must not have
side effects. */
545 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
546
547 #ifndef EBCDIC
548
549 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
550 UTF-8 mode. */
551
/* 256 entries, one per character value. Each entry is a combination of 0x04
(decimal digit) and 0x08 (hexadecimal digit); 0x00 means neither. */
552 static const pcre_uint8 digitab[] =
553 {
554 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
555 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
556 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
557 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
558 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
559 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
/* '0' to '9': 0x0c marks a character as both a decimal and a hex digit */
560 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
561 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
/* 'A' to 'F': hexadecimal digit only */
562 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
563 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
564 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
565 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
/* 'a' to 'f': hexadecimal digit only */
566 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
567 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
568 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
569 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
570 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
571 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
572 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
573 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
574 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
575 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
576 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
577 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
578 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
579 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
580 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
581 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
582 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
583 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
584 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
585 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
586
587 #else
588
589 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
590
/* 256 entries, one per character value. Each entry is a combination of 0x04
(decimal digit) and 0x08 (hexadecimal digit); 0x00 means neither. The last
column of the row comments gives the hexadecimal offset of every other row. */
591 static const pcre_uint8 digitab[] =
592 {
593 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
594 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
595 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
596 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
597 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
598 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
599 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
600 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
603 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
604 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
605 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
606 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
607 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
608 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
/* Six lower-case hexadecimal letters: hexadecimal digit only */
609 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
610 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
611 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
612 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
613 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
614 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
615 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
616 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
/* Six upper-case hexadecimal letters: hexadecimal digit only */
617 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
618 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
619 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
620 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
621 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
622 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
/* Ten decimal digits: 0x0c marks both decimal and hexadecimal digit */
623 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
624 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
625
/* 256 entries, one per EBCDIC character value. NOTE(review): presumably this
uses the ctype bit layout of the chartables character-type table (0x04 decimal
digit and 0x08 hexadecimal digit are documented in this file; the meaning of
the 0x01, 0x02, 0x10 and 0x80 bits is not visible here) - confirm against
pcre_internal.h. */
626 static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
627 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
628 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
629 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
630 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
631 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
632 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
633 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
634 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
635 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
636 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
637 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
638 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
639 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
640 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
641 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
642 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
643 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
644 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
645 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
646 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
647 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
648 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
649 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
650 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
651 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
652 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
653 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
654 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
655 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
656 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
657 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
658 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
659 #endif
660
661
662 /* This table is used to check whether auto-possessification is possible
663 between adjacent character-type opcodes. The left-hand (repeated) opcode is
664 used to select the row, and the right-hand opcode is used to select the column.
665 A value of 1 means that auto-possessification is OK. For example, the second
666 value in the first row means that \D+\d can be turned into \D++\d.
667
668 The Unicode property types (\P and \p) have to be present to fill out the table
669 because of what their opcode values are, but the table values should always be
670 zero because property types are handled separately in the code. The last four
671 columns apply to items that cannot be repeated, so there is no need to have
672 rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
673 *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
674
/* Table dimensions: one row for each opcode from FIRST_AUTOTAB_OP up to
LAST_AUTOTAB_LEFT_OP and one column for each opcode from FIRST_AUTOTAB_OP up to
LAST_AUTOTAB_RIGHT_OP, both ranges inclusive. */
675 #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
676 #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
677
/* 17 rows (left-hand, repeated item; named in the trailing comment of each
row) by 21 columns (right-hand item; named in the heading comment). The \P and
\p rows and columns are all zero. */
678 static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
679 /* \D \d \S \s \W \w . .+ \C \P \p \R \H \h \V \v \X \Z \z $ $M */
680 { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \D */
681 { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \d */
682 { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \S */
683 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \s */
684 { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \W */
685 { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \w */
686 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* . */
687 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* .+ */
688 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \C */
689 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \P */
690 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \p */
691 { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \R */
692 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \H */
693 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \h */
694 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \V */
695 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 }, /* \v */
696 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 } /* \X */
697 };
698
699
700 /* This table is used to check whether auto-possessification is possible
701 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
702 left-hand (repeated) opcode is used to select the row, and the right-hand
703 opcode is used to select the column. The values are as follows:
704
705 0 Always return FALSE (never auto-possessify)
706 1 Character groups are distinct (possessify if both are OP_PROP)
707 2 Check character categories in the same group (general or particular)
708 3 TRUE if the two opcodes are not the same (PROP vs NOTPROP)
709
710 4 Check left general category vs right particular category
711 5 Check right general category vs left particular category
712
713 6 Left alphanum vs right general category
714 7 Left space vs right general category
715 8 Left word vs right general category
716
717 9 Right alphanum vs left general category
718 10 Right space vs left general category
719 11 Right word vs left general category
720
721 12 Left alphanum vs right particular category
722 13 Left space vs right particular category
723 14 Left word vs right particular category
724
725 15 Right alphanum vs left particular category
726 16 Right space vs left particular category
727 17 Right word vs left particular category
728 */
729
730 static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
731 /* ANY LAMP GC PC SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
732 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_ANY */
733 { 0, 3, 0, 0, 0, 3, 1, 1, 0, 0, 0 }, /* PT_LAMP */
734 { 0, 0, 2, 4, 0, 9, 10, 10, 11, 0, 0 }, /* PT_GC */
735 { 0, 0, 5, 2, 0, 15, 16, 16, 17, 0, 0 }, /* PT_PC */
736 { 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, /* PT_SC */
737 { 0, 3, 6, 12, 0, 3, 1, 1, 0, 0, 0 }, /* PT_ALNUM */
738 { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_SPACE */
739 { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_PXSPACE */
740 { 0, 0, 8, 14, 0, 0, 1, 1, 3, 0, 0 }, /* PT_WORD */
741 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */
742 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 } /* PT_UCNC */
743 };
744
745 /* This table is used to check whether auto-possessification is possible
746 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
747 specifies a general category and the other specifies a particular category. The
748 row is selected by the general category and the column by the particular
749 category. The value is 1 if the particular category is not part of the general
750 category. */
751
752 static const pcre_uint8 catposstab[7][30] = {
753 /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
754 { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* C */
755 { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* L */
756 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* M */
757 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* N */
758 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 }, /* P */
759 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 }, /* S */
760 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 } /* Z */
761 };
762
763 /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
764 a general or particular category. The properties in each row are those
765 that apply to the character set in question. Duplication means that a little
766 unnecessary work is done when checking, but this keeps things much simpler
767 because they can all use the same code. For more details see the comment where
768 this table is used.
769
770 Note: SPACE and PXSPACE used to be different because Perl excluded VT from
771 "space", but from Perl 5.18 it's included, so both categories are treated the
772 same here. */
773
774 static const pcre_uint8 posspropstab[3][4] = {
775 { ucp_L, ucp_N, ucp_N, ucp_Nl }, /* ALNUM, 3rd and 4th values redundant */
776 { ucp_Z, ucp_Z, ucp_C, ucp_Cc }, /* SPACE and PXSPACE, 2nd value redundant */
777 { ucp_L, ucp_N, ucp_P, ucp_Po } /* WORD */
778 };
779
780
781
782 /*************************************************
783 * Find an error text *
784 *************************************************/
785
786 /* The error texts are now all in one long string, to save on relocations. As
787 some of the text is of unknown length, we can't use a table of offsets.
788 Instead, just count through the strings. This is not a performance issue
789 because it happens only when there has been a compilation error.
790
791 Argument: the error number
792 Returns: pointer to the error string
793 */
794
795 static const char *
796 find_error_text(int n)
797 {
798 const char *s = error_texts;
799 for (; n > 0; n--)
800 {
801 while (*s++ != CHAR_NULL) {};
802 if (*s == CHAR_NULL) return "Error text not found (please report)";
803 }
804 return s;
805 }
806
807
808
809 /*************************************************
810 * Expand the workspace *
811 *************************************************/
812
813 /* This function is called during the second compiling phase, if the number of
814 forward references fills the existing workspace, which is originally a block on
815 the stack. A larger block is obtained from malloc() unless the ultimate limit
816 has been reached or the increase will be rather small.
817
818 Argument: pointer to the compile data block
819 Returns: 0 if all went well, else an error number
820 */
821
822 static int
823 expand_workspace(compile_data *cd)
824 {
825 pcre_uchar *newspace;
826 int newsize = cd->workspace_size * 2;
827
828 if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
829 if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
830 newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
831 return ERR72;
832
833 newspace = (PUBL(malloc))(IN_UCHARS(newsize));
834 if (newspace == NULL) return ERR21;
835 memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
836 cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
837 if (cd->workspace_size > COMPILE_WORK_SIZE)
838 (PUBL(free))((void *)cd->start_workspace);
839 cd->start_workspace = newspace;
840 cd->workspace_size = newsize;
841 return 0;
842 }
843
844
845
846 /*************************************************
847 * Check for counted repeat *
848 *************************************************/
849
850 /* This function is called when a '{' is encountered in a place where it might
851 start a quantifier. It looks ahead to see if it really is a quantifier or not.
852 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
853 where the ddds are digits.
854
855 Arguments:
856 p pointer to the first char after '{'
857
858 Returns: TRUE or FALSE
859 */
860
861 static BOOL
862 is_counted_repeat(const pcre_uchar *p)
863 {
864 if (!IS_DIGIT(*p)) return FALSE;
865 p++;
866 while (IS_DIGIT(*p)) p++;
867 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
868
869 if (*p++ != CHAR_COMMA) return FALSE;
870 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
871
872 if (!IS_DIGIT(*p)) return FALSE;
873 p++;
874 while (IS_DIGIT(*p)) p++;
875
876 return (*p == CHAR_RIGHT_CURLY_BRACKET);
877 }
878
879
880
881 /*************************************************
882 * Handle escapes *
883 *************************************************/
884
885 /* This function is called when a \ has been encountered. It either returns a
886 positive value for a special escape such as \d, or 0 for a data character which
887 will be placed in chptr. A backreference to group n is returned as negative n.
888 When UTF-8 is enabled, a positive value greater than 255 may be returned in
889 chptr. On entry, ptr is pointing at the \. On exit, it is on the final
890 character of the escape sequence.

An escape whose entry in the escapes[] table is positive is just a data
character: the table entry becomes the character value and zero is returned.
A negative table entry is minus the escape code that is returned.

On error, *errorcodeptr is set but the function still runs to the end, so
*ptrptr and *chptr are always updated.
891
892 Arguments:
893 ptrptr points to the pattern position pointer
894 chptr points to a returned data character
895 errorcodeptr points to the errorcode variable
896 bracount number of previous extracting brackets
897 options the options bits
898 isclass TRUE if inside a character class
899
900 Returns: zero => a data character
901 positive => a special escape sequence
902 negative => a back reference
903 on error, errorcodeptr is set
904 */
905
906 static int
907 check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
908 int bracount, int options, BOOL isclass)
909 {
910 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
911 BOOL utf = (options & PCRE_UTF8) != 0;
912 const pcre_uchar *ptr = *ptrptr + 1;
913 pcre_uint32 c;
914 int escape = 0;
915 int i;
916
917 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
918 ptr--; /* Set pointer back to the last byte */
919
920 /* If backslash is at the end of the pattern, it's an error. */
921
922 if (c == CHAR_NULL) *errorcodeptr = ERR1;
923
924 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
925 in a table. A non-zero result is something that can be returned immediately.
926 Otherwise further processing may be required. */
927
928 #ifndef EBCDIC /* ASCII/UTF-8 coding */
929 /* Not alphanumeric */
930 else if (c < CHAR_0 || c > CHAR_z) {}
/* The lookup result is kept in i. A positive entry replaces c (data character,
escape stays 0); a negative entry is minus the escape code. When the entry is
zero, i is left at zero for the final else-branch below, which relies on that
in its octal and \xdd digit-counting loops. */
931 else if ((i = escapes[c - CHAR_0]) != 0)
932 { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
933
934 #else /* EBCDIC coding */
935 /* Not alphanumeric */
936 else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
937 else if ((i = escapes[c - 0x48]) != 0) { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
938 #endif
939
940 /* Escapes that need further processing, or are illegal. */
941
942 else
943 {
944 const pcre_uchar *oldptr;
945 BOOL braced, negated, overflow;
946 int s;
947
948 switch (c)
949 {
950 /* A number of Perl escapes are not handled by PCRE. We give an explicit
951 error. */
952
953 case CHAR_l:
954 case CHAR_L:
955 *errorcodeptr = ERR37;
956 break;
957
958 case CHAR_u:
959 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
960 {
961 /* In JavaScript, \u must be followed by four hexadecimal numbers.
962 Otherwise it is a lowercase u letter. */
963 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
964 && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
965 && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
966 && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
967 {
968 c = 0;
969 for (i = 0; i < 4; ++i)
970 {
971 register pcre_uint32 cc = *(++ptr);
972 #ifndef EBCDIC /* ASCII/UTF-8 coding */
973 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
974 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
975 #else /* EBCDIC coding */
976 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
977 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
978 #endif
979 }
980
/* Reject values that do not fit the code unit width (or exceed 0x10ffff in
UTF mode), and surrogate code points in UTF mode. */
981 #if defined COMPILE_PCRE8
982 if (c > (utf ? 0x10ffffU : 0xffU))
983 #elif defined COMPILE_PCRE16
984 if (c > (utf ? 0x10ffffU : 0xffffU))
985 #elif defined COMPILE_PCRE32
986 if (utf && c > 0x10ffffU)
987 #endif
988 {
989 *errorcodeptr = ERR76;
990 }
991 else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
992 }
993 }
994 else
995 *errorcodeptr = ERR37;
996 break;
997
998 case CHAR_U:
999 /* In JavaScript, \U is an uppercase U letter. */
1000 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
1001 break;
1002
1003 /* In a character class, \g is just a literal "g". Outside a character
1004 class, \g must be followed by one of a number of specific things:
1005
1006 (1) A number, either plain or braced. If positive, it is an absolute
1007 backreference. If negative, it is a relative backreference. This is a Perl
1008 5.10 feature.
1009
1010 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1011 is part of Perl's movement towards a unified syntax for back references. As
1012 this is synonymous with \k{name}, we fudge it up by pretending it really
1013 was \k.
1014
1015 (3) For Oniguruma compatibility we also support \g followed by a name or a
1016 number either in angle brackets or in single quotes. However, these are
1017 (possibly recursive) subroutine calls, _not_ backreferences. Just return
1018 the ESC_g code (cf \k). */
1019
1020 case CHAR_g:
1021 if (isclass) break;
1022 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
1023 {
1024 escape = ESC_g;
1025 break;
1026 }
1027
1028 /* Handle the Perl-compatible cases */
1029
1030 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1031 {
1032 const pcre_uchar *p;
/* Scan the braced text: if anything other than digits and minus signs is
found before the closing bracket, it is treated as a name (\k). */
1033 for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
1034 if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
1035 if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
1036 {
1037 escape = ESC_k;
1038 break;
1039 }
1040 braced = TRUE;
1041 ptr++;
1042 }
1043 else braced = FALSE;
1044
1045 if (ptr[1] == CHAR_MINUS)
1046 {
1047 negated = TRUE;
1048 ptr++;
1049 }
1050 else negated = FALSE;
1051
1052 /* The integer range is limited by the machine's int representation. */
1053 s = 0;
1054 overflow = FALSE;
1055 while (IS_DIGIT(ptr[1]))
1056 {
/* Testing before the multiplication keeps s * 10 + 9 below INT_MAX. */
1057 if (s > INT_MAX / 10 - 1) /* Integer overflow */
1058 {
1059 overflow = TRUE;
1060 break;
1061 }
1062 s = s * 10 + (int)(*(++ptr) - CHAR_0);
1063 }
1064 if (overflow) /* Integer overflow */
1065 {
/* Skip the remaining digits so that ptr ends on the last one. */
1066 while (IS_DIGIT(ptr[1]))
1067 ptr++;
1068 *errorcodeptr = ERR61;
1069 break;
1070 }
1071
1072 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
1073 {
1074 *errorcodeptr = ERR57;
1075 break;
1076 }
1077
1078 if (s == 0)
1079 {
1080 *errorcodeptr = ERR58;
1081 break;
1082 }
1083
1084 if (negated)
1085 {
1086 if (s > bracount)
1087 {
1088 *errorcodeptr = ERR15;
1089 break;
1090 }
/* Convert the relative reference into an absolute group number, counting
back from the most recently opened group. */
1091 s = bracount - (s - 1);
1092 }
1093
1094 escape = -s;
1095 break;
1096
1097 /* The handling of escape sequences consisting of a string of digits
1098 starting with one that is not zero is not straightforward. Perl has changed
1099 over the years. Nowadays \g{} for backreferences and \o{} for octal are
1100 recommended to avoid the ambiguities in the old syntax.
1101
1102 Outside a character class, the digits are read as a decimal number. If the
1103 number is less than 8 (used to be 10), or if there are that many previous
1104 extracting left brackets, then it is a back reference. Otherwise, up to
1105 three octal digits are read to form an escaped byte. Thus \123 is likely to
1106 be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1107 the octal value is greater than 377, the least significant 8 bits are
1108 taken. \8 and \9 are treated as the literal characters 8 and 9.
1109
1110 Inside a character class, \ followed by a digit is always either a literal
1111 8 or 9 or an octal number. */
1112
1113 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1114 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1115
1116 if (!isclass)
1117 {
1118 oldptr = ptr;
1119 /* The integer range is limited by the machine's int representation. */
1120 s = (int)(c -CHAR_0);
1121 overflow = FALSE;
1122 while (IS_DIGIT(ptr[1]))
1123 {
1124 if (s > INT_MAX / 10 - 1) /* Integer overflow */
1125 {
1126 overflow = TRUE;
1127 break;
1128 }
1129 s = s * 10 + (int)(*(++ptr) - CHAR_0);
1130 }
1131 if (overflow) /* Integer overflow */
1132 {
1133 while (IS_DIGIT(ptr[1]))
1134 ptr++;
1135 *errorcodeptr = ERR61;
1136 break;
1137 }
1138 if (s < 8 || s <= bracount) /* Check for back reference */
1139 {
1140 escape = -s;
1141 break;
1142 }
1143 ptr = oldptr; /* Put the pointer back and fall through */
1144 }
1145
1146 /* Handle a digit following \ when the number is not a back reference. If
1147 the first digit is 8 or 9, Perl used to generate a binary zero byte and
1148 then treat the digit as a following literal. At least by Perl 5.18 this
1149 changed so as not to insert the binary zero. */
1150
1151 if ((c = *ptr) >= CHAR_8) break;
1152
1153 /* Fall through with a digit less than 8 */
1154
1155 /* \0 always starts an octal number, but we may drop through to here with a
1156 larger first octal digit. The original code used just to take the least
1157 significant 8 bits of octal numbers (I think this is what early Perls used
1158 to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1159 but no more than 3 octal digits. */
1160
1161 case CHAR_0:
1162 c -= CHAR_0;
/* i is zero on arrival here: this switch is reached only when the escapes[]
lookup stored zero in i, and neither this case nor the digit case that falls
through to it changes i. The loop therefore reads at most two more octal
digits. */
1163 while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1164 c = c * 8 + *(++ptr) - CHAR_0;
1165 #ifdef COMPILE_PCRE8
1166 if (!utf && c > 0xff) *errorcodeptr = ERR51;
1167 #endif
1168 break;
1169
1170 /* \o is a relatively new Perl feature, supporting a more general way of
1171 specifying character codes in octal. The only supported form is \o{ddd}. */
1172
1173 case CHAR_o:
1174 if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
1175 {
1176 ptr += 2;
1177 c = 0;
1178 overflow = FALSE;
1179 while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
1180 {
1181 register pcre_uint32 cc = *ptr++;
1182 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
/* In the 32-bit library, a further 3-bit shift of a value this large would
not fit in 32 bits. */
1183 #ifdef COMPILE_PCRE32
1184 if (c >= 0x20000000l) { overflow = TRUE; break; }
1185 #endif
1186 c = (c << 3) + cc - CHAR_0 ;
1187 #if defined COMPILE_PCRE8
1188 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1189 #elif defined COMPILE_PCRE16
1190 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1191 #elif defined COMPILE_PCRE32
1192 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1193 #endif
1194 }
1195 if (overflow)
1196 {
1197 while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1198 *errorcodeptr = ERR34;
1199 }
1200 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1201 {
1202 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1203 }
1204 else *errorcodeptr = ERR80;
1205 }
1206 break;
1207
1208 /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
1209 numbers. Otherwise it is a lowercase x letter. */
1210
1211 case CHAR_x:
1212 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1213 {
1214 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1215 && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1216 {
1217 c = 0;
1218 for (i = 0; i < 2; ++i)
1219 {
1220 register pcre_uint32 cc = *(++ptr);
1221 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1222 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1223 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1224 #else /* EBCDIC coding */
1225 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1226 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1227 #endif
1228 }
1229 }
1230 } /* End JavaScript handling */
1231
1232 /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1233 greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1234 digits. If not, { used to be treated as a data character. However, Perl
1235 seems to read hex digits up to the first non-such, and ignore the rest, so
1236 that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1237 now gives an error. */
1238
1239 else
1240 {
1241 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1242 {
1243 ptr += 2;
1244 c = 0;
1245 overflow = FALSE;
1246 while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
1247 {
1248 register pcre_uint32 cc = *ptr++;
1249 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1250
/* In the 32-bit library, a further 4-bit shift of a value this large would
not fit in 32 bits. */
1251 #ifdef COMPILE_PCRE32
1252 if (c >= 0x10000000l) { overflow = TRUE; break; }
1253 #endif
1254
1255 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1256 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1257 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1258 #else /* EBCDIC coding */
1259 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1260 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1261 #endif
1262
1263 #if defined COMPILE_PCRE8
1264 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1265 #elif defined COMPILE_PCRE16
1266 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1267 #elif defined COMPILE_PCRE32
1268 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1269 #endif
1270 }
1271
1272 if (overflow)
1273 {
1274 while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
1275 *errorcodeptr = ERR34;
1276 }
1277
1278 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1279 {
1280 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1281 }
1282
1283 /* If the sequence of hex digits does not end with '}', give an error.
1284 We used just to recognize this construct and fall through to the normal
1285 \x handling, but nowadays Perl gives an error, which seems much more
1286 sensible, so we do too. */
1287
1288 else *errorcodeptr = ERR79;
1289 } /* End of \x{} processing */
1290
1291 /* Read a single-byte hex-defined char (up to two hex digits after \x) */
1292
1293 else
1294 {
1295 c = 0;
/* As in the octal case, i is still zero from the escapes[] lookup, so this
loop reads at most two hex digits. */
1296 while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1297 {
1298 pcre_uint32 cc; /* Some compilers don't like */
1299 cc = *(++ptr); /* ++ in initializers */
1300 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1301 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1302 c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1303 #else /* EBCDIC coding */
/* NOTE(review): unlike the other EBCDIC hex conversions in this function, the
test below has no lower bound (cc >= CHAR_a) - confirm this is intended. */
1304 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
1305 c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1306 #endif
1307 }
1308 } /* End of \xdd handling */
1309 } /* End of Perl-style \x handling */
1310 break;
1311
1312 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1313 An error is given if the byte following \c is not an ASCII character. This
1314 coding is ASCII-specific, but then the whole concept of \cx is
1315 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1316
1317 case CHAR_c:
1318 c = *(++ptr);
1319 if (c == CHAR_NULL)
1320 {
1321 *errorcodeptr = ERR2;
1322 break;
1323 }
1324 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1325 if (c > 127) /* Excludes all non-ASCII in either mode */
1326 {
1327 *errorcodeptr = ERR68;
1328 break;
1329 }
1330 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1331 c ^= 0x40;
1332 #else /* EBCDIC coding */
1333 if (c >= CHAR_a && c <= CHAR_z) c += 64;
1334 c ^= 0xC0;
1335 #endif
1336 break;
1337
1338 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1339 other alphanumeric following \ is an error if PCRE_EXTRA was set;
1340 otherwise, for Perl compatibility, it is a literal. This code looks a bit
1341 odd, but there used to be some cases other than the default, and there may
1342 be again in future, so I haven't "optimized" it. */
1343
1344 default:
1345 if ((options & PCRE_EXTRA) != 0) switch(c)
1346 {
1347 default:
1348 *errorcodeptr = ERR3;
1349 break;
1350 }
1351 break;
1352 }
1353 }
1354
1355 /* Perl supports \N{name} for character names, as well as plain \N for "not
1356 newline". PCRE does not support \N{name}. However, it does support
1357 quantification such as \N{2,3}. */
1358
1359 if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1360 !is_counted_repeat(ptr+2))
1361 *errorcodeptr = ERR37;
1362
1363 /* If PCRE_UCP is set, we change the values for \d etc. */
1364
1365 if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
1366 escape += (ESC_DU - ESC_D);
1367
1368 /* Set the pointer to the final character before returning. */
1369
1370 *ptrptr = ptr;
1371 *chptr = c;
1372 return escape;
1373 }
1374
1375
1376
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Called after \P or \p has been seen, when PCRE is built with Unicode
property support. On entry *ptrptr points at the P or p; on exit it points at
the last character of the escape sequence. The property name is either the
single character that follows, or a name in curly brackets that may start with
^ to request negation. The name is looked up in the table of known property
names.

Argument:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  ptypeptr       points to an unsigned int that is set to the type value
  pdataptr       points to an unsigned int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         TRUE if the type value was found, or FALSE for an invalid type
                   (ERR46 for a malformed sequence, ERR47 for an unknown name)
*/

static BOOL
get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
  unsigned int *pdataptr, int *errorcodeptr)
{
  pcre_uchar ch;
  int len, lo, hi;
  const pcre_uchar *ptr = *ptrptr;
  pcre_uchar name[32];

  ch = *(++ptr);
  if (ch == CHAR_NULL) goto ERROR_RETURN;

  *negptr = FALSE;

  if (ch != CHAR_LEFT_CURLY_BRACKET)
  {
    /* The short form: the name is the one character after \p or \P. */
    name[0] = ch;
    name[1] = 0;
  }
  else
  {
    /* The braced form: an optional ^ and then the name up to the closing
    bracket. One element of the buffer is reserved for the terminating zero. */
    if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
    {
      *negptr = TRUE;
      ptr++;
    }
    for (len = 0; len < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; len++)
    {
      ch = *(++ptr);
      if (ch == CHAR_NULL) goto ERROR_RETURN;
      if (ch == CHAR_RIGHT_CURLY_BRACKET) break;
      name[len] = ch;
    }

    /* The buffer filled up before a closing bracket was found. */
    if (ch != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
    name[len] = 0;
  }

  *ptrptr = ptr;

  /* Binary chop through the property table. */

  lo = 0;
  hi = PRIV(utt_size);

  while (lo < hi)
  {
    int mid = (lo + hi) >> 1;
    int cmp = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[mid].name_offset);

    if (cmp == 0)
    {
      *ptypeptr = PRIV(utt)[mid].type;
      *pdataptr = PRIV(utt)[mid].value;
      return TRUE;
    }
    if (cmp > 0) lo = mid + 1; else hi = mid;
  }

  /* The name is well formed but is not in the table. */

  *errorcodeptr = ERR47;
  *ptrptr = ptr;
  return FALSE;

ERROR_RETURN:
  *errorcodeptr = ERR46;
  *ptrptr = ptr;
  return FALSE;
}
#endif
1471
1472
1473
1474 /*************************************************
1475 * Read repeat counts *
1476 *************************************************/
1477
1478 /* Read an item of the form {n,m} and return the values. This is called only
1479 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1480 so the syntax is guaranteed to be correct, but we need to check the values.
1481
1482 Arguments:
1483 p pointer to first char after '{'
1484 minp pointer to int for min
1485 maxp pointer to int for max
1486 returned as -1 if no max
1487 errorcodeptr points to error code variable
1488
1489 Returns: pointer to '}' on success;
1490 current ptr on error, with errorcodeptr set non-zero
1491 */
1492
1493 static const pcre_uchar *
1494 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1495 {
1496 int min = 0;
1497 int max = -1;
1498
1499 /* Read the minimum value and do a paranoid check: a negative value indicates
1500 an integer overflow. */
1501
1502 while (IS_DIGIT(*p)) min = min * 10 + (int)(*p++ - CHAR_0);
1503 if (min < 0 || min > 65535)
1504 {
1505 *errorcodeptr = ERR5;
1506 return p;
1507 }
1508
1509 /* Read the maximum value if there is one, and again do a paranoid on its size.
1510 Also, max must not be less than min. */
1511
1512 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1513 {
1514 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1515 {
1516 max = 0;
1517 while(IS_DIGIT(*p)) max = max * 10 + (int)(*p++ - CHAR_0);
1518 if (max < 0 || max > 65535)
1519 {
1520 *errorcodeptr = ERR5;
1521 return p;
1522 }
1523 if (max < min)
1524 {
1525 *errorcodeptr = ERR4;
1526 return p;
1527 }
1528 }
1529 }
1530
1531 /* Fill in the required variables, and pass back the pointer to the terminating
1532 '}'. */
1533
1534 *minp = min;
1535 *maxp = max;
1536 return p;
1537 }
1538
1539
1540
1541 /*************************************************
1542 * Find first significant op code *
1543 *************************************************/
1544
1545 /* This is called by several functions that scan a compiled expression looking
1546 for a fixed first character, or an anchoring op code etc. It skips over things
1547 that do not influence this. For some calls, it makes sense to skip negative
1548 forward and all backward assertions, and also the \b assertion; for others it
1549 does not.
1550
1551 Arguments:
1552 code pointer to the start of the group
1553 skipassert TRUE if certain assertions are to be skipped
1554
1555 Returns: pointer to the first significant opcode
1556 */
1557
1558 static const pcre_uchar*
1559 first_significant_code(const pcre_uchar *code, BOOL skipassert)
1560 {
1561 for (;;)
1562 {
1563 switch ((int)*code)
1564 {
1565 case OP_ASSERT_NOT:
1566 case OP_ASSERTBACK:
1567 case OP_ASSERTBACK_NOT:
1568 if (!skipassert) return code;
1569 do code += GET(code, 1); while (*code == OP_ALT);
1570 code += PRIV(OP_lengths)[*code];
1571 break;
1572
1573 case OP_WORD_BOUNDARY:
1574 case OP_NOT_WORD_BOUNDARY:
1575 if (!skipassert) return code;
1576 /* Fall through */
1577
1578 case OP_CALLOUT:
1579 case OP_CREF:
1580 case OP_DNCREF:
1581 case OP_RREF:
1582 case OP_DNRREF:
1583 case OP_DEF:
1584 code += PRIV(OP_lengths)[*code];
1585 break;
1586
1587 default:
1588 return code;
1589 }
1590 }
1591 /* Control never reaches here */
1592 }
1593
1594
1595
1596 /*************************************************
1597 * Find the fixed length of a branch *
1598 *************************************************/
1599
1600 /* Scan a branch and compute the fixed length of subject that will match it,
1601 if the length is fixed. This is needed for dealing with backward assertions.
1602 In UTF8 mode, the result is in characters rather than bytes. The branch is
1603 temporarily terminated with OP_END when this function is called.
1604
1605 This function is called when a backward assertion is encountered, so that if it
1606 fails, the error message can point to the correct place in the pattern.
1607 However, we cannot do this when the assertion contains subroutine calls,
1608 because they can be forward references. We solve this by remembering this case
1609 and doing the check at the end; a flag specifies which mode we are running in.
1610
1611 Arguments:
1612 code points to the start of the pattern (the bracket)
1613 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
1614 atend TRUE if called when the pattern is complete
1615 cd the "compile data" structure
1616
1617 Returns: the fixed length,
1618 or -1 if there is no fixed length,
1619 or -2 if \C was encountered (in UTF-8 mode only)
1620 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1621 or -4 if an unknown opcode was encountered (internal error)
1622 */
1623
1624 static int
1625 find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
1626 {
1627 int length = -1;
1628
1629 register int branchlength = 0;
1630 register pcre_uchar *cc = code + 1 + LINK_SIZE;
1631
1632 /* Scan along the opcodes for this branch. If we get to the end of the
1633 branch, check the length against that of the other branches. */
1634
1635 for (;;)
1636 {
1637 int d;
1638 pcre_uchar *ce, *cs;
1639 register pcre_uchar op = *cc;
1640
1641 switch (op)
1642 {
1643 /* We only need to continue for OP_CBRA (normal capturing bracket) and
1644 OP_BRA (normal non-capturing bracket) because the other variants of these
1645 opcodes are all concerned with unlimited repeated groups, which of course
1646 are not of fixed length. */
1647
1648 case OP_CBRA:
1649 case OP_BRA:
1650 case OP_ONCE:
1651 case OP_ONCE_NC:
1652 case OP_COND:
1653 d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
1654 if (d < 0) return d;
1655 branchlength += d;
1656 do cc += GET(cc, 1); while (*cc == OP_ALT);
1657 cc += 1 + LINK_SIZE;
1658 break;
1659
1660 /* Reached end of a branch; if it's a ket it is the end of a nested call.
1661 If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1662 an ALT. If it is END it's the end of the outer call. All can be handled by
1663 the same code. Note that we must not include the OP_KETRxxx opcodes here,
1664 because they all imply an unlimited repeat. */
1665
1666 case OP_ALT:
1667 case OP_KET:
1668 case OP_END:
1669 case OP_ACCEPT:
1670 case OP_ASSERT_ACCEPT:
1671 if (length < 0) length = branchlength;
1672 else if (length != branchlength) return -1;
1673 if (*cc != OP_ALT) return length;
1674 cc += 1 + LINK_SIZE;
1675 branchlength = 0;
1676 break;
1677
1678 /* A true recursion implies not fixed length, but a subroutine call may
1679 be OK. If the subroutine is a forward reference, we can't deal with
1680 it until the end of the pattern, so return -3. */
1681
1682 case OP_RECURSE:
1683 if (!atend) return -3;
1684 cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1685 do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1686 if (cc > cs && cc < ce) return -1; /* Recursion */
1687 d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
1688 if (d < 0) return d;
1689 branchlength += d;
1690 cc += 1 + LINK_SIZE;
1691 break;
1692
1693 /* Skip over assertive subpatterns */
1694
1695 case OP_ASSERT:
1696 case OP_ASSERT_NOT:
1697 case OP_ASSERTBACK:
1698 case OP_ASSERTBACK_NOT:
1699 do cc += GET(cc, 1); while (*cc == OP_ALT);
1700 cc += PRIV(OP_lengths)[*cc];
1701 break;
1702
1703 /* Skip over things that don't match chars */
1704
1705 case OP_MARK:
1706 case OP_PRUNE_ARG:
1707 case OP_SKIP_ARG:
1708 case OP_THEN_ARG:
1709 cc += cc[1] + PRIV(OP_lengths)[*cc];
1710 break;
1711
1712 case OP_CALLOUT:
1713 case OP_CIRC:
1714 case OP_CIRCM:
1715 case OP_CLOSE:
1716 case OP_COMMIT:
1717 case OP_CREF:
1718 case OP_DEF:
1719 case OP_DNCREF:
1720 case OP_DNRREF:
1721 case OP_DOLL:
1722 case OP_DOLLM:
1723 case OP_EOD:
1724 case OP_EODN:
1725 case OP_FAIL:
1726 case OP_NOT_WORD_BOUNDARY:
1727 case OP_PRUNE:
1728 case OP_REVERSE:
1729 case OP_RREF:
1730 case OP_SET_SOM:
1731 case OP_SKIP:
1732 case OP_SOD:
1733 case OP_SOM:
1734 case OP_THEN:
1735 case OP_WORD_BOUNDARY:
1736 cc += PRIV(OP_lengths)[*cc];
1737 break;
1738
1739 /* Handle literal characters */
1740
1741 case OP_CHAR:
1742 case OP_CHARI:
1743 case OP_NOT:
1744 case OP_NOTI:
1745 branchlength++;
1746 cc += 2;
1747 #ifdef SUPPORT_UTF
1748 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1749 #endif
1750 break;
1751
1752 /* Handle exact repetitions. The count is already in characters, but we
1753 need to skip over a multibyte character in UTF8 mode. */
1754
1755 case OP_EXACT:
1756 case OP_EXACTI:
1757 case OP_NOTEXACT:
1758 case OP_NOTEXACTI:
1759 branchlength += (int)GET2(cc,1);
1760 cc += 2 + IMM2_SIZE;
1761 #ifdef SUPPORT_UTF
1762 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1763 #endif
1764 break;
1765
1766 case OP_TYPEEXACT:
1767 branchlength += GET2(cc,1);
1768 if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1769 cc += 2;
1770 cc += 1 + IMM2_SIZE + 1;
1771 break;
1772
1773 /* Handle single-char matchers */
1774
1775 case OP_PROP:
1776 case OP_NOTPROP:
1777 cc += 2;
1778 /* Fall through */
1779
1780 case OP_HSPACE:
1781 case OP_VSPACE:
1782 case OP_NOT_HSPACE:
1783 case OP_NOT_VSPACE:
1784 case OP_NOT_DIGIT:
1785 case OP_DIGIT:
1786 case OP_NOT_WHITESPACE:
1787 case OP_WHITESPACE:
1788 case OP_NOT_WORDCHAR:
1789 case OP_WORDCHAR:
1790 case OP_ANY:
1791 case OP_ALLANY:
1792 branchlength++;
1793 cc++;
1794 break;
1795
1796 /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1797 otherwise \C is coded as OP_ALLANY. */
1798
1799 case OP_ANYBYTE:
1800 return -2;
1801
1802 /* Check a class for variable quantification */
1803
1804 case OP_CLASS:
1805 case OP_NCLASS:
1806 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1807 case OP_XCLASS:
1808 /* The original code caused an unsigned overflow in 64 bit systems,
1809 so now we use a conditional statement. */
1810 if (op == OP_XCLASS)
1811 cc += GET(cc, 1);
1812 else
1813 cc += PRIV(OP_lengths)[OP_CLASS];
1814 #else
1815 cc += PRIV(OP_lengths)[OP_CLASS];
1816 #endif
1817
1818 switch (*cc)
1819 {
1820 case OP_CRSTAR:
1821 case OP_CRMINSTAR:
1822 case OP_CRPLUS:
1823 case OP_CRMINPLUS:
1824 case OP_CRQUERY:
1825 case OP_CRMINQUERY:
1826 case OP_CRPOSSTAR:
1827 case OP_CRPOSPLUS:
1828 case OP_CRPOSQUERY:
1829 return -1;
1830
1831 case OP_CRRANGE:
1832 case OP_CRMINRANGE:
1833 case OP_CRPOSRANGE:
1834 if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1835 branchlength += (int)GET2(cc,1);
1836 cc += 1 + 2 * IMM2_SIZE;
1837 break;
1838
1839 default:
1840 branchlength++;
1841 }
1842 break;
1843
1844 /* Anything else is variable length */
1845
1846 case OP_ANYNL:
1847 case OP_BRAMINZERO:
1848 case OP_BRAPOS:
1849 case OP_BRAPOSZERO:
1850 case OP_BRAZERO:
1851 case OP_CBRAPOS:
1852 case OP_EXTUNI:
1853 case OP_KETRMAX:
1854 case OP_KETRMIN:
1855 case OP_KETRPOS:
1856 case OP_MINPLUS:
1857 case OP_MINPLUSI:
1858 case OP_MINQUERY:
1859 case OP_MINQUERYI:
1860 case OP_MINSTAR:
1861 case OP_MINSTARI:
1862 case OP_MINUPTO:
1863 case OP_MINUPTOI:
1864 case OP_NOTMINPLUS:
1865 case OP_NOTMINPLUSI:
1866 case OP_NOTMINQUERY:
1867 case OP_NOTMINQUERYI:
1868 case OP_NOTMINSTAR:
1869 case OP_NOTMINSTARI:
1870 case OP_NOTMINUPTO:
1871 case OP_NOTMINUPTOI:
1872 case OP_NOTPLUS:
1873 case OP_NOTPLUSI:
1874 case OP_NOTPOSPLUS:
1875 case OP_NOTPOSPLUSI:
1876 case OP_NOTPOSQUERY:
1877 case OP_NOTPOSQUERYI:
1878 case OP_NOTPOSSTAR:
1879 case OP_NOTPOSSTARI:
1880 case OP_NOTPOSUPTO:
1881 case OP_NOTPOSUPTOI:
1882 case OP_NOTQUERY:
1883 case OP_NOTQUERYI:
1884 case OP_NOTSTAR:
1885 case OP_NOTSTARI:
1886 case OP_NOTUPTO:
1887 case OP_NOTUPTOI:
1888 case OP_PLUS:
1889 case OP_PLUSI:
1890 case OP_POSPLUS:
1891 case OP_POSPLUSI:
1892 case OP_POSQUERY:
1893 case OP_POSQUERYI:
1894 case OP_POSSTAR:
1895 case OP_POSSTARI:
1896 case OP_POSUPTO:
1897 case OP_POSUPTOI:
1898 case OP_QUERY:
1899 case OP_QUERYI:
1900 case OP_REF:
1901 case OP_REFI:
1902 case OP_DNREF:
1903 case OP_DNREFI:
1904 case OP_SBRA:
1905 case OP_SBRAPOS:
1906 case OP_SCBRA:
1907 case OP_SCBRAPOS:
1908 case OP_SCOND:
1909 case OP_SKIPZERO:
1910 case OP_STAR:
1911 case OP_STARI:
1912 case OP_TYPEMINPLUS:
1913 case OP_TYPEMINQUERY:
1914 case OP_TYPEMINSTAR:
1915 case OP_TYPEMINUPTO:
1916 case OP_TYPEPLUS:
1917 case OP_TYPEPOSPLUS:
1918 case OP_TYPEPOSQUERY:
1919 case OP_TYPEPOSSTAR:
1920 case OP_TYPEPOSUPTO:
1921 case OP_TYPEQUERY:
1922 case OP_TYPESTAR:
1923 case OP_TYPEUPTO:
1924 case OP_UPTO:
1925 case OP_UPTOI:
1926 return -1;
1927
1928 /* Catch unrecognized opcodes so that when new ones are added they
1929 are not forgotten, as has happened in the past. */
1930
1931 default:
1932 return -4;
1933 }
1934 }
1935 /* Control never gets here */
1936 }
1937
1938
1939
1940 /*************************************************
1941 * Scan compiled regex for specific bracket *
1942 *************************************************/
1943
1944 /* This little function scans through a compiled pattern until it finds a
1945 capturing bracket with the given number, or, if the number is negative, an
1946 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1947 so that it can be called from pcre_study() when finding the minimum matching
1948 length.
1949
1950 Arguments:
1951 code points to start of expression
1952 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
1953 number the required bracket number or negative to find a lookbehind
1954
1955 Returns: pointer to the opcode for the bracket, or NULL if not found
1956 */
1957
1958 const pcre_uchar *
1959 PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
1960 {
1961 for (;;)
1962 {
1963 register pcre_uchar c = *code;
1964
1965 if (c == OP_END) return NULL;
1966
1967 /* XCLASS is used for classes that cannot be represented just by a bit
1968 map. This includes negated single high-valued characters. The length in
1969 the table is zero; the actual length is stored in the compiled code. */
1970
1971 if (c == OP_XCLASS) code += GET(code, 1);
1972
1973 /* Handle recursion */
1974
1975 else if (c == OP_REVERSE)
1976 {
1977 if (number < 0) return (pcre_uchar *)code;
1978 code += PRIV(OP_lengths)[c];
1979 }
1980
1981 /* Handle capturing bracket */
1982
1983 else if (c == OP_CBRA || c == OP_SCBRA ||
1984 c == OP_CBRAPOS || c == OP_SCBRAPOS)
1985 {
1986 int n = (int)GET2(code, 1+LINK_SIZE);
1987 if (n == number) return (pcre_uchar *)code;
1988 code += PRIV(OP_lengths)[c];
1989 }
1990
1991 /* Otherwise, we can get the item's length from the table, except that for
1992 repeated character types, we have to test for \p and \P, which have an extra
1993 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1994 must add in its length. */
1995
1996 else
1997 {
1998 switch(c)
1999 {
2000 case OP_TYPESTAR:
2001 case OP_TYPEMINSTAR:
2002 case OP_TYPEPLUS:
2003 case OP_TYPEMINPLUS:
2004 case OP_TYPEQUERY:
2005 case OP_TYPEMINQUERY:
2006 case OP_TYPEPOSSTAR:
2007 case OP_TYPEPOSPLUS:
2008 case OP_TYPEPOSQUERY:
2009 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2010 break;
2011
2012 case OP_TYPEUPTO:
2013 case OP_TYPEMINUPTO:
2014 case OP_TYPEEXACT:
2015 case OP_TYPEPOSUPTO:
2016 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2017 code += 2;
2018 break;
2019
2020 case OP_MARK:
2021 case OP_PRUNE_ARG:
2022 case OP_SKIP_ARG:
2023 case OP_THEN_ARG:
2024 code += code[1];
2025 break;
2026 }
2027
2028 /* Add in the fixed length from the table */
2029
2030 code += PRIV(OP_lengths)[c];
2031
2032 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2033 a multi-byte character. The length in the table is a minimum, so we have to
2034 arrange to skip the extra bytes. */
2035
2036 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2037 if (utf) switch(c)
2038 {
2039 case OP_CHAR:
2040 case OP_CHARI:
2041 case OP_EXACT:
2042 case OP_EXACTI:
2043 case OP_UPTO:
2044 case OP_UPTOI:
2045 case OP_MINUPTO:
2046 case OP_MINUPTOI:
2047 case OP_POSUPTO:
2048 case OP_POSUPTOI:
2049 case OP_STAR:
2050 case OP_STARI:
2051 case OP_MINSTAR:
2052 case OP_MINSTARI:
2053 case OP_POSSTAR:
2054 case OP_POSSTARI:
2055 case OP_PLUS:
2056 case OP_PLUSI:
2057 case OP_MINPLUS:
2058 case OP_MINPLUSI:
2059 case OP_POSPLUS:
2060 case OP_POSPLUSI:
2061 case OP_QUERY:
2062 case OP_QUERYI:
2063 case OP_MINQUERY:
2064 case OP_MINQUERYI:
2065 case OP_POSQUERY:
2066 case OP_POSQUERYI:
2067 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2068 break;
2069 }
2070 #else
2071 (void)(utf); /* Keep compiler happy by referencing function argument */
2072 #endif
2073 }
2074 }
2075 }
2076
2077
2078
2079 /*************************************************
2080 * Scan compiled regex for recursion reference *
2081 *************************************************/
2082
2083 /* This little function scans through a compiled pattern until it finds an
2084 instance of OP_RECURSE.
2085
2086 Arguments:
2087 code points to start of expression
2088 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2089
2090 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
2091 */
2092
2093 static const pcre_uchar *
2094 find_recurse(const pcre_uchar *code, BOOL utf)
2095 {
2096 for (;;)
2097 {
2098 register pcre_uchar c = *code;
2099 if (c == OP_END) return NULL;
2100 if (c == OP_RECURSE) return code;
2101
2102 /* XCLASS is used for classes that cannot be represented just by a bit
2103 map. This includes negated single high-valued characters. The length in
2104 the table is zero; the actual length is stored in the compiled code. */
2105
2106 if (c == OP_XCLASS) code += GET(code, 1);
2107
2108 /* Otherwise, we can get the item's length from the table, except that for
2109 repeated character types, we have to test for \p and \P, which have an extra
2110 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2111 must add in its length. */
2112
2113 else
2114 {
2115 switch(c)
2116 {
2117 case OP_TYPESTAR:
2118 case OP_TYPEMINSTAR:
2119 case OP_TYPEPLUS:
2120 case OP_TYPEMINPLUS:
2121 case OP_TYPEQUERY:
2122 case OP_TYPEMINQUERY:
2123 case OP_TYPEPOSSTAR:
2124 case OP_TYPEPOSPLUS:
2125 case OP_TYPEPOSQUERY:
2126 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2127 break;
2128
2129 case OP_TYPEPOSUPTO:
2130 case OP_TYPEUPTO:
2131 case OP_TYPEMINUPTO:
2132 case OP_TYPEEXACT:
2133 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2134 code += 2;
2135 break;
2136
2137 case OP_MARK:
2138 case OP_PRUNE_ARG:
2139 case OP_SKIP_ARG:
2140 case OP_THEN_ARG:
2141 code += code[1];
2142 break;
2143 }
2144
2145 /* Add in the fixed length from the table */
2146
2147 code += PRIV(OP_lengths)[c];
2148
2149 /* In UTF-8 mode, opcodes that are followed by a character may be followed
2150 by a multi-byte character. The length in the table is a minimum, so we have
2151 to arrange to skip the extra bytes. */
2152
2153 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2154 if (utf) switch(c)
2155 {
2156 case OP_CHAR:
2157 case OP_CHARI:
2158 case OP_NOT:
2159 case OP_NOTI:
2160 case OP_EXACT:
2161 case OP_EXACTI:
2162 case OP_NOTEXACT:
2163 case OP_NOTEXACTI:
2164 case OP_UPTO:
2165 case OP_UPTOI:
2166 case OP_NOTUPTO:
2167 case OP_NOTUPTOI:
2168 case OP_MINUPTO:
2169 case OP_MINUPTOI:
2170 case OP_NOTMINUPTO:
2171 case OP_NOTMINUPTOI:
2172 case OP_POSUPTO:
2173 case OP_POSUPTOI:
2174 case OP_NOTPOSUPTO:
2175 case OP_NOTPOSUPTOI:
2176 case OP_STAR:
2177 case OP_STARI:
2178 case OP_NOTSTAR:
2179 case OP_NOTSTARI:
2180 case OP_MINSTAR:
2181 case OP_MINSTARI:
2182 case OP_NOTMINSTAR:
2183 case OP_NOTMINSTARI:
2184 case OP_POSSTAR:
2185 case OP_POSSTARI:
2186 case OP_NOTPOSSTAR:
2187 case OP_NOTPOSSTARI:
2188 case OP_PLUS:
2189 case OP_PLUSI:
2190 case OP_NOTPLUS:
2191 case OP_NOTPLUSI:
2192 case OP_MINPLUS:
2193 case OP_MINPLUSI:
2194 case OP_NOTMINPLUS:
2195 case OP_NOTMINPLUSI:
2196 case OP_POSPLUS:
2197 case OP_POSPLUSI:
2198 case OP_NOTPOSPLUS:
2199 case OP_NOTPOSPLUSI:
2200 case OP_QUERY:
2201 case OP_QUERYI:
2202 case OP_NOTQUERY:
2203 case OP_NOTQUERYI:
2204 case OP_MINQUERY:
2205 case OP_MINQUERYI:
2206 case OP_NOTMINQUERY:
2207 case OP_NOTMINQUERYI:
2208 case OP_POSQUERY:
2209 case OP_POSQUERYI:
2210 case OP_NOTPOSQUERY:
2211 case OP_NOTPOSQUERYI:
2212 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2213 break;
2214 }
2215 #else
2216 (void)(utf); /* Keep compiler happy by referencing function argument */
2217 #endif
2218 }
2219 }
2220 }
2221
2222
2223
2224 /*************************************************
2225 * Scan compiled branch for non-emptiness *
2226 *************************************************/
2227
2228 /* This function scans through a branch of a compiled pattern to see whether it
2229 can match the empty string or not. It is called from could_be_empty()
2230 below and from compile_branch() when checking for an unlimited repeat of a
2231 group that can match nothing. Note that first_significant_code() skips over
2232 backward and negative forward assertions when its final argument is TRUE. If we
2233 hit an unclosed bracket, we return "empty" - this means we've struck an inner
2234 bracket whose current branch will already have been scanned.
2235
2236 Arguments:
2237 code points to start of search
2238 endcode points to where to stop
2239 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2240 cd contains pointers to tables etc.
2241 recurses chain of recurse_check to catch mutual recursion
2242
2243 Returns: TRUE if what is matched could be empty
2244 */
2245
2246 typedef struct recurse_check {
2247 struct recurse_check *prev;
2248 const pcre_uchar *group;
2249 } recurse_check;
2250
2251 static BOOL
2252 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2253 BOOL utf, compile_data *cd, recurse_check *recurses)
2254 {
2255 register pcre_uchar c;
2256 recurse_check this_recurse;
2257
2258 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2259 code < endcode;
2260 code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2261 {
2262 const pcre_uchar *ccode;
2263
2264 c = *code;
2265
2266 /* Skip over forward assertions; the other assertions are skipped by
2267 first_significant_code() with a TRUE final argument. */
2268
2269 if (c == OP_ASSERT)
2270 {
2271 do code += GET(code, 1); while (*code == OP_ALT);
2272 c = *code;
2273 continue;
2274 }
2275
2276 /* For a recursion/subroutine call, if its end has been reached, which
2277 implies a backward reference subroutine call, we can scan it. If it's a
2278 forward reference subroutine call, we can't. To detect forward reference
2279 we have to scan up the list that is kept in the workspace. This function is
2280 called only when doing the real compile, not during the pre-compile that
2281 measures the size of the compiled pattern. */
2282
2283 if (c == OP_RECURSE)
2284 {
2285 const pcre_uchar *scode = cd->start_code + GET(code, 1);
2286 BOOL empty_branch;
2287
2288 /* Test for forward reference or uncompleted reference. This is disabled
2289 when called to scan a completed pattern by setting cd->start_workspace to
2290 NULL. */
2291
2292 if (cd->start_workspace != NULL)
2293 {
2294 const pcre_uchar *tcode;
2295 for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2296 if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2297 if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
2298 }
2299
2300 /* If we are scanning a completed pattern, there are no forward references
2301 and all groups are complete. We need to detect whether this is a recursive
2302 call, as otherwise there will be an infinite loop. If it is a recursion,
2303 just skip over it. Simple recursions are easily detected. For mutual
2304 recursions we keep a chain on the stack. */
2305
2306 else
2307 {
2308 recurse_check *r = recurses;
2309 const pcre_uchar *endgroup = scode;
2310
2311 do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2312 if (code >= scode && code <= endgroup) continue; /* Simple recursion */
2313
2314 for (r = recurses; r != NULL; r = r->prev)
2315 if (r->group == scode) break;
2316 if (r != NULL) continue; /* Mutual recursion */
2317 }
2318
2319 /* Completed reference; scan the referenced group, remembering it on the
2320 stack chain to detect mutual recursions. */
2321
2322 empty_branch = FALSE;
2323 this_recurse.prev = recurses;
2324 this_recurse.group = scode;
2325
2326 do
2327 {
2328 if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
2329 {
2330 empty_branch = TRUE;
2331 break;
2332 }
2333 scode += GET(scode, 1);
2334 }
2335 while (*scode == OP_ALT);
2336
2337 if (!empty_branch) return FALSE; /* All branches are non-empty */
2338 continue;
2339 }
2340
2341 /* Groups with zero repeats can of course be empty; skip them. */
2342
2343 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2344 c == OP_BRAPOSZERO)
2345 {
2346 code += PRIV(OP_lengths)[c];
2347 do code += GET(code, 1); while (*code == OP_ALT);
2348 c = *code;
2349 continue;
2350 }
2351
2352 /* A nested group that is already marked as "could be empty" can just be
2353 skipped. */
2354
2355 if (c == OP_SBRA || c == OP_SBRAPOS ||
2356 c == OP_SCBRA || c == OP_SCBRAPOS)
2357 {
2358 do code += GET(code, 1); while (*code == OP_ALT);
2359 c = *code;
2360 continue;
2361 }
2362
2363 /* For other groups, scan the branches. */
2364
2365 if (c == OP_BRA || c == OP_BRAPOS ||
2366 c == OP_CBRA || c == OP_CBRAPOS ||
2367 c == OP_ONCE || c == OP_ONCE_NC ||
2368 c == OP_COND)
2369 {
2370 BOOL empty_branch;
2371 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
2372
2373 /* If a conditional group has only one branch, there is a second, implied,
2374 empty branch, so just skip over the conditional, because it could be empty.
2375 Otherwise, scan the individual branches of the group. */
2376
2377 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2378 code += GET(code, 1);
2379 else
2380 {
2381 empty_branch = FALSE;
2382 do
2383 {
2384 if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd, NULL))
2385 empty_branch = TRUE;
2386 code += GET(code, 1);
2387 }
2388 while (*code == OP_ALT);
2389 if (!empty_branch) return FALSE; /* All branches are non-empty */
2390 }
2391
2392 c = *code;
2393 continue;
2394 }
2395
2396 /* Handle the other opcodes */
2397
2398 switch (c)
2399 {
2400 /* Check for quantifiers after a class. XCLASS is used for classes that
2401 cannot be represented just by a bit map. This includes negated single
2402 high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2403 actual length is stored in the compiled code, so we must update "code"
2404 here. */
2405
2406 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2407 case OP_XCLASS:
2408 ccode = code += GET(code, 1);
2409 goto CHECK_CLASS_REPEAT;
2410 #endif
2411
2412 case OP_CLASS:
2413 case OP_NCLASS:
2414 ccode = code + PRIV(OP_lengths)[OP_CLASS];
2415
2416 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2417 CHECK_CLASS_REPEAT:
2418 #endif
2419
2420 switch (*ccode)
2421 {
2422 case OP_CRSTAR: /* These could be empty; continue */
2423 case OP_CRMINSTAR:
2424 case OP_CRQUERY:
2425 case OP_CRMINQUERY:
2426 case OP_CRPOSSTAR:
2427 case OP_CRPOSQUERY:
2428 break;
2429
2430 default: /* Non-repeat => class must match */
2431 case OP_CRPLUS: /* These repeats aren't empty */
2432 case OP_CRMINPLUS:
2433 case OP_CRPOSPLUS:
2434 return FALSE;
2435
2436 case OP_CRRANGE:
2437 case OP_CRMINRANGE:
2438 case OP_CRPOSRANGE:
2439 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2440 break;
2441 }
2442 break;
2443
2444 /* Opcodes that must match a character */
2445
2446 case OP_ANY:
2447 case OP_ALLANY:
2448 case OP_ANYBYTE:
2449
2450 case OP_PROP:
2451 case OP_NOTPROP:
2452 case OP_ANYNL:
2453
2454 case OP_NOT_HSPACE:
2455 case OP_HSPACE:
2456 case OP_NOT_VSPACE:
2457 case OP_VSPACE:
2458 case OP_EXTUNI:
2459
2460 case OP_NOT_DIGIT:
2461 case OP_DIGIT:
2462 case OP_NOT_WHITESPACE:
2463 case OP_WHITESPACE:
2464 case OP_NOT_WORDCHAR:
2465 case OP_WORDCHAR:
2466
2467 case OP_CHAR:
2468 case OP_CHARI:
2469 case OP_NOT:
2470 case OP_NOTI:
2471
2472 case OP_PLUS:
2473 case OP_PLUSI:
2474 case OP_MINPLUS:
2475 case OP_MINPLUSI:
2476
2477 case OP_NOTPLUS:
2478 case OP_NOTPLUSI:
2479 case OP_NOTMINPLUS:
2480 case OP_NOTMINPLUSI:
2481
2482 case OP_POSPLUS:
2483 case OP_POSPLUSI:
2484 case OP_NOTPOSPLUS:
2485 case OP_NOTPOSPLUSI:
2486
2487 case OP_EXACT:
2488 case OP_EXACTI:
2489 case OP_NOTEXACT:
2490 case OP_NOTEXACTI:
2491
2492 case OP_TYPEPLUS:
2493 case OP_TYPEMINPLUS:
2494 case OP_TYPEPOSPLUS:
2495 case OP_TYPEEXACT:
2496
2497 return FALSE;
2498
2499 /* These are going to continue, as they may be empty, but we have to
2500 fudge the length for the \p and \P cases. */
2501
2502 case OP_TYPESTAR:
2503 case OP_TYPEMINSTAR:
2504 case OP_TYPEPOSSTAR:
2505 case OP_TYPEQUERY:
2506 case OP_TYPEMINQUERY:
2507 case OP_TYPEPOSQUERY:
2508 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2509 break;
2510
2511 /* Same for these */
2512
2513 case OP_TYPEUPTO:
2514 case OP_TYPEMINUPTO:
2515 case OP_TYPEPOSUPTO:
2516 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2517 code += 2;
2518 break;
2519
2520 /* End of branch */
2521
2522 case OP_KET:
2523 case OP_KETRMAX:
2524 case OP_KETRMIN:
2525 case OP_KETRPOS:
2526 case OP_ALT:
2527 return TRUE;
2528
2529 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2530 MINUPTO, and POSUPTO and their caseless and negative versions may be
2531 followed by a multibyte character. */
2532
2533 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2534 case OP_STAR:
2535 case OP_STARI:
2536 case OP_NOTSTAR:
2537 case OP_NOTSTARI:
2538
2539 case OP_MINSTAR:
2540 case OP_MINSTARI:
2541 case OP_NOTMINSTAR:
2542 case OP_NOTMINSTARI:
2543
2544 case OP_POSSTAR:
2545 case OP_POSSTARI:
2546 case OP_NOTPOSSTAR:
2547 case OP_NOTPOSSTARI:
2548
2549 case OP_QUERY:
2550 case OP_QUERYI:
2551 case OP_NOTQUERY:
2552 case OP_NOTQUERYI:
2553
2554 case OP_MINQUERY:
2555 case OP_MINQUERYI:
2556 case OP_NOTMINQUERY:
2557 case OP_NOTMINQUERYI:
2558
2559 case OP_POSQUERY:
2560 case OP_POSQUERYI:
2561 case OP_NOTPOSQUERY:
2562 case OP_NOTPOSQUERYI:
2563
2564 if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2565 break;
2566
2567 case OP_UPTO:
2568 case OP_UPTOI:
2569 case OP_NOTUPTO:
2570 case OP_NOTUPTOI:
2571
2572 case OP_MINUPTO:
2573 case OP_MINUPTOI:
2574 case OP_NOTMINUPTO:
2575 case OP_NOTMINUPTOI:
2576
2577 case OP_POSUPTO:
2578 case OP_POSUPTOI:
2579 case OP_NOTPOSUPTO:
2580 case OP_NOTPOSUPTOI:
2581
2582 if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2583 break;
2584 #endif
2585
2586 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2587 string. */
2588
2589 case OP_MARK:
2590 case OP_PRUNE_ARG:
2591 case OP_SKIP_ARG:
2592 case OP_THEN_ARG:
2593 code += code[1];
2594 break;
2595
2596 /* None of the remaining opcodes are required to match a character. */
2597
2598 default:
2599 break;
2600 }
2601 }
2602
2603 return TRUE;
2604 }
2605
2606
2607
2608 /*************************************************
2609 * Scan compiled regex for non-emptiness *
2610 *************************************************/
2611
2612 /* This function is called to check for left recursive calls. We want to check
2613 the current branch of the current pattern to see if it could match the empty
2614 string. If it could, we must look outwards for branches at other levels,
2615 stopping when we pass beyond the bracket which is the subject of the recursion.
2616 This function is called only during the real compile, not during the
2617 pre-compile.
2618
2619 Arguments:
2620 code points to start of the recursion
2621 endcode points to where to stop (current RECURSE item)
2622 bcptr points to the chain of current (unclosed) branch starts
2623 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2624 cd pointers to tables etc
2625
2626 Returns: TRUE if what is matched could be empty
2627 */
2628
2629 static BOOL
2630 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2631 branch_chain *bcptr, BOOL utf, compile_data *cd)
2632 {
2633 while (bcptr != NULL && bcptr->current_branch >= code)
2634 {
2635 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL))
2636 return FALSE;
2637 bcptr = bcptr->outer;
2638 }
2639 return TRUE;
2640 }
2641
2642
2643
2644 /*************************************************
2645 * Base opcode of repeated opcodes *
2646 *************************************************/
2647
2648 /* Returns the base opcode for repeated single character type opcodes. If the
2649 opcode is not a repeated character type, it returns with the original value.
2650
2651 Arguments: c opcode
2652 Returns: base opcode for the type
2653 */
2654
2655 static pcre_uchar
2656 get_repeat_base(pcre_uchar c)
2657 {
2658 return (c > OP_TYPEPOSUPTO)? c :
2659 (c >= OP_TYPESTAR)? OP_TYPESTAR :
2660 (c >= OP_NOTSTARI)? OP_NOTSTARI :
2661 (c >= OP_NOTSTAR)? OP_NOTSTAR :
2662 (c >= OP_STARI)? OP_STARI :
2663 OP_STAR;
2664 }
2665
2666
2667
2668 #ifdef SUPPORT_UCP
2669 /*************************************************
2670 * Check a character and a property *
2671 *************************************************/
2672
2673 /* This function is called by check_auto_possessive() when a property item
2674 is adjacent to a fixed character.
2675
2676 Arguments:
2677 c the character
2678 ptype the property type
2679 pdata the data for the type
2680 negated TRUE if it's a negated property (\P or \p{^)
2681
2682 Returns: TRUE if auto-possessifying is OK
2683 */
2684
2685 static BOOL
2686 check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
2687 BOOL negated)
2688 {
2689 const pcre_uint32 *p;
2690 const ucd_record *prop = GET_UCD(c);
2691
2692 switch(ptype)
2693 {
2694 case PT_LAMP:
2695 return (prop->chartype == ucp_Lu ||
2696 prop->chartype == ucp_Ll ||
2697 prop->chartype == ucp_Lt) == negated;
2698
2699 case PT_GC:
2700 return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2701
2702 case PT_PC:
2703 return (pdata == prop->chartype) == negated;
2704
2705 case PT_SC:
2706 return (pdata == prop->script) == negated;
2707
2708 /* These are specials */
2709
2710 case PT_ALNUM:
2711 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2712 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2713
2714 /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
2715 means that Perl space and POSIX space are now identical. PCRE was changed
2716 at release 8.34. */
2717
2718 case PT_SPACE: /* Perl space */
2719 case PT_PXSPACE: /* POSIX space */
2720 switch(c)
2721 {
2722 HSPACE_CASES:
2723 VSPACE_CASES:
2724 return negated;
2725
2726 default:
2727 return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
2728 }
2729 break; /* Control never reaches here */
2730
2731 case PT_WORD:
2732 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2733 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2734 c == CHAR_UNDERSCORE) == negated;
2735
2736 case PT_CLIST:
2737 p = PRIV(ucd_caseless_sets) + prop->caseset;
2738 for (;;)
2739 {
2740 if (c < *p) return !negated;
2741 if (c == *p++) return negated;
2742 }
2743 break; /* Control never reaches here */
2744 }
2745
2746 return FALSE;
2747 }
2748 #endif /* SUPPORT_UCP */
2749
2750
2751
2752 /*************************************************
2753 * Fill the character property list *
2754 *************************************************/
2755
2756 /* Checks whether the code points to an opcode that can take part in auto-
2757 possessification, and if so, fills a list with its properties.
2758
2759 Arguments:
2760 code points to start of expression
2761 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2762 fcc points to case-flipping table
2763 list points to output list
2764 list[0] will be filled with the opcode
2765 list[1] will be non-zero if this opcode
2766 can match an empty character string
2767 list[2..7] depends on the opcode
2768
2769 Returns: points to the start of the next opcode if *code is accepted
2770 NULL if *code is not accepted
2771 */
2772
2773 static const pcre_uchar *
2774 get_chr_property_list(const pcre_uchar *code, BOOL utf,
2775 const pcre_uint8 *fcc, pcre_uint32 *list)
2776 {
2777 pcre_uchar c = *code;
2778 const pcre_uchar *end;
2779 const pcre_uint32 *clist_src;
2780 pcre_uint32 *clist_dest;
2781 pcre_uint32 chr;
2782 pcre_uchar base;
2783
2784 list[0] = c;
2785 list[1] = FALSE;
2786 code++;
2787
2788 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
2789 {
2790 base = get_repeat_base(c);
2791 c -= (base - OP_STAR);
2792
2793 if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
2794 code += IMM2_SIZE;
2795
2796 list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
2797
2798 switch(base)
2799 {
2800 case OP_STAR:
2801 list[0] = OP_CHAR;
2802 break;
2803
2804 case OP_STARI:
2805 list[0] = OP_CHARI;
2806 break;
2807
2808 case OP_NOTSTAR:
2809 list[0] = OP_NOT;
2810 break;
2811
2812 case OP_NOTSTARI:
2813 list[0] = OP_NOTI;
2814 break;
2815
2816 case OP_TYPESTAR:
2817 list[0] = *code;
2818 code++;
2819 break;
2820 }
2821 c = list[0];
2822 }
2823
2824 switch(c)
2825 {
2826 case OP_NOT_DIGIT:
2827 case OP_DIGIT:
2828 case OP_NOT_WHITESPACE:
2829 case OP_WHITESPACE:
2830 case OP_NOT_WORDCHAR:
2831 case OP_WORDCHAR:
2832 case OP_ANY:
2833 case OP_ALLANY:
2834 case OP_ANYNL:
2835 case OP_NOT_HSPACE:
2836 case OP_HSPACE:
2837 case OP_NOT_VSPACE:
2838 case OP_VSPACE:
2839 case OP_EXTUNI:
2840 case OP_EODN:
2841 case OP_EOD:
2842 case OP_DOLL:
2843 case OP_DOLLM:
2844 return code;
2845
2846 case OP_CHAR:
2847 case OP_NOT:
2848 GETCHARINCTEST(chr, code);
2849 list[2] = chr;
2850 list[3] = NOTACHAR;
2851 return code;
2852
2853 case OP_CHARI:
2854 case OP_NOTI:
2855 list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
2856 GETCHARINCTEST(chr, code);
2857 list[2] = chr;
2858
2859 #ifdef SUPPORT_UCP
2860 if (chr < 128 || (chr < 256 && !utf))
2861 list[3] = fcc[chr];
2862 else
2863 list[3] = UCD_OTHERCASE(chr);
2864 #elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
2865 list[3] = (chr < 256) ? fcc[chr] : chr;
2866 #else
2867 list[3] = fcc[chr];
2868 #endif
2869
2870 /* The othercase might be the same value. */
2871
2872 if (chr == list[3])
2873 list[3] = NOTACHAR;
2874 else
2875 list[4] = NOTACHAR;
2876 return code;
2877
2878 #ifdef SUPPORT_UCP
2879 case OP_PROP:
2880 case OP_NOTPROP:
2881 if (code[0] != PT_CLIST)
2882 {
2883 list[2] = code[0];
2884 list[3] = code[1];
2885 return code + 2;
2886 }
2887
2888 /* Convert only if we have enough space. */
2889
2890 clist_src = PRIV(ucd_caseless_sets) + code[1];
2891 clist_dest = list + 2;
2892 code += 2;
2893
2894 do {
2895 if (clist_dest >= list + 8)
2896 {
2897 /* Early return if there is not enough space. This should never
2898 happen, since all clists are shorter than 5 character now. */
2899 list[2] = code[0];
2900 list[3] = code[1];
2901 return code;
2902 }
2903 *clist_dest++ = *clist_src;
2904 }
2905 while(*clist_src++ != NOTACHAR);
2906
2907 /* All characters are stored. The terminating NOTACHAR
2908 is copied form the clist itself. */
2909
2910 list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
2911 return code;
2912 #endif
2913
2914 case OP_NCLASS:
2915 case OP_CLASS:
2916 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2917 case OP_XCLASS:
2918
2919 if (c == OP_XCLASS)
2920 end = code + GET(code, 0);
2921 else
2922 #endif
2923 end = code + 32 / sizeof(pcre_uchar);
2924
2925 switch(*end)
2926 {
2927 case OP_CRSTAR:
2928 case OP_CRMINSTAR:
2929 case OP_CRQUERY:
2930 case OP_CRMINQUERY:
2931 case OP_CRPOSSTAR:
2932 case OP_CRPOSQUERY:
2933 list[1] = TRUE;
2934 end++;
2935 break;
2936
2937 case OP_CRPLUS:
2938 case OP_CRMINPLUS:
2939 case OP_CRPOSPLUS:
2940 end++;
2941 break;
2942
2943 case OP_CRRANGE:
2944 case OP_CRMINRANGE:
2945 case OP_CRPOSRANGE:
2946 list[1] = (GET2(end, 1) == 0);
2947 end += 1 + 2 * IMM2_SIZE;
2948 break;
2949 }
2950 list[2] = end - code;
2951 return end;
2952 }
2953 return NULL; /* Opcode not accepted */
2954 }
2955
2956
2957
2958 /*************************************************
2959 * Scan further character sets for match *
2960 *************************************************/
2961
2962 /* Checks whether the base and the current opcode have a common character, in
2963 which case the base cannot be possessified.
2964
2965 Arguments:
2966 code points to the byte code
2967 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2968 cd static compile data
2969 base_list the data list of the base opcode
2970
2971 Returns: TRUE if the auto-possessification is possible
2972 */
2973
2974 static BOOL
2975 compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
2976 const pcre_uint32* base_list, const pcre_uchar *base_end)
2977 {
2978 pcre_uchar c;
2979 pcre_uint32 list[8];
2980 const pcre_uint32* chr_ptr;
2981 const pcre_uint32* ochr_ptr;
2982 const pcre_uint32* list_ptr;
2983 const pcre_uchar *next_code;
2984 const pcre_uint8 *class_bits;
2985 pcre_uint32 chr;
2986
2987 /* Note: the base_list[1] contains whether the current opcode has greedy
2988 (represented by a non-zero value) quantifier. This is a different from
2989 other character type lists, which stores here that the character iterator
2990 matches to an empty string (also represented by a non-zero value). */
2991
2992 for(;;)
2993 {
2994 c = *code;
2995
2996 /* Skip over callouts */
2997
2998 if (c == OP_CALLOUT)
2999 {
3000 code += PRIV(OP_lengths)[c];
3001 continue;
3002 }
3003
3004 if (c == OP_ALT)
3005 {
3006 do code += GET(code, 1); while (*code == OP_ALT);
3007 c = *code;
3008 }
3009
3010 switch(c)
3011 {
3012 case OP_END:
3013 case OP_KETRPOS:
3014 /* TRUE only in greedy case. The non-greedy case could be replaced by
3015 an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
3016 uses more memory, which we cannot get at this stage.) */
3017
3018 return base_list[1] != 0;
3019
3020 case OP_KET:
3021 /* If the bracket is capturing, and referenced by an OP_RECURSE, or
3022 it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3023 cannot be converted to a possessive form. */
3024
3025 if (base_list[1] == 0) return FALSE;
3026
3027 switch(*(code - GET(code, 1)))
3028 {
3029 case OP_ASSERT:
3030 case OP_ASSERT_NOT:
3031 case OP_ASSERTBACK:
3032 case OP_ASSERTBACK_NOT:
3033 case OP_ONCE:
3034 case OP_ONCE_NC:
3035 /* Atomic sub-patterns and assertions can always auto-possessify their
3036 last iterator. */
3037 return TRUE;
3038 }
3039
3040 code += PRIV(OP_lengths)[c];
3041 continue;
3042
3043 case OP_ONCE:
3044 case OP_ONCE_NC:
3045 case OP_BRA:
3046 case OP_CBRA:
3047 next_code = code;
3048 do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3049
3050 /* We do not support repeated brackets, because they can lead to
3051 infinite recursion. */
3052
3053 if (*next_code != OP_KET) return FALSE;
3054
3055 next_code = code + GET(code, 1);
3056 code += PRIV(OP_lengths)[c];
3057
3058 while (*next_code == OP_ALT)
3059 {
3060 if (!compare_opcodes(code, utf, cd, base_list, base_end)) return FALSE;
3061 code = next_code + 1 + LINK_SIZE;
3062 next_code += GET(next_code, 1);
3063 }
3064 continue;
3065
3066 case OP_BRAZERO:
3067 case OP_BRAMINZERO:
3068
3069 next_code = code + 1;
3070 if (*next_code != OP_BRA && *next_code != OP_CBRA)
3071 return FALSE;
3072
3073 do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3074
3075 /* We do not support repeated brackets, because they can lead to
3076 infinite recursion. */
3077 if (*next_code != OP_KET) return FALSE;
3078
3079 /* The bracket content will be checked by the
3080 OP_BRA/OP_CBRA case above. */
3081 next_code += 1 + LINK_SIZE;
3082 if (!compare_opcodes(next_code, utf, cd, base_list, base_end)) return FALSE;
3083
3084 code += PRIV(OP_lengths)[c];
3085 continue;
3086 }
3087
3088 /* Check for a supported opcode, and load its properties. */
3089
3090 code = get_chr_property_list(code, utf, cd->fcc, list);
3091 if (code == NULL) return FALSE; /* Unsupported */
3092
3093 /* If either opcode is a small character list, set pointers for comparing
3094 characters from that list with another list, or with a property. */
3095
3096 if (base_list[0] == OP_CHAR)
3097 {
3098 chr_ptr = base_list + 2;
3099 list_ptr = list;
3100 }
3101 else if (list[0] == OP_CHAR)
3102 {
3103 chr_ptr = list + 2;
3104 list_ptr = base_list;
3105 }
3106
3107 /* Some property combinations also acceptable. Unicode property opcodes are
3108 processed specially; the rest can be handled with a lookup table. */
3109
3110 else
3111 {
3112 pcre_uint32 leftop, rightop;
3113
3114 if (list[1] != 0) return FALSE; /* Must match at least one character */
3115 leftop = base_list[0];
3116 rightop = list[0];
3117
3118 #ifdef SUPPORT_UCP
3119 if (leftop == OP_PROP || leftop == OP_NOTPROP)
3120 {
3121 if (rightop == OP_EOD) return TRUE;
3122 if (rightop == OP_PROP || rightop == OP_NOTPROP)
3123 {
3124 int n;
3125 const pcre_uint8 *p;
3126 BOOL same = leftop == rightop;
3127 BOOL lisprop = leftop == OP_PROP;
3128 BOOL risprop = rightop == OP_PROP;
3129 BOOL bothprop = lisprop && risprop;
3130
3131 /* There's a table that specifies how each combination is to be
3132 processed:
3133 0 Always return FALSE (never auto-possessify)
3134 1 Character groups are distinct (possessify if both are OP_PROP)
3135 2 Check character categories in the same group (general or particular)
3136 3 Return TRUE if the two opcodes are not the same
3137 ... see comments below
3138 */
3139
3140 n = propposstab[base_list[2]][list[2]];
3141 switch(n)
3142 {
3143 case 0: return FALSE;
3144 case 1: return bothprop;
3145 case 2: return (base_list[3] == list[3]) != same;
3146 case 3: return !same;
3147
3148 case 4: /* Left general category, right particular category */
3149 return risprop && catposstab[base_list[3]][list[3]] == same;
3150
3151 case 5: /* Right general category, left particular category */
3152 return lisprop && catposstab[list[3]][base_list[3]] == same;
3153
3154 /* This code is logically tricky. Think hard before fiddling with it.
3155 The posspropstab table has four entries per row. Each row relates to
3156 one of PCRE's special properties such as ALNUM or SPACE or WORD.
3157 Only WORD actually needs all four entries, but using repeats for the
3158 others means they can all use the same code below.
3159
3160 The first two entries in each row are Unicode general categories, and
3161 apply always, because all the characters they include are part of the
3162 PCRE character set. The third and fourth entries are a general and a
3163 particular category, respectively, that include one or more relevant
3164 characters. One or the other is used, depending on whether the check
3165 is for a general or a particular category. However, in both cases the
3166 category contains more characters than the specials that are defined
3167 for the property being tested against. Therefore, it cannot be used
3168 in a NOTPROP case.
3169
3170 Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3171 Underscore is covered by ucp_P or ucp_Po. */
3172
3173 case 6: /* Left alphanum vs right general category */
3174 case 7: /* Left space vs right general category */
3175 case 8: /* Left word vs right general category */
3176 p = posspropstab[n-6];
3177 return risprop && lisprop ==
3178 (list[3] != p[0] &&
3179 list[3] != p[1] &&
3180 (list[3] != p[2] || !lisprop));
3181
3182 case 9: /* Right alphanum vs left general category */
3183 case 10: /* Right space vs left general category */
3184 case 11: /* Right word vs left general category */
3185 p = posspropstab[n-9];
3186 return lisprop && risprop ==
3187 (base_list[3] != p[0] &&
3188 base_list[3] != p[1] &&
3189 (base_list[3] != p[2] || !risprop));
3190
3191 case 12: /* Left alphanum vs right particular category */
3192 case 13: /* Left space vs right particular category */
3193 case 14: /* Left word vs right particular category */
3194 p = posspropstab[n-12];
3195 return risprop && lisprop ==
3196 (catposstab[p[0]][list[3]] &&
3197 catposstab[p[1]][list[3]] &&
3198 (list[3] != p[3] || !lisprop));
3199
3200 case 15: /* Right alphanum vs left particular category */
3201 case 16: /* Right space vs left particular category */
3202 case 17: /* Right word vs left particular category */
3203 p = posspropstab[n-15];
3204 return lisprop && risprop ==
3205 (catposstab[p[0]][base_list[3]] &&
3206 catposstab[p[1]][base_list[3]] &&
3207 (base_list[3] != p[3] || !risprop));
3208 }
3209 }
3210 return FALSE;
3211 }
3212
3213 else
3214 #endif /* SUPPORT_UCP */
3215
3216 return leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3217 rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3218 autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3219 }
3220
3221 /* Control reaches here only if one of the items is a small character list.
3222 All characters are checked against the other side. */
3223
3224 do
3225 {
3226 chr = *chr_ptr;
3227
3228 switch(list_ptr[0])
3229 {
3230 case OP_CHAR:
3231 ochr_ptr = list_ptr + 2;
3232 do
3233 {
3234 if (chr == *ochr_ptr) return FALSE;
3235 ochr_ptr++;
3236 }
3237 while(*ochr_ptr != NOTACHAR);
3238 break;
3239
3240 case OP_NOT:
3241 ochr_ptr = list_ptr + 2;
3242 do
3243 {
3244 if (chr == *ochr_ptr)
3245 break;
3246 ochr_ptr++;
3247 }
3248 while(*ochr_ptr != NOTACHAR);
3249 if (*ochr_ptr == NOTACHAR) return FALSE; /* Not found */
3250 break;
3251
3252 /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
3253 set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3254
3255 case OP_DIGIT:
3256 if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
3257 break;
3258
3259 case OP_NOT_DIGIT:
3260 if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
3261 break;
3262
3263 case OP_WHITESPACE:
3264 if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
3265 break;
3266
3267 case OP_NOT_WHITESPACE:
3268 if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
3269 break;
3270
3271 case OP_WORDCHAR:
3272 if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
3273 break;
3274
3275 case OP_NOT_WORDCHAR:
3276 if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
3277 break;
3278
3279 case OP_HSPACE:
3280 switch(chr)
3281 {
3282 HSPACE_CASES: return FALSE;
3283 default: break;
3284 }
3285 break;
3286
3287 case OP_NOT_HSPACE:
3288 switch(chr)
3289 {
3290 HSPACE_CASES: break;
3291 default: return FALSE;
3292 }
3293 break;
3294
3295 case OP_ANYNL:
3296 case OP_VSPACE:
3297 switch(chr)
3298 {
3299 VSPACE_CASES: return FALSE;
3300 default: break;
3301 }
3302 break;
3303
3304 case OP_NOT_VSPACE:
3305 switch(chr)
3306 {
3307 VSPACE_CASES: break;
3308 default: return FALSE;
3309 }
3310 break;
3311
3312 case OP_DOLL:
3313 case OP_EODN:
3314 switch (chr)
3315 {
3316 case CHAR_CR:
3317 case CHAR_LF:
3318 case CHAR_VT:
3319 case CHAR_FF:
3320 case CHAR_NEL:
3321 #ifndef EBCDIC
3322 case 0x2028:
3323 case 0x2029:
3324 #endif /* Not EBCDIC */
3325 return FALSE;
3326 }
3327 break;
3328
3329 case OP_EOD: /* Can always possessify before \z */
3330 break;
3331
3332 case OP_PROP:
3333 case OP_NOTPROP:
3334 if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3335 list_ptr[0] == OP_NOTPROP))
3336 return FALSE;
3337 break;
3338
3339 case OP_NCLASS:
3340 if (chr > 255) return FALSE;
3341 /* Fall through */
3342
3343 case OP_CLASS:
3344 if (chr > 255) break;
3345 class_bits = (pcre_uint8 *)((list_ptr == list ? code : base_end) - list_ptr[2]);
3346 if ((class_bits[chr >> 3] & (1 << (chr & 7))) != 0)
3347 return FALSE;
3348 break;
3349
3350 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3351 case OP_XCLASS:
3352 if (list_ptr != list) return FALSE; /* Class is first opcode */
3353 if (PRIV(xclass)(chr, code - list_ptr[2] + LINK_SIZE, utf))
3354 return FALSE;
3355 break;
3356 #endif
3357
3358 default:
3359 return FALSE;
3360 }
3361
3362 chr_ptr++;
3363 }
3364 while(*chr_ptr != NOTACHAR);
3365
3366 /* At least one character must be matched from this opcode. */
3367
3368 if (list[1] == 0) return TRUE;
3369 }
3370
3371 return FALSE;
3372 }
3373
3374
3375
3376 /*************************************************
3377 * Scan compiled regex for auto-possession *
3378 *************************************************/
3379
3380 /* Replaces single character iterations with their possessive alternatives
3381 if appropriate. This function modifies the compiled opcode!
3382
3383 Arguments:
3384 code points to start of the byte code
3385 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3386 cd static compile data
3387
3388 Returns: nothing
3389 */
3390
3391 static void
3392 auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
3393 {
3394 register pcre_uchar c, d;
3395 const pcre_uchar *end;
3396 pcre_uchar *repeat_code;
3397 pcre_uint32 list[8];
3398
3399 for (;;)
3400 {
3401 c = *code;
3402
3403 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
3404 {
3405 c -= get_repeat_base(c) - OP_STAR;
3406 end = (c <= OP_MINUPTO) ?
3407 get_chr_property_list(code, utf, cd->fcc, list) : NULL;
3408 list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3409
3410 if (end != NULL && compare_opcodes(end, utf, cd, list, end))
3411 {
3412 switch(c)
3413 {
3414 case OP_STAR:
3415 *code += OP_POSSTAR - OP_STAR;
3416 break;
3417
3418 case OP_MINSTAR:
3419 *code += OP_POSSTAR - OP_MINSTAR;
3420 break;
3421
3422 case OP_PLUS:
3423 *code += OP_POSPLUS - OP_PLUS;
3424 break;
3425
3426 case OP_MINPLUS:
3427 *code += OP_POSPLUS - OP_MINPLUS;
3428 break;
3429
3430 case OP_QUERY:
3431 *code += OP_POSQUERY - OP_QUERY;
3432 break;
3433
3434 case OP_MINQUERY:
3435 *code += OP_POSQUERY - OP_MINQUERY;
3436 break;
3437
3438 case OP_UPTO:
3439 *code += OP_POSUPTO - OP_UPTO;
3440 break;
3441
3442 case OP_MINUPTO:
3443 *code += OP_MINUPTO - OP_UPTO;
3444 break;
3445 }
3446 }
3447 c = *code;
3448 }
3449 else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
3450 {
3451 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3452 if (c == OP_XCLASS)
3453 repeat_code = code + 1 + GET(code, 1);
3454 else
3455 #endif
3456 repeat_code = code + 1 + (32 / sizeof(pcre_uchar));
3457
3458 d = *repeat_code;
3459 if (d >= OP_CRSTAR && d <= OP_CRMINRANGE)
3460 {
3461 /* end must not be NULL. */
3462 end = get_chr_property_list(code, utf, cd->fcc, list);
3463
3464 list[1] = d == OP_CRSTAR || d == OP_CRPLUS || d == OP_CRQUERY ||
3465 d == OP_CRRANGE;
3466
3467 if (compare_opcodes(end, utf, cd, list, end))
3468 {
3469 switch (d)
3470 {
3471 case OP_CRSTAR:
3472 *repeat_code = OP_CRPOSSTAR;
3473 break;
3474
3475 case OP_CRPLUS:
3476 *repeat_code = OP_CRPOSPLUS;
3477 break;
3478
3479 case OP_CRQUERY:
3480 *repeat_code = OP_CRPOSQUERY;
3481 break;
3482
3483 case OP_CRRANGE:
3484 *repeat_code = OP_CRPOSRANGE;
3485 break;
3486 }
3487 }
3488 }
3489 }
3490
3491 switch(c)
3492 {
3493 case OP_END:
3494 return;
3495
3496 case OP_TYPESTAR:
3497 case OP_TYPEMINSTAR:
3498 case OP_TYPEPLUS:
3499 case OP_TYPEMINPLUS:
3500 case OP_TYPEQUERY:
3501 case OP_TYPEMINQUERY:
3502 case OP_TYPEPOSSTAR:
3503 case OP_TYPEPOSPLUS:
3504 case OP_TYPEPOSQUERY:
3505 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
3506 break;
3507
3508 case OP_TYPEUPTO:
3509 case OP_TYPEMINUPTO:
3510 case OP_TYPEEXACT:
3511 case OP_TYPEPOSUPTO:
3512 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
3513 code += 2;
3514 break;
3515
3516 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3517 case OP_XCLASS:
3518 code += GET(code, 1);
3519 break;
3520 #endif
3521
3522 case OP_MARK:
3523 case OP_PRUNE_ARG:
3524 case OP_SKIP_ARG:
3525 case OP_THEN_ARG:
3526 code += code[1];
3527 break;
3528 }
3529
3530 /* Add in the fixed length from the table */
3531
3532 code += PRIV(OP_lengths)[c];
3533
3534 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
3535 a multi-byte character. The length in the table is a minimum, so we have to
3536 arrange to skip the extra bytes. */
3537
3538 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
3539 if (utf) switch(c)
3540 {
3541 case OP_CHAR:
3542 case OP_CHARI:
3543 case OP_NOT:
3544 case OP_NOTI:
3545 case OP_STAR:
3546 case OP_MINSTAR:
3547 case OP_PLUS:
3548 case OP_MINPLUS:
3549 case OP_QUERY:
3550 case OP_MINQUERY:
3551 case OP_UPTO:
3552 case OP_MINUPTO:
3553 case OP_EXACT:
3554 case OP_POSSTAR:
3555 case OP_POSPLUS:
3556 case OP_POSQUERY:
3557 case OP_POSUPTO:
3558 case OP_STARI:
3559 case OP_MINSTARI:
3560 case OP_PLUSI:
3561 case OP_MINPLUSI:
3562 case OP_QUERYI:
3563 case OP_MINQUERYI:
3564 case OP_UPTOI:
3565 case OP_MINUPTOI:
3566 case OP_EXACTI:
3567 case OP_POSSTARI:
3568 case OP_POSPLUSI:
3569 case OP_POSQUERYI:
3570 case OP_POSUPTOI:
3571 case OP_NOTSTAR:
3572 case OP_NOTMINSTAR:
3573 case OP_NOTPLUS:
3574 case OP_NOTMINPLUS:
3575 case OP_NOTQUERY:
3576 case OP_NOTMINQUERY:
3577 case OP_NOTUPTO:
3578 case OP_NOTMINUPTO:
3579 case OP_NOTEXACT:
3580 case OP_NOTPOSSTAR:
3581 case OP_NOTPOSPLUS:
3582 case OP_NOTPOSQUERY:
3583 case OP_NOTPOSUPTO:
3584 case OP_NOTSTARI:
3585 case OP_NOTMINSTARI:
3586 case OP_NOTPLUSI:
3587 case OP_NOTMINPLUSI:
3588 case OP_NOTQUERYI:
3589 case OP_NOTMINQUERYI:
3590 case OP_NOTUPTOI:
3591 case OP_NOTMINUPTOI:
3592 case OP_NOTEXACTI:
3593 case OP_NOTPOSSTARI:
3594 case OP_NOTPOSPLUSI:
3595 case OP_NOTPOSQUERYI:
3596 case OP_NOTPOSUPTOI:
3597 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
3598 break;
3599 }
3600 #else
3601 (void)(utf); /* Keep compiler happy by referencing function argument */
3602 #endif
3603 }
3604 }
3605
3606
3607
3608 /*************************************************
3609 * Check for POSIX class syntax *
3610 *************************************************/
3611
3612 /* This function is called when the sequence "[:" or "[." or "[=" is
3613 encountered in a character class. It checks whether this is followed by a
3614 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
3615 reach an unescaped ']' without the special preceding character, return FALSE.
3616
3617 Originally, this function only recognized a sequence of letters between the
3618 terminators, but it seems that Perl recognizes any sequence of characters,
3619 though of course unknown POSIX names are subsequently rejected. Perl gives an
3620 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
3621 didn't consider this to be a POSIX class. Likewise for [:1234:].
3622
3623 The problem in trying to be exactly like Perl is in the handling of escapes. We
3624 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
3625 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
3626 below handles the special case of \], but does not try to do any other escape
3627 processing. This makes it different from Perl for cases such as [:l\ower:]
3628 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
3629 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
3630 I think.
3631
3632 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
3633 It seems that the appearance of a nested POSIX class supersedes an apparent
3634 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
3635 a digit.
3636
3637 In Perl, unescaped square brackets may also appear as part of class names. For
3638 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
3639 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
3640 seem right at all. PCRE does not allow closing square brackets in POSIX class
3641 names.
3642
3643 Arguments:
3644 ptr pointer to the initial [
3645 endptr where to return the end pointer
3646
3647 Returns: TRUE or FALSE
3648 */
3649
3650 static BOOL
3651 check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
3652 {
3653 pcre_uchar terminator; /* Don't combine these lines; the Solaris cc */
3654 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
3655 for (++ptr; *ptr != CHAR_NULL; ptr++)
3656 {
3657 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3658 ptr++;
3659 else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
3660 else
3661 {
3662 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3663 {
3664 *endptr = ptr;
3665 return TRUE;
3666 }
3667 if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
3668 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3669 ptr[1] == CHAR_EQUALS_SIGN) &&
3670 check_posix_syntax(ptr, endptr))
3671 return FALSE;
3672 }
3673 }
3674 return FALSE;
3675 }
3676
3677
3678
3679
3680 /*************************************************
3681 * Check POSIX class name *
3682 *************************************************/
3683
3684 /* This function is called to check the name given in a POSIX-style class entry
3685 such as [:alnum:].
3686
3687 Arguments:
3688 ptr points to the first letter
3689 len the length of the name
3690
3691 Returns: a value representing the name, or -1 if unknown
3692 */
3693
3694 static int
3695 check_posix_name(const pcre_uchar *ptr, int len)
3696 {
3697 const char *pn = posix_names;
3698 register int yield = 0;
3699 while (posix_name_lengths[yield] != 0)
3700 {
3701 if (len == posix_name_lengths[yield] &&
3702 STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
3703 pn += posix_name_lengths[yield] + 1;
3704 yield++;
3705 }
3706 return -1;
3707 }
3708
3709
3710 /*************************************************
3711 * Adjust OP_RECURSE items in repeated group *
3712 *************************************************/
3713
3714 /* OP_RECURSE items contain an offset from the start of the regex to the group
3715 that is referenced. This means that groups can be replicated for fixed
3716 repetition simply by copying (because the recursion is allowed to refer to
3717 earlier groups that are outside the current group). However, when a group is
3718 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
3719 inserted before it, after it has been compiled. This means that any OP_RECURSE
3720 items within it that refer to the group itself or any contained groups have to
3721 have their offsets adjusted. That is one of the jobs of this function. Before it
3722 is called, the partially compiled regex must be temporarily terminated with
3723 OP_END.
3724
3725 This function has been extended with the possibility of forward references for
3726 recursions and subroutine calls. It must also check the list of such references
3727 for the group we are dealing with. If it finds that one of the recursions in
3728 the current group is on this list, it adjusts the offset in the list, not the
3729 value in the reference (which is a group number).
3730
3731 Arguments:
3732 group points to the start of the group
3733 adjust the amount by which the group is to be moved
3734 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3735 cd contains pointers to tables etc.
3736 save_hwm the hwm forward reference pointer at the start of the group
3737
3738 Returns: nothing
3739 */
3740
3741 static void
3742 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
3743 pcre_uchar *save_hwm)
3744 {
3745 pcre_uchar *ptr = group;
3746
3747 while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
3748 {
3749 int offset;
3750 pcre_uchar *hc;
3751
3752 /* See if this recursion is on the forward reference list. If so, adjust the
3753 reference. */
3754
3755 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
3756 {
3757 offset = (int)GET(hc, 0);
3758 if (cd->start_code + offset == ptr + 1)
3759 {
3760 PUT(hc, 0, offset + adjust);
3761 break;
3762 }
3763 }
3764
3765 /* Otherwise, adjust the recursion offset if it's after the start of this
3766 group. */
3767
3768 if (hc >= cd->hwm)
3769 {
3770 offset = (int)GET(ptr, 1);
3771 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
3772 }
3773
3774 ptr += 1 + LINK_SIZE;
3775 }
3776 }
3777
3778
3779
3780 /*************************************************
3781 * Insert an automatic callout point *
3782 *************************************************/
3783
3784 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
3785 callout points before each pattern item.
3786
3787 Arguments:
3788 code current code pointer
3789 ptr current pattern pointer
3790 cd pointers to tables etc
3791
3792 Returns: new code pointer
3793 */
3794
3795 static pcre_uchar *
3796 auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
3797 {
3798 *code++ = OP_CALLOUT;
3799 *code++ = 255;
3800 PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
3801 PUT(code, LINK_SIZE, 0); /* Default length */
3802 return code + 2 * LINK_SIZE;
3803 }
3804
3805
3806
3807 /*************************************************
3808 * Complete a callout item *
3809 *************************************************/
3810
3811 /* A callout item contains the length of the next item in the pattern, which
3812 we can't fill in till after we have reached the relevant point. This is used
3813 for both automatic and manual callouts.
3814
3815 Arguments:
3816 previous_callout points to previous callout item
3817 ptr current pattern pointer
3818 cd pointers to tables etc
3819
3820 Returns: nothing
3821 */
3822
3823 static void
3824 complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
3825 {
3826 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
3827 PUT(previous_callout, 2 + LINK_SIZE, length);
3828 }
3829
3830
3831
3832 #ifdef SUPPORT_UCP
3833 /*************************************************
3834 * Get othercase range *
3835 *************************************************/
3836
3837 /* This function is passed the start and end of a class range, in UTF-8 mode
3838 with UCP support. It searches up the characters, looking for ranges of
3839 characters in the "other" case. Each call returns the next one, updating the
3840 start address. A character with multiple other cases is returned on its own
3841 with a special return value.
3842
3843 Arguments:
3844 cptr points to starting character value; updated
3845 d end value
3846 ocptr where to put start of othercase range
3847 odptr where to put end of othercase range
3848
3849 Yield: -1 when no more
3850 0 when a range is returned
3851 >0 the CASESET offset for char with multiple other cases
3852 in this case, ocptr contains the original
3853 */
3854
3855 static int
3856 get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
3857 pcre_uint32 *odptr)
3858 {
3859 pcre_uint32 c, othercase, next;
3860 unsigned int co;
3861
3862 /* Find the first character that has an other case. If it has multiple other
3863 cases, return its case offset value. */
3864
3865 for (c = *cptr; c <= d; c++)
3866 {
3867 if ((co = UCD_CASESET(c)) != 0)
3868 {
3869 *ocptr = c++; /* Character that has the set */
3870 *cptr = c; /* Rest of input range */
3871 return (int)co;
3872 }
3873 if ((othercase = UCD_OTHERCASE(c)) != c) break;
3874 }
3875
3876 if (c > d) return -1; /* Reached end of range */
3877
3878 *ocptr = othercase;
3879 next = othercase + 1;
3880
3881 for (++c; c <= d; c++)
3882 {
3883 if (UCD_OTHERCASE(c) != next) break;
3884 next++;
3885 }
3886
3887 *odptr = next - 1; /* End of othercase range */
3888 *cptr = c; /* Rest of input range */
3889 return 0;
3890 }
3891 #endif /* SUPPORT_UCP */
3892
3893
3894
3895 /*************************************************
3896 * Add a character or range to a class *
3897 *************************************************/
3898
3899 /* This function packages up the logic of adding a character or range of
3900 characters to a class. The character values in the arguments will be within the
3901 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
3902 mutually recursive with the function immediately below.
3903
3904 Arguments:
3905 classbits the bit map for characters < 256
3906 uchardptr points to the pointer for extra data
3907 options the options word
3908 cd contains pointers to tables etc.
3909 start start of range character
3910 end end of range character
3911
3912 Returns: the number of < 256 characters added
3913 the pointer to extra data is updated
3914 */
3915
3916 static int
3917 add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
3918 compile_data *cd, pcre_uint32 start, pcre_uint32 end)
3919 {
3920 pcre_uint32 c;
3921 int n8 = 0;
3922
3923 /* If caseless matching is required, scan the range and process alternate
3924 cases. In Unicode, there are 8-bit characters that have alternate cases that
3925 are greater than 255 and vice-versa. Sometimes we can just extend the original
3926 range. */
3927
3928 if ((options & PCRE_CASELESS) != 0)
3929 {
3930 #ifdef SUPPORT_UCP
3931 if ((options & PCRE_UTF8) != 0)
3932 {
3933 int rc;
3934 pcre_uint32 oc, od;
3935
3936 options &= ~PCRE_CASELESS; /* Remove for recursive calls */
3937 c = start;
3938
3939 while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
3940 {
3941 /* Handle a single character that has more than one other case. */
3942
3943 if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
3944 PRIV(ucd_caseless_sets) + rc, oc);
3945
3946 /* Do nothing if the other case range is within the original range. */
3947
3948 else if (oc >= start && od <= end) continue;
3949
3950 /* Extend the original range if there is overlap, noting that if oc < c, we
3951 can't have od > end because a subrange is always shorter than the basic
3952 range. Otherwise, use a recursive call to add the additional range. */
3953
3954 else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
3955 else if (od > end && oc <= end + 1) end = od; /* Extend upwards */
3956 else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
3957 }
3958 }
3959 else
3960 #endif /* SUPPORT_UCP */
3961
3962 /* Not UTF-mode, or no UCP */
3963
3964 for (c = start; c <= end && c < 256; c++)
3965 {
3966 SETBIT(classbits, cd->fcc[c]);
3967 n8++;
3968 }
3969 }
3970
3971 /* Now handle the original range. Adjust the final value according to the bit
3972 length - this means that the same lists of (e.g.) horizontal spaces can be used
3973 in all cases. */
3974
3975 #if defined COMPILE_PCRE8
3976 #ifdef SUPPORT_UTF
3977 if ((options & PCRE_UTF8) == 0)
3978 #endif
3979 if (end > 0xff) end = 0xff;
3980
3981 #elif defined COMPILE_PCRE16
3982 #ifdef SUPPORT_UTF
3983 if ((options & PCRE_UTF16) == 0)
3984 #endif
3985 if (end > 0xffff) end = 0xffff;
3986
3987 #endif /* COMPILE_PCRE[8|16] */
3988
3989 /* If all characters are less than 256, use the bit map. Otherwise use extra
3990 data. */
3991
3992 if (end < 0x100)
3993 {
3994 for (c = start; c <= end; c++)
3995 {
3996 n8++;
3997 SETBIT(classbits, c);
3998 }
3999 }
4000
4001 else
4002 {
4003 pcre_uchar *uchardata = *uchardptr;
4004
4005 #ifdef SUPPORT_UTF
4006 if ((options & PCRE_UTF8) != 0) /* All UTFs use the same flag bit */
4007 {
4008 if (start < end)
4009 {
4010 *uchardata++ = XCL_RANGE;
4011 uchardata += PRIV(ord2utf)(start, uchardata);
4012 uchardata += PRIV(ord2utf)(end, uchardata);
4013 }
4014 else if (start == end)
4015 {
4016 *uchardata++ = XCL_SINGLE;
4017 uchardata += PRIV(ord2utf)(start, uchardata);
4018 }
4019 }
4020 else
4021 #endif /* SUPPORT_UTF */
4022
4023 /* Without UTF support, character values are constrained by the bit length,
4024 and can only be > 256 for 16-bit and 32-bit libraries. */
4025
4026 #ifdef COMPILE_PCRE8
4027 {}
4028 #else
4029 if (start < end)
4030 {
4031 *uchardata++ = XCL_RANGE;
4032 *uchardata++ = start;
4033 *uchardata++ = end;
4034 }
4035 else if (start == end)
4036 {
4037 *uchardata++ = XCL_SINGLE;
4038 *uchardata++ = start;
4039 }
4040 #endif
4041
4042 *uchardptr = uchardata; /* Updata extra data pointer */
4043 }
4044
4045 return n8; /* Number of 8-bit characters */
4046 }
4047
4048
4049
4050
4051 /*************************************************
4052 * Add a list of characters to a class *
4053 *************************************************/
4054
4055 /* This function is used for adding a list of case-equivalent characters to a
4056 class, and also for adding a list of horizontal or vertical whitespace. If the
4057 list is in order (which it should be), ranges of characters are detected and
4058 handled appropriately. This function is mutually recursive with the function
4059 above.
4060
4061 Arguments:
4062 classbits the bit map for characters < 256
4063 uchardptr points to the pointer for extra data
4064 options the options word
4065 cd contains pointers to tables etc.
4066 p points to row of 32-bit values, terminated by NOTACHAR
4067 except character to omit; this is used when adding lists of
4068 case-equivalent characters to avoid including the one we
4069 already know about
4070
4071 Returns: the number of < 256 characters added
4072 the pointer to extra data is updated
4073 */
4074
4075 static int
4076 add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4077 compile_data *cd, const pcre_uint32 *p, unsigned int except)
4078 {
4079 int n8 = 0;
4080 while (p[0] < NOTACHAR)
4081 {
4082 int n = 0;
4083 if (p[0] != except)
4084 {
4085 while(p[n+1] == p[0] + n + 1) n++;
4086 n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
4087 }
4088 p += n + 1;
4089 }
4090 return n8;
4091 }
4092
4093
4094
4095 /*************************************************
4096 * Add characters not in a list to a class *
4097 *************************************************/
4098
4099 /* This function is used for adding the complement of a list of horizontal or
4100 vertical whitespace to a class. The list must be in order.
4101
4102 Arguments:
4103 classbits the bit map for characters < 256
4104 uchardptr points to the pointer for extra data
4105 options the options word
4106 cd contains pointers to tables etc.
4107 p points to row of 32-bit values, terminated by NOTACHAR
4108
4109 Returns: the number of < 256 characters added
4110 the pointer to extra data is updated
4111 */
4112
4113 static int
4114 add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
4115 int options, compile_data *cd, const pcre_uint32 *p)
4116 {
4117 BOOL utf = (options & PCRE_UTF8) != 0;
4118 int n8 = 0;
4119 if (p[0] > 0)
4120 n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
4121 while (p[0] < NOTACHAR)
4122 {
4123 while (p[1] == p[0] + 1) p++;
4124 n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
4125 (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
4126 p++;
4127 }
4128 return n8;
4129 }
4130
4131
4132
4133 /*************************************************
4134 * Compile one branch *
4135 *************************************************/
4136
4137 /* Scan the pattern, compiling it into a vector. If the options are
4138 changed during the branch, the pointer is used to change the external options
4139 bits. This function is used during the pre-compile phase when we are trying
4140 to find out the amount of memory needed, as well as during the real compile
4141 phase. The value of lengthptr distinguishes the two phases.
4142
4143 Arguments:
4144 optionsptr pointer to the option bits
4145 codeptr points to the pointer to the current code point
4146 ptrptr points to the current pattern pointer
4147 errorcodeptr points to error code variable
4148 firstcharptr place to put the first required character
4149 firstcharflagsptr place to put the first character flags, or a negative number
4150 reqcharptr place to put the last required character
4151 reqcharflagsptr place to put the last required character flags, or a negative number
4152 bcptr points to current branch chain
4153 cond_depth conditional nesting depth
4154 cd contains pointers to tables etc.
4155 lengthptr NULL during the real compile phase
4156 points to length accumulator during pre-compile phase
4157
4158 Returns: TRUE on success
4159 FALSE, with *errorcodeptr set non-zero on error
4160 */
4161
4162 static BOOL
4163 compile_branch(int *optionsptr, pcre_uchar **codeptr,
4164 const pcre_uchar **ptrptr, int *errorcodeptr,
4165 pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
4166 pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
4167 branch_chain *bcptr, int cond_depth,
4168 compile_data *cd, int *lengthptr)
4169 {
4170 int repeat_type, op_type;
4171 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
4172 int bravalue = 0;
4173 int greedy_default, greedy_non_default;
4174 pcre_uint32 firstchar, reqchar;
4175 pcre_int32 firstcharflags, reqcharflags;
4176 pcre_uint32 zeroreqchar, zerofirstchar;
4177 pcre_int32 zeroreqcharflags, zerofirstcharflags;
4178 pcre_int32 req_caseopt, reqvary, tempreqvary;
4179 int options = *optionsptr; /* May change dynamically */
4180 int after_manual_callout = 0;
4181 int length_prevgroup = 0;
4182 register pcre_uint32 c;
4183 int escape;
4184 register pcre_uchar *code = *codeptr;
4185 pcre_uchar *last_code = code;
4186 pcre_uchar *orig_code = code;
4187 pcre_uchar *tempcode;
4188 BOOL inescq = FALSE;
4189 BOOL groupsetfirstchar = FALSE;
4190 const pcre_uchar *ptr = *ptrptr;
4191 const pcre_uchar *tempptr;
4192 const pcre_uchar *nestptr = NULL;
4193 pcre_uchar *previous = NULL;
4194 pcre_uchar *previous_callout = NULL;
4195 pcre_uchar *save_hwm = NULL;
4196 pcre_uint8 classbits[32];
4197
4198 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
4199 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
4200 dynamically as we process the pattern. */
4201
4202 #ifdef SUPPORT_UTF
4203 /* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
4204 BOOL utf = (options & PCRE_UTF8) != 0;
4205 #ifndef COMPILE_PCRE32
4206 pcre_uchar utf_chars[6];
4207 #endif
4208 #else
4209 BOOL utf = FALSE;
4210 #endif
4211
4212 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
4213 class_uchardata always so that it can be passed to add_to_class() always,
4214 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
4215 alternative calls for the different cases. */
4216
4217 pcre_uchar *class_uchardata;
4218 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4219 BOOL xclass;
4220 pcre_uchar *class_uchardata_base;
4221 #endif
4222
4223 #ifdef PCRE_DEBUG
4224 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
4225 #endif
4226
4227 /* Set up the default and non-default settings for greediness */
4228
4229 greedy_default = ((options & PCRE_UNGREEDY) != 0);
4230 greedy_non_default = greedy_default ^ 1;
4231
4232 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
4233 matching encountered yet". It gets changed to REQ_NONE if we hit something that
4234 matches a non-fixed char first char; reqchar just remains unset if we never
4235 find one.
4236
4237 When we hit a repeat whose minimum is zero, we may have to adjust these values
4238 to take the zero repeat into account. This is implemented by setting them to
4239 zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
4240 item types that can be repeated set these backoff variables appropriately. */
4241
4242 firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
4243 firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
4244
4245 /* The variable req_caseopt contains either the REQ_CASELESS value
4246 or zero, according to the current setting of the caseless flag. The
4247 REQ_CASELESS leaves the lower 28 bit empty. It is added into the
4248 firstchar or reqchar variables to record the case status of the
4249 value. This is used only for ASCII characters. */
4250
4251 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
4252
4253 /* Switch on next character until the end of the branch */
4254
4255 for (;; ptr++)
4256 {
4257 BOOL negate_class;
4258 BOOL should_flip_negation;
4259 BOOL possessive_quantifier;
4260 BOOL is_quantifier;
4261 BOOL is_recurse;
4262 BOOL reset_bracount;
4263 int class_has_8bitchar;
4264 int class_one_char;
4265 int newoptions;
4266 int recno;
4267 int refsign;
4268 int skipbytes;
4269 pcre_uint32 subreqchar, subfirstchar;
4270 pcre_int32 subreqcharflags, subfirstcharflags;
4271 int terminator;
4272 unsigned int mclength;
4273 unsigned int tempbracount;
4274 pcre_uint32 ec;
4275 pcre_uchar mcbuffer[8];
4276
4277 /* Get next character in the pattern */
4278
4279 c = *ptr;
4280
4281 /* If we are at the end of a nested substitution, revert to the outer level
4282 string. Nesting only happens one level deep. */
4283
4284 if (c == CHAR_NULL && nestptr != NULL)
4285 {
4286 ptr = nestptr;
4287 nestptr = NULL;
4288 c = *ptr;
4289 }
4290
4291 /* If we are in the pre-compile phase, accumulate the length used for the
4292 previous cycle of this loop. */
4293
4294 if (lengthptr != NULL)
4295 {
4296 #ifdef PCRE_DEBUG
4297 if (code > cd->hwm) cd->hwm = code; /* High water info */
4298 #endif
4299 if (code > cd->start_workspace + cd->workspace_size -
4300 WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
4301 {
4302 *errorcodeptr = ERR52;
4303 goto FAILED;
4304 }
4305
4306 /* There is at least one situation where code goes backwards: this is the
4307 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
4308 the class is simply eliminated. However, it is created first, so we have to
4309 allow memory for it. Therefore, don't ever reduce the length at this point.
4310 */
4311
4312 if (code < last_code) code = last_code;
4313
4314 /* Paranoid check for integer overflow */
4315
4316 if (OFLOW_MAX - *lengthptr < code - last_code)
4317 {
4318 *errorcodeptr = ERR20;
4319 goto FAILED;
4320 }
4321
4322 *lengthptr += (int)(code - last_code);
4323 DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
4324 (int)(code - last_code), c, c));
4325
4326 /* If "previous" is set and it is not at the start of the work space, move
4327 it back to there, in order to avoid filling up the work space. Otherwise,
4328 if "previous" is NULL, reset the current code pointer to the start. */
4329
4330 if (previous != NULL)
4331 {
4332 if (previous > orig_code)
4333 {
4334 memmove(orig_code, previous, IN_UCHARS(code - previous));
4335 code -= previous - orig_code;
4336 previous = orig_code;
4337 }
4338 }
4339 else code = orig_code;
4340
4341 /* Remember where this code item starts so we can pick up the length
4342 next time round. */
4343
4344 last_code = code;
4345 }
4346
4347 /* In the real compile phase, just check the workspace used by the forward
4348 reference list. */
4349
4350 else if (cd->hwm > cd->start_workspace + cd->workspace_size -
4351 WORK_SIZE_SAFETY_MARGIN)
4352 {
4353 *errorcodeptr = ERR52;
4354 goto FAILED;
4355 }
4356
4357 /* If in \Q...\E, check for the end; if not, we have a literal */
4358
4359 if (inescq && c != CHAR_NULL)
4360 {
4361 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4362 {
4363 inescq = FALSE;
4364 ptr++;
4365 continue;
4366 }
4367 else
4368 {
4369 if (previous_callout != NULL)
4370 {
4371 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
4372 complete_callout(previous_callout, ptr, cd);
4373 previous_callout = NULL;
4374 }
4375 if ((options & PCRE_AUTO_CALLOUT) != 0)
4376 {
4377 previous_callout = code;
4378 code = auto_callout(code, ptr, cd);
4379 }
4380 goto NORMAL_CHAR;
4381 }
4382 }
4383
4384 is_quantifier =
4385 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4386 (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4387
4388 /* Fill in length of a previous callout, except when the next thing is a
4389 quantifier or when processing a property substitution string in UCP mode. */
4390
4391 if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4392 after_manual_callout-- <= 0)
4393 {
4394 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
4395 complete_callout(previous_callout, ptr, cd);
4396 previous_callout = NULL;
4397 }
4398
4399 /* In extended mode, skip white space and comments. */
4400
4401 if ((options & PCRE_EXTENDED) != 0)
4402 {
4403 if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;
4404 if (c == CHAR_NUMBER_SIGN)
4405 {
4406 ptr++;
4407 while (*ptr != CHAR_NULL)
4408 {
4409 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
4410 ptr++;
4411 #ifdef SUPPORT_UTF
4412 if (utf) FORWARDCHAR(ptr);
4413 #endif
4414 }
4415 if (*ptr != CHAR_NULL) continue;
4416
4417 /* Else fall through to handle end of string */
4418 c = 0;
4419 }
4420 }
4421
4422 /* No auto callout for quantifiers, or while processing property strings that
4423 are substituted for \w etc in UCP mode. */
4424
4425 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
4426 {
4427 previous_callout = code;
4428 code = auto_callout(code, ptr, cd);
4429 }
4430
4431 switch(c)
4432 {
4433 /* ===================================================================*/
4434 case 0: /* The branch terminates at string end */
4435 case CHAR_VERTICAL_LINE: /* or | or ) */
4436 case CHAR_RIGHT_PARENTHESIS:
4437 *firstcharptr = firstchar;
4438 *firstcharflagsptr = firstcharflags;
4439 *reqcharptr = reqchar;
4440 *reqcharflagsptr = reqcharflags;
4441 *codeptr = code;
4442 *ptrptr = ptr;
4443 if (lengthptr != NULL)
4444 {
4445 if (OFLOW_MAX - *lengthptr < code - last_code)
4446 {
4447 *errorcodeptr = ERR20;
4448 goto FAILED;
4449 }
4450 *lengthptr += (int)(code - last_code); /* To include callout length */
4451 DPRINTF((">> end branch\n"));
4452 }
4453 return TRUE;
4454
4455
4456 /* ===================================================================*/
4457 /* Handle single-character metacharacters. In multiline mode, ^ disables
4458 the setting of any following char as a first character. */
4459
4460 case CHAR_CIRCUMFLEX_ACCENT:
4461 previous = NULL;
4462 if ((options & PCRE_MULTILINE) != 0)
4463 {
4464 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4465 *code++ = OP_CIRCM;
4466 }
4467 else *code++ = OP_CIRC;
4468 break;
4469
4470 case CHAR_DOLLAR_SIGN:
4471 previous = NULL;
4472 *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
4473 break;
4474
4475 /* There can never be a first char if '.' is first, whatever happens about
4476 repeats. The value of reqchar doesn't change either. */
4477
4478 case CHAR_DOT:
4479 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4480 zerofirstchar = firstchar;
4481 zerofirstcharflags = firstcharflags;
4482 zeroreqchar = reqchar;
4483 zeroreqcharflags = reqcharflags;
4484 previous = code;
4485 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
4486 break;
4487
4488
4489 /* ===================================================================*/
4490 /* Character classes. If the included characters are all < 256, we build a
4491 32-byte bitmap of the permitted characters, except in the special case
4492 where there is only one such character. For negated classes, we build the
4493 map as usual, then invert it at the end. However, we use a different opcode
4494 so that data characters > 255 can be handled correctly.
4495
4496 If the class contains characters outside the 0-255 range, a different
4497 opcode is compiled. It may optionally have a bit map for characters < 256,
4498 but those above are explicitly listed afterwards. A flag byte tells
4499 whether the bitmap is present, and whether this is a negated class or not.
4500
4501 In JavaScript compatibility mode, an isolated ']' causes an error. In
4502 default (Perl) mode, it is treated as a data character. */
4503
4504 case CHAR_RIGHT_SQUARE_BRACKET:
4505 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4506 {
4507 *errorcodeptr = ERR64;
4508 goto FAILED;
4509 }
4510 goto NORMAL_CHAR;
4511
4512 case CHAR_LEFT_SQUARE_BRACKET:
4513 previous = code;
4514
4515 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
4516 they are encountered at the top level, so we'll do that too. */
4517
4518 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4519 ptr[1] == CHAR_EQUALS_SIGN) &&
4520 check_posix_syntax(ptr, &tempptr))
4521 {
4522 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
4523 goto FAILED;
4524 }
4525
4526 /* If the first character is '^', set the negation flag and skip it. Also,
4527 if the first few characters (either before or after ^) are \Q\E or \E we
4528 skip them too. This makes for compatibility with Perl. */
4529
4530 negate_class = FALSE;
4531 for (;;)
4532 {
4533 c = *(++ptr);
4534 if (c == CHAR_BACKSLASH)
4535 {
4536 if (ptr[1] == CHAR_E)
4537 ptr++;
4538 else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4539 ptr += 3;
4540 else
4541 break;
4542 }
4543 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4544 negate_class = TRUE;
4545 else break;
4546 }
4547
4548 /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
4549 an initial ']' is taken as a data character -- the code below handles
4550 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
4551 [^] must match any character, so generate OP_ALLANY. */
4552
4553 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4554 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4555 {
4556 *code++ = negate_class? OP_ALLANY : OP_FAIL;
4557 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4558 zerofirstchar = firstchar;
4559 zerofirstcharflags = firstcharflags;
4560 break;
4561 }
4562
4563 /* If a class contains a negative special such as \S, we need to flip the
4564 negation flag at the end, so that support for characters > 255 works
4565 correctly (they are all included in the class). */
4566
4567 should_flip_negation = FALSE;
4568
4569 /* For optimization purposes, we track some properties of the class:
4570 class_has_8bitchar will be non-zero if the class contains at least one <
4571 256 character; class_one_char will be 1 if the class contains just one
4572 character. */
4573
4574 class_has_8bitchar = 0;
4575 class_one_char = 0;
4576
4577 /* Initialize the 32-char bit map to all zeros. We build the map in a
4578 temporary bit of memory, in case the class contains fewer than two
4579 8-bit characters because in that case the compiled code doesn't use the bit
4580 map. */
4581
4582 memset(classbits, 0, 32 * sizeof(pcre_uint8));
4583
4584 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4585 xclass = FALSE;
4586 class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
4587 class_uchardata_base = class_uchardata; /* Save the start */
4588 #endif
4589
4590 /* Process characters until ] is reached. By writing this as a "do" it
4591 means that an initial ] is taken as a data character. At the start of the
4592 loop, c contains the first byte of the character. */
4593
4594 if (c != CHAR_NULL) do
4595 {
4596 const pcre_uchar *oldptr;
4597
4598 #ifdef SUPPORT_UTF
4599 if (utf && HAS_EXTRALEN(c))
4600 { /* Braces are required because the */
4601 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
4602 }
4603 #endif
4604
4605 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4606 /* In the pre-compile phase, accumulate the length of any extra
4607 data and reset the pointer. This is so that very large classes that
4608 contain a zillion > 255 characters no longer overwrite the work space
4609 (which is on the stack). We have to remember that there was XCLASS data,
4610 however. */
4611
4612 if (lengthptr != NULL && class_uchardata > class_uchardata_base)
4613 {
4614 xclass = TRUE;
4615 *lengthptr += class_uchardata - class_uchardata_base;
4616 class_uchardata = class_uchardata_base;
4617 }
4618 #endif
4619
4620 /* Inside \Q...\E everything is literal except \E */
4621
4622 if (inescq)
4623 {
4624 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
4625 {
4626 inescq = FALSE; /* Reset literal state */
4627 ptr++; /* Skip the 'E' */
4628 continue; /* Carry on with next */
4629 }
4630 goto CHECK_RANGE; /* Could be range if \E follows */
4631 }
4632
4633 /* Handle POSIX class names. Perl allows a negation extension of the
4634 form [:^name:]. A square bracket that doesn't match the syntax is
4635 treated as a literal. We also recognize the POSIX constructions
4636 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
4637 5.6 and 5.8 do. */
4638
4639 if (c == CHAR_LEFT_SQUARE_BRACKET &&
4640 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4641 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
4642 {
4643 BOOL local_negate = FALSE;
4644 int posix_class, taboffset, tabopt;
4645 register const pcre_uint8 *cbits = cd->cbits;
4646 pcre_uint8 pbits[32];
4647
4648 if (ptr[1] != CHAR_COLON)
4649 {
4650 *errorcodeptr = ERR31;
4651 goto FAILED;
4652 }
4653
4654 ptr += 2;
4655 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
4656 {
4657 local_negate = TRUE;
4658 should_flip_negation = TRUE; /* Note negative special */
4659 ptr++;
4660 }
4661
4662 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
4663 if (posix_class < 0)
4664 {
4665 *errorcodeptr = ERR30;
4666 goto FAILED;
4667 }
4668
4669 /* If matching is caseless, upper and lower are converted to
4670 alpha. This relies on the fact that the class table starts with
4671 alpha, lower, upper as the first 3 entries. */
4672
4673 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
4674 posix_class = 0;
4675
4676 /* When PCRE_UCP is set, some of the POSIX classes are converted to
4677 different escape sequences that use Unicode properties. */
4678
4679 #ifdef SUPPORT_UCP
4680 if ((options & PCRE_UCP) != 0)
4681 {
4682 int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
4683 if (posix_substitutes[pc] != NULL)
4684 {
4685 nestptr = tempptr + 1;
4686 ptr = posix_substitutes[pc] - 1;
4687 continue;
4688 }
4689 }
4690 #endif
4691 /* In the non-UCP case, we build the bit map for the POSIX class in a
4692 chunk of local store because we may be adding and subtracting from it,
4693 and we don't want to subtract bits that may be in the main map already.
4694 At the end we or the result into the bit map that is being built. */
4695
4696 posix_class *= 3;
4697
4698 /* Copy in the first table (always present) */
4699
4700 memcpy(pbits, cbits + posix_class_maps[posix_class],
4701 32 * sizeof(pcre_uint8));
4702
4703 /* If there is a second table, add or remove it as required. */
4704
4705 taboffset = posix_class_maps[posix_class + 1];
4706 tabopt = posix_class_maps[posix_class + 2];
4707
4708 if (taboffset >= 0)
4709 {
4710 if (tabopt >= 0)
4711 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
4712 else
4713 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
4714 }
4715
4716 /* Now see if we need to remove any special characters. An option
4717 value of 1 removes vertical space and 2 removes underscore. */
4718
4719 if (tabopt < 0) tabopt = -tabopt;
4720 if (tabopt == 1) pbits[1] &= ~0x3c;
4721 else if (tabopt == 2) pbits[11] &= 0x7f;
4722
4723 /* Add the POSIX table or its complement into the main table that is
4724 being built and we are done. */
4725
4726 if (local_negate)
4727 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
4728 else
4729 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
4730
4731 ptr = tempptr + 1;
4732 /* Every class contains at least one < 256 character. */
4733 class_has_8bitchar = 1;
4734 /* Every class contains at least two characters. */
4735 class_one_char = 2;
4736 continue; /* End of POSIX syntax handling */
4737 }
4738
4739 /* Backslash may introduce a single character, or it may introduce one
4740 of the specials, which just set a flag. The sequence \b is a special
4741 case. Inside a class (and only there) it is treated as backspace. We
4742 assume that other escapes have more than one character in them, so
4743 speculatively set both class_has_8bitchar and class_one_char bigger
4744 than one. Unrecognized escapes fall through and are either treated
4745 as literal characters (by default), or are faulted if
4746 PCRE_EXTRA is set. */
4747
4748 if (c == CHAR_BACKSLASH)
4749 {
4750 escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
4751 TRUE);
4752 if (*errorcodeptr != 0) goto FAILED;
4753 if (escape == 0) c = ec;
4754 else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
4755 else if (escape == ESC_N) /* \N is not supported in a class */
4756 {
4757 *errorcodeptr = ERR71;
4758 goto FAILED;
4759 }
4760 else if (escape == ESC_Q) /* Handle start of quoted string */
4761 {
4762 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4763 {
4764 ptr += 2; /* avoid empty string */
4765 }
4766 else inescq = TRUE;
4767 continue;
4768 }
4769 else if (escape == ESC_E) continue; /* Ignore orphan \E */
4770
4771 else
4772 {
4773 register const pcre_uint8 *cbits = cd->cbits;
4774 /* Every class contains at least two < 256 characters. */
4775 class_has_8bitchar++;
4776 /* Every class contains at least two characters. */
4777 class_one_char += 2;
4778
4779 switch (escape)
4780 {
4781 #ifdef SUPPORT_UCP
4782 case ESC_du: /* These are the values given for \d etc */
4783 case ESC_DU: /* when PCRE_UCP is set. We replace the */
4784 case ESC_wu: /* escape sequence with an appropriate \p */
4785 case ESC_WU: /* or \P to test Unicode properties instead */
4786 case ESC_su: /* of the default ASCII testing. */
4787 case ESC_SU:
4788 nestptr = ptr;
4789 ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */
4790 class_has_8bitchar--; /* Undo! */
4791 continue;
4792 #endif
4793 case ESC_d:
4794 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
4795 continue;
4796
4797 case ESC_D:
4798 should_flip_negation = TRUE;
4799 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
4800 continue;
4801
4802 case ESC_w:
4803 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
4804 continue;
4805
4806 case ESC_W:
4807 should_flip_negation = TRUE;
4808 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
4809 continue;
4810
4811 /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
4812 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
4813 previously set by something earlier in the character class.
4814 Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
4815 we could just adjust the appropriate bit. From PCRE 8.34 we no
4816 longer treat \s and \S specially. */
4817
4818 case ESC_s:
4819 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
4820 continue;
4821
4822 case ESC_S:
4823 should_flip_negation = TRUE;
4824 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
4825 continue;
4826
4827 /* The rest apply in both UCP and non-UCP cases. */
4828
4829 case ESC_h:
4830 (void)add_list_to_class(classbits, &class_uchardata, options, cd,
4831 PRIV(hspace_list), NOTACHAR);
4832 continue;
4833
4834 case ESC_H:
4835 (void)add_not_list_to_class(classbits, &class_uchardata, options,
4836 cd, PRIV(hspace_list));
4837 continue;
4838
4839 case ESC_v:
4840 (void)add_list_to_class(classbits, &class_uchardata, options, cd,
4841 PRIV(vspace_list), NOTACHAR);
4842 continue;
4843
4844 case ESC_V:
4845 (void)add_not_list_to_class(classbits, &class_uchardata, options,
4846 cd, PRIV(vspace_list));
4847 continue;
4848
4849 #ifdef SUPPORT_UCP
4850 case ESC_p:
4851 case ESC_P:
4852 {
4853 BOOL negated;
4854 unsigned int ptype = 0, pdata = 0;
4855 if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
4856 goto FAILED;
4857 *class_uchardata++ = ((escape == ESC_p) != negated)?
4858 XCL_PROP : XCL_NOTPROP;
4859 *class_uchardata++ = ptype;
4860 *class_uchardata++ = pdata;
4861 class_has_8bitchar--; /* Undo! */
4862 continue;
4863 }
4864 #endif
4865 /* Unrecognized escapes are faulted if PCRE is running in its
4866 strict mode. By default, for compatibility with Perl, they are
4867 treated as literals. */
4868
4869 default:
4870 if ((options & PCRE_EXTRA) != 0)
4871 {
4872 *errorcodeptr = ERR7;
4873 goto FAILED;
4874 }
4875 class_has_8bitchar--; /* Undo the speculative increase. */
4876 class_one_char -= 2; /* Undo the speculative increase. */
4877 c = *ptr; /* Get the final character and fall through */
4878 break;
4879 }
4880 }
4881
4882 /* Fall through if the escape just defined a single character (c >= 0).
4883 This may be greater than 256. */
4884
4885 escape = 0;
4886
4887 } /* End of backslash handling */
4888
4889 /* A character may be followed by '-' to form a range. However, Perl does
4890 not permit ']' to be the end of the range. A '-' character at the end is
4891 treated as a literal. Perl ignores orphaned \E sequences entirely. The
4892 code for handling \Q and \E is messy. */
4893
4894 CHECK_RANGE:
4895 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4896 {
4897 inescq = FALSE;
4898 ptr += 2;
4899 }
4900 oldptr = ptr;
4901
4902 /* Remember if \r or \n were explicitly used */
4903
4904 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4905
4906 /* Check for range */
4907
4908 if (!inescq && ptr[1] == CHAR_MINUS)
4909 {
4910 pcre_uint32 d;
4911 ptr += 2;
4912 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
4913
4914 /* If we hit \Q (not followed by \E) at this point, go into escaped
4915 mode. */
4916
4917 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
4918 {
4919 ptr += 2;
4920 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4921 { ptr += 2; continue; }
4922 inescq = TRUE;
4923 break;
4924 }
4925
4926 /* Minus (hyphen) at the end of a class is treated as a literal, so put
4927 back the pointer and jump to handle the character that preceded it. */
4928
4929 if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
4930 {
4931 ptr = oldptr;
4932 goto CLASS_SINGLE_CHARACTER;
4933 }
4934
4935 /* Otherwise, we have a potential range; pick up the next character */
4936
4937 #ifdef SUPPORT_UTF
4938 if (utf)
4939 { /* Braces are required because the */
4940 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
4941 }
4942 else
4943 #endif
4944 d = *ptr; /* Not UTF-8 mode */
4945
4946 /* The second part of a range can be a single-character escape, but
4947 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
4948 in such circumstances. */
4949
4950 if (!inescq && d == CHAR_BACKSLASH)
4951 {
4952 int descape;
4953 descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
4954 if (*errorcodeptr != 0) goto FAILED;
4955
4956 /* \b is backspace; any other special means the '-' was literal. */
4957
4958 if (descape != 0)
4959 {
4960 if (descape == ESC_b) d = CHAR_BS; else
4961 {
4962 ptr = oldptr;
4963 goto CLASS_SINGLE_CHARACTER; /* A few lines below */
4964 }
4965 }
4966 }
4967
4968 /* Check that the two values are in the correct order. Optimize
4969 one-character ranges. */
4970
4971 if (d < c)
4972 {
4973 *errorcodeptr = ERR8;
4974 goto FAILED;
4975 }
4976 if (d == c) goto CLASS_SINGLE_CHARACTER; /* A few lines below */
4977
4978 /* We have found a character range, so single character optimizations
4979 cannot be done anymore. Any value greater than 1 indicates that there
4980 is more than one character. */
4981
4982 class_one_char = 2;
4983
4984 /* Remember an explicit \r or \n, and add the range to the class. */
4985
4986 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4987
4988 class_has_8bitchar +=
4989 add_to_class(classbits, &class_uchardata, options, cd, c, d);
4990
4991 continue; /* Go get the next char in the class */
4992 }
4993
4994 /* Handle a single character - we can get here for a normal non-escape
4995 char, or after \ that introduces a single character or for an apparent
4996 range that isn't. Only the value 1 matters for class_one_char, so don't
4997 increase it if it is already 2 or more ... just in case there's a class
4998 with a zillion characters in it. */
4999
5000 CLASS_SINGLE_CHARACTER:
5001 if (class_one_char < 2) class_one_char++;
5002
5003 /* If class_one_char is 1, we have the first single character in the
5004 class, and there have been no prior ranges, or XCLASS items generated by
5005 escapes. If this is the final character in the class, we can optimize by
5006 turning the item into a 1-character OP_CHAR[I] if it's positive, or
5007 OP_NOT[I] if it's negative. In the positive case, it can cause firstchar
5008 to be set. Otherwise, there can be no first char if this item is first,
5009 whatever repeat count may follow. In the case of reqchar, save the
5010 previous value for reinstating. */
5011
5012 if (class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
5013 {
5014 ptr++;
5015 zeroreqchar = reqchar;
5016 zeroreqcharflags = reqcharflags;
5017
5018 if (negate_class)
5019 {
5020 #ifdef SUPPORT_UCP
5021 int d;
5022 #endif
5023 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5024 zerofirstchar = firstchar;
5025 zerofirstcharflags = firstcharflags;
5026
5027 /* For caseless UTF-8 mode when UCP support is available, check
5028 whether this character has more than one other case. If so, generate
5029 a special OP_NOTPROP item instead of OP_NOTI. */
5030
5031 #ifdef SUPPORT_UCP
5032 if (utf && (options & PCRE_CASELESS) != 0 &&
5033 (d = UCD_CASESET(c)) != 0)
5034 {
5035 *code++ = OP_NOTPROP;
5036 *code++ = PT_CLIST;
5037 *code++ = d;
5038 }
5039 else
5040 #endif
5041 /* Char has only one other case, or UCP not available */
5042
5043 {
5044 *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
5045 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5046 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5047 code += PRIV(ord2utf)(c, code);
5048 else
5049 #endif
5050 *code++ = c;
5051 }
5052
5053 /* We are finished with this character class */
5054
5055 goto END_CLASS;
5056 }
5057
5058 /* For a single, positive character, get the value into mcbuffer, and
5059 then we can handle this with the normal one-character code. */
5060
5061 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5062 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5063 mclength = PRIV(ord2utf)(c, mcbuffer);
5064 else
5065 #endif
5066 {
5067 mcbuffer[0] = c;
5068 mclength = 1;
5069 }
5070 goto ONE_CHAR;
5071 } /* End of 1-char optimization */
5072
5073 /* There is more than one character in the class, or an XCLASS item
5074 has been generated. Add this character to the class. */
5075
5076 class_has_8bitchar +=
5077 add_to_class(classbits, &class_uchardata, options, cd, c, c);
5078 }
5079
5080 /* Loop until ']' reached. This "while" is the end of the "do" far above.
5081 If we are at the end of an internal nested string, revert to the outer
5082 string. */
5083
5084 while (((c = *(++ptr)) != CHAR_NULL ||
5085 (nestptr != NULL &&
5086 (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
5087 (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
5088
5089 /* Check for missing terminating ']' */
5090
5091 if (c == CHAR_NULL)
5092 {
5093 *errorcodeptr = ERR6;
5094 goto FAILED;
5095 }
5096
5097 /* We will need an XCLASS if data has been placed in class_uchardata. In
5098 the second phase this is a sufficient test. However, in the pre-compile
5099 phase, class_uchardata gets emptied to prevent workspace overflow, so
5100 only if the very last character in the class needs XCLASS will it contain
5101 anything at this point. For this reason, xclass gets set TRUE above when
5102 class_uchardata is emptied, and that's why this code is the way it is here
5103 instead of just doing a test on class_uchardata below. */
5104
5105 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5106 if (class_uchardata > class_uchardata_base) xclass = TRUE;
5107 #endif
5108
5109 /* If this is the first thing in the branch, there can be no first char
5110 setting, whatever the repeat count. Any reqchar setting must remain
5111 unchanged after any kind of repeat. */
5112
5113 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5114 zerofirstchar = firstchar;
5115 zerofirstcharflags = firstcharflags;
5116 zeroreqchar = reqchar;
5117 zeroreqcharflags = reqcharflags;
5118
5119 /* If there are characters with values > 255, we have to compile an
5120 extended class, with its own opcode, unless there was a negated special
5121 such as \S in the class, and PCRE_UCP is not set, because in that case all
5122 characters > 255 are in the class, so any that were explicitly given as
5123 well can be ignored. If (when there are explicit characters > 255 that must
5124 be listed) there are no characters < 256, we can omit the bitmap in the
5125 actual compiled code. */
5126
5127 #ifdef SUPPORT_UTF
5128 if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
5129 #elif !defined COMPILE_PCRE8
5130 if (xclass && !should_flip_negation)
5131 #endif
5132 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5133 {
5134 *class_uchardata++ = XCL_END; /* Marks the end of extra data */
5135 *code++ = OP_XCLASS;
5136 code += LINK_SIZE;
5137 *code = negate_class? XCL_NOT:0;
5138
5139 /* If the map is required, move up the extra data to make room for it;
5140 otherwise just move the code pointer to the end of the extra data. */
5141
5142 if (class_has_8bitchar > 0)
5143 {
5144 *code++ |= XCL_MAP;
5145 memmove(code + (32 / sizeof(pcre_uchar)), code,
5146 IN_UCHARS(class_uchardata - code));
5147 memcpy(code, classbits, 32);
5148 code = class_uchardata + (32 / sizeof(pcre_uchar));
5149 }
5150 else code = class_uchardata;
5151
5152 /* Now fill in the complete length of the item */
5153
5154 PUT(previous, 1, (int)(code - previous));
5155 break; /* End of class handling */
5156 }
5157 #endif
5158
5159 /* If there are no characters > 255, or they are all to be included or
5160 excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5161 whole class was negated and whether there were negative specials such as \S
5162 (non-UCP) in the class. Then copy the 32-byte map into the code vector,
5163 negating it if necessary. */
5164
5165 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5166 if (lengthptr == NULL) /* Save time in the pre-compile phase */
5167 {
5168 if (negate_class)
5169 for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5170 memcpy(code, classbits, 32);
5171 }
5172 code += 32 / sizeof(pcre_uchar);
5173
5174 END_CLASS:
5175 break;
5176
5177
5178 /* ===================================================================*/
5179 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
5180 has been tested above. */
5181
5182 case CHAR_LEFT_CURLY_BRACKET:
5183 if (!is_quantifier) goto NORMAL_CHAR;
5184 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
5185 if (*errorcodeptr != 0) goto FAILED;
5186 goto REPEAT;
5187
5188 case CHAR_ASTERISK:
5189 repeat_min = 0;
5190 repeat_max = -1;
5191 goto REPEAT;
5192
5193 case CHAR_PLUS:
5194 repeat_min = 1;
5195 repeat_max = -1;
5196 goto REPEAT;
5197
5198 case CHAR_QUESTION_MARK:
5199 repeat_min = 0;
5200 repeat_max = 1;
5201
5202 REPEAT:
5203 if (previous == NULL)
5204 {
5205 *errorcodeptr = ERR9;
5206 goto FAILED;
5207 }
5208
5209 if (repeat_min == 0)
5210 {
5211 firstchar = zerofirstchar; /* Adjust for zero repeat */
5212 firstcharflags = zerofirstcharflags;
5213 reqchar = zeroreqchar; /* Ditto */
5214 reqcharflags = zeroreqcharflags;
5215 }
5216
5217 /* Remember whether this is a variable length repeat */
5218
5219 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
5220
5221 op_type = 0; /* Default single-char op codes */
5222 possessive_quantifier = FALSE; /* Default not possessive quantifier */
5223
5224 /* Save start of previous item, in case we have to move it up in order to
5225 insert something before it. */
5226
5227 tempcode = previous;
5228
5229 /* If the next character is '+', we have a possessive quantifier. This
5230 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
5231 If the next character is '?' this is a minimizing repeat, by default,
5232 but if PCRE_UNGREEDY is set, it works the other way round. We change the
5233 repeat type to the non-default. */
5234
5235 if (ptr[1] == CHAR_PLUS)
5236 {
5237 repeat_type = 0; /* Force greedy */
5238 possessive_quantifier = TRUE;
5239 ptr++;
5240 }
5241 else if (ptr[1] == CHAR_QUESTION_MARK)
5242 {
5243 repeat_type = greedy_non_default;
5244 ptr++;
5245 }
5246 else repeat_type = greedy_default;
5247
5248 /* If previous was a recursion call, wrap it in atomic brackets so that
5249 previous becomes the atomic group. All recursions were so wrapped in the
5250 past, but it no longer happens for non-repeated recursions. In fact, the
5251 repeated ones could be re-implemented independently so as not to need this,
5252 but for the moment we rely on the code for repeating groups. */
5253
5254 if (*previous == OP_RECURSE)
5255 {
5256 memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
5257 *previous = OP_ONCE;
5258 PUT(previous, 1, 2 + 2*LINK_SIZE);
5259 previous[2 + 2*LINK_SIZE] = OP_KET;
5260 PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
5261 code += 2 + 2 * LINK_SIZE;
5262 length_prevgroup = 3 + 3*LINK_SIZE;
5263
5264 /* When actually compiling, we need to check whether this was a forward
5265 reference, and if so, adjust the offset. */
5266
5267 if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
5268 {
5269 int offset = GET(cd->hwm, -LINK_SIZE);
5270 if (offset == previous + 1 - cd->start_code)
5271 PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
5272 }
5273 }
5274
5275 /* Now handle repetition for the different types of item. */
5276
5277 /* If previous was a character or negated character match, abolish the item
5278 and generate a repeat item instead. If a char item has a minimum of more
5279 than one, ensure that it is set in reqchar - it might not be if a sequence
5280 such as x{3} is the first thing in a branch because the x will have gone
5281 into firstchar instead. */
5282
5283 if (*previous == OP_CHAR || *previous == OP_CHARI
5284 || *previous == OP_NOT || *previous == OP_NOTI)
5285 {
5286 switch (*previous)
5287 {
5288 default: /* Make compiler happy. */
5289 case OP_CHAR: op_type = OP_STAR - OP_STAR; break;
5290 case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
5291 case OP_NOT: op_type = OP_NOTSTAR - OP_STAR; break;
5292 case OP_NOTI: op_type = OP_NOTSTARI - OP_STAR; break;
5293 }
5294
5295 /* Deal with UTF characters that take up more than one character. It's
5296 easier to write this out separately than try to macrify it. Use c to
5297 hold the length of the character in bytes, plus UTF_LENGTH to flag that
5298 it's a length rather than a small character. */
5299
5300 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5301 if (utf && NOT_FIRSTCHAR(code[-1]))
5302 {
5303 pcre_uchar *lastchar = code - 1;
5304 BACKCHAR(lastchar);
5305 c = (int)(code - lastchar); /* Length of UTF-8 character */
5306 memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
5307 c |= UTF_LENGTH; /* Flag c as a length */
5308 }
5309 else
5310 #endif /* SUPPORT_UTF */
5311
5312 /* Handle the case of a single character - either with no UTF support, or
5313 with UTF disabled, or for a single character UTF character. */
5314 {
5315 c = code[-1];
5316 if (*previous <= OP_CHARI && repeat_min > 1)
5317 {
5318 reqchar = c;
5319 reqcharflags = req_caseopt | cd->req_varyopt;
5320 }
5321 }
5322
5323 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
5324 }
5325
5326 /* If previous was a character type match (\d or similar), abolish it and
5327 create a suitable repeat item. The code is shared with single-character
5328 repeats by setting op_type to add a suitable offset into repeat_type. Note
5329 that the Unicode property types will be present only when SUPPORT_UCP is
5330 defined, but we don't wrap the little bits of code here because it just
5331 makes it horribly messy. */
5332
5333 else if (*previous < OP_EODN)
5334 {
5335 pcre_uchar *oldcode;
5336 int prop_type, prop_value;
5337 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
5338 c = *previous;
5339
5340 OUTPUT_SINGLE_REPEAT:
5341 if (*previous == OP_PROP || *previous == OP_NOTPROP)
5342 {
5343 prop_type = previous[1];
5344 prop_value = previous[2];
5345 }
5346 else prop_type = prop_value = -1;
5347
5348 oldcode = code;
5349 code = previous; /* Usually overwrite previous item */
5350
5351 /* If the maximum is zero then the minimum must also be zero; Perl allows
5352 this case, so we do too - by simply omitting the item altogether. */
5353
5354 if (repeat_max == 0) goto END_REPEAT;
5355
5356 /* Combine the op_type with the repeat_type */
5357
5358 repeat_type += op_type;
5359
5360 /* A minimum of zero is handled either as the special case * or ?, or as
5361 an UPTO, with the maximum given. */
5362
5363 if (repeat_min == 0)
5364 {
5365 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
5366 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
5367 else
5368 {
5369 *code++ = OP_UPTO + repeat_type;
5370 PUT2INC(code, 0, repeat_max);
5371 }
5372 }
5373
5374 /* A repeat minimum of 1 is optimized into some special cases. If the
5375 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
5376 left in place and, if the maximum is greater than 1, we use OP_UPTO with
5377 one less than the maximum. */
5378
5379 else if (repeat_min == 1)
5380 {
5381 if (repeat_max == -1)
5382 *code++ = OP_PLUS + repeat_type;
5383 else
5384 {
5385 code = oldcode; /* leave previous item in place */
5386 if (repeat_max == 1) goto END_REPEAT;
5387 *code++ = OP_UPTO + repeat_type;
5388 PUT2INC(code, 0, repeat_max - 1);
5389 }
5390 }
5391
5392 /* The case {n,n} is just an EXACT, while the general case {n,m} is
5393 handled as an EXACT followed by an UPTO. */
5394
5395 else
5396 {
5397 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
5398 PUT2INC(code, 0, repeat_min);
5399
5400 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
5401 we have to insert the character for the previous code. For a repeated
5402 Unicode property match, there are two extra bytes that define the
5403 required property. In UTF-8 mode, long characters have their length in
5404 c, with the UTF_LENGTH bit as a flag. */
5405
5406 if (repeat_max < 0)
5407 {
5408 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5409 if (utf && (c & UTF_LENGTH) != 0)
5410 {
5411 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5412 code += c & 7;
5413 }
5414 else
5415 #endif
5416 {
5417 *code++ = c;
5418 if (prop_type >= 0)
5419 {
5420 *code++ = prop_type;
5421 *code++ = prop_value;
5422 }
5423 }
5424 *code++ = OP_STAR + repeat_type;
5425 }
5426
5427 /* Else insert an UPTO if the max is greater than the min, again
5428 preceded by the character, for the previously inserted code. If the
5429 UPTO is just for 1 instance, we can use QUERY instead. */
5430
5431 else if (repeat_max != repeat_min)
5432 {
5433 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5434 if (utf && (c & UTF_LENGTH) != 0)
5435 {
5436 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5437 code += c & 7;
5438 }
5439 else
5440 #endif
5441 *code++ = c;
5442 if (prop_type >= 0)
5443 {
5444 *code++ = prop_type;
5445 *code++ = prop_value;
5446 }
5447 repeat_max -= repeat_min;
5448
5449 if (repeat_max == 1)
5450 {
5451 *code++ = OP_QUERY + repeat_type;
5452 }
5453 else
5454 {
5455 *code++ = OP_UPTO + repeat_type;
5456 PUT2INC(code, 0, repeat_max);
5457 }
5458 }
5459 }
5460
5461 /* The character or character type itself comes last in all cases. */
5462
5463 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5464 if (utf && (c & UTF_LENGTH) != 0)
5465 {
5466 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5467 code += c & 7;
5468 }
5469 else
5470 #endif
5471 *code++ = c;
5472
5473 /* For a repeated Unicode property match, there are two extra bytes that
5474 define the required property. */
5475
5476 #ifdef SUPPORT_UCP
5477 if (prop_type >= 0)
5478 {
5479 *code++ = prop_type;
5480 *code++ = prop_value;
5481 }
5482 #endif
5483 }
5484
5485 /* If previous was a character class or a back reference, we put the repeat
5486 stuff after it, but just skip the item if the repeat was {0,0}. */
5487
5488 else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
5489 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5490 *previous == OP_XCLASS ||
5491 #endif
5492 *previous == OP_REF || *previous == OP_REFI ||
5493 *previous == OP_DNREF || *previous == OP_DNREFI)
5494 {
5495 if (repeat_max == 0)
5496 {
5497 code = previous;
5498 goto END_REPEAT;
5499 }
5500
5501 if (repeat_min == 0 && repeat_max == -1)
5502 *code++ = OP_CRSTAR + repeat_type;
5503 else if (repeat_min == 1 && repeat_max == -1)
5504 *code++ = OP_CRPLUS + repeat_type;
5505 else if (repeat_min == 0 && repeat_max == 1)
5506 *code++ = OP_CRQUERY + repeat_type;
5507 else
5508 {
5509 *code++ = OP_CRRANGE + repeat_type;
5510 PUT2INC(code, 0, repeat_min);
5511 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
5512 PUT2INC(code, 0, repeat_max);
5513 }
5514 }
5515
5516 /* If previous was a bracket group, we may have to replicate it in certain
5517 cases. Note that at this point we can encounter only the "basic" bracket
5518 opcodes such as BRA and CBRA, as this is the place where they get converted
5519 into the more special varieties such as BRAPOS and SBRA. A test for >=
5520 OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5521 ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow
5522 repetition of assertions, but now it does, for Perl compatibility. */
5523
5524 else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5525 {
5526 register int i;
5527 int len = (int)(code - previous);
5528 pcre_uchar *bralink = NULL;
5529 pcre_uchar *brazeroptr = NULL;
5530
5531 /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
5532 we just ignore the repeat. */
5533
5534 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
5535 goto END_REPEAT;
5536
5537 /* There is no sense in actually repeating assertions. The only potential
5538 use of repetition is in cases when the assertion is optional. Therefore,
5539 if the minimum is greater than zero, just ignore the repeat. If the
5540 maximum is not zero or one, set it to 1. */
5541
5542 if (*previous < OP_ONCE) /* Assertion */
5543 {
5544 if (repeat_min > 0) goto END_REPEAT;
5545 if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
5546 }
5547
5548 /* The case of a zero minimum is special because of the need to stick
5549 OP_BRAZERO in front of it, and because the group appears once in the
5550 data, whereas in other cases it appears the minimum number of times. For
5551 this reason, it is simplest to treat this case separately, as otherwise
5552 the code gets far too messy. There are several special subcases when the
5553 minimum is zero. */
5554
5555 if (repeat_min == 0)
5556 {
5557 /* If the maximum is also zero, we used to just omit the group from the
5558 output altogether, like this:
5559
5560 ** if (repeat_max == 0)
5561 ** {
5562 ** code = previous;
5563 ** goto END_REPEAT;
5564 ** }
5565
5566 However, that fails when a group or a subgroup within it is referenced
5567 as a subroutine from elsewhere in the pattern, so now we stick in
5568 OP_SKIPZERO in front of it so that it is skipped on execution. As we
5569 don't have a list of which groups are referenced, we cannot do this
5570 selectively.
5571
5572 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
5573 and do no more at this point. However, we do need to adjust any
5574 OP_RECURSE calls inside the group that refer to the group itself or any
5575 internal or forward referenced group, because the offset is from the
5576 start of the whole regex. Temporarily terminate the pattern while doing
5577 this. */
5578
5579 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
5580 {
5581 *code = OP_END;
5582 adjust_recurse(previous, 1, utf, cd, save_hwm);
5583 memmove(previous + 1, previous, IN_UCHARS(len));
5584 code++;
5585 if (repeat_max == 0)
5586 {
5587 *previous++ = OP_SKIPZERO;
5588 goto END_REPEAT;
5589 }
5590 brazeroptr = previous; /* Save for possessive optimizing */
5591 *previous++ = OP_BRAZERO + repeat_type;
5592 }
5593
5594 /* If the maximum is greater than 1 and limited, we have to replicate
5595 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
5596 The first one has to be handled carefully because it's the original
5597 copy, which has to be moved up. The remainder can be handled by code
5598 that is common with the non-zero minimum case below. We have to
5599 adjust the value of repeat_max, since one less copy is required. Once
5600 again, we may have to adjust any OP_RECURSE calls inside the group. */
5601
5602 else
5603 {
5604 int offset;
5605 *code = OP_END;
5606 adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);
5607 memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
5608 code += 2 + LINK_SIZE;
5609 *previous++ = OP_BRAZERO + repeat_type;
5610 *previous++ = OP_BRA;
5611
5612 /* We chain together the bracket offset fields that have to be
5613 filled in later when the ends of the brackets are reached. */
5614
5615 offset = (bralink == NULL)? 0 : (int)(previous - bralink);
5616 bralink = previous;
5617 PUTINC(previous, 0, offset);
5618 }
5619
5620 repeat_max--;
5621 }
5622
5623 /* If the minimum is greater than zero, replicate the group as many
5624 times as necessary, and adjust the maximum to the number of subsequent
5625 copies that we need. If we set a first char from the group, and didn't
5626 set a required char, copy the latter from the former. If there are any
5627 forward reference subroutine calls in the group, there will be entries on
5628 the workspace list; replicate these with an appropriate increment. */
5629
5630 else
5631 {
5632 if (repeat_min > 1)
5633 {
5634 /* In the pre-compile phase, we don't actually do the replication. We
5635 just adjust the length as if we had. Do some paranoid checks for
5636 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
5637 integer type when available, otherwise double. */
5638
5639 if (lengthptr != NULL)
5640 {
5641 int delta = (repeat_min - 1)*length_prevgroup;
5642 if ((INT64_OR_DOUBLE)(repeat_min - 1)*
5643 (INT64_OR_DOUBLE)length_prevgroup >
5644 (INT64_OR_DOUBLE)INT_MAX ||
5645 OFLOW_MAX - *lengthptr < delta)
5646 {
5647 *errorcodeptr = ERR20;
5648 goto FAILED;
5649 }
5650 *lengthptr += delta;
5651 }
5652
5653 /* This is compiling for real. If there is a set first byte for
5654 the group, and we have not yet set a "required byte", set it. Make
5655 sure there is enough workspace for copying forward references before
5656 doing the copy. */
5657
5658 else
5659 {
5660 if (groupsetfirstchar && reqcharflags < 0)
5661 {
5662 reqchar = firstchar;
5663 reqcharflags = firstcharflags;
5664 }
5665
5666 for (i = 1; i < repeat_min; i++)
5667 {
5668 pcre_uchar *hc;
5669 pcre_uchar *this_hwm = cd->hwm;
5670 memcpy(code, previous, IN_UCHARS(len));
5671
5672 while (cd->hwm > cd->start_workspace + cd->workspace_size -
5673 WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5674 {
5675 int save_offset = save_hwm - cd->start_workspace;
5676 int this_offset = this_hwm - cd->start_workspace;
5677 *errorcodeptr = expand_workspace(cd);
5678 if (*errorcodeptr != 0) goto FAILED;
5679 save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5680 this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5681 }
5682
5683 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5684 {
5685 PUT(cd->hwm, 0, GET(hc, 0) + len);
5686 cd->hwm += LINK_SIZE;
5687 }
5688 save_hwm = this_hwm;
5689 code += len;
5690 }
5691 }
5692 }
5693
5694 if (repeat_max > 0) repeat_max -= repeat_min;
5695 }
5696
5697 /* This code is common to both the zero and non-zero minimum cases. If
5698 the maximum is limited, it replicates the group in a nested fashion,
5699 remembering the bracket starts on a stack. In the case of a zero minimum,
5700 the first one was set up above. In all cases the repeat_max now specifies
5701 the number of additional copies needed. Again, we must remember to
5702 replicate entries on the forward reference list. */
5703
5704 if (repeat_max >= 0)
5705 {
5706 /* In the pre-compile phase, we don't actually do the replication. We
5707 just adjust the length as if we had. For each repetition we must add 1
5708 to the length for BRAZERO and for all but the last repetition we must
5709 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
5710 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
5711 a 64-bit integer type when available, otherwise double. */
5712
5713 if (lengthptr != NULL && repeat_max > 0)
5714 {
5715 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
5716 2 - 2*LINK_SIZE; /* Last one doesn't nest */
5717 if ((INT64_OR_DOUBLE)repeat_max *
5718 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
5719 > (INT64_OR_DOUBLE)INT_MAX ||
5720 OFLOW_MAX - *lengthptr < delta)
5721 {
5722 *errorcodeptr = ERR20;
5723 goto FAILED;
5724 }
5725 *lengthptr += delta;
5726 }
5727
5728 /* This is compiling for real */
5729
5730 else for (i = repeat_max - 1; i >= 0; i--)
5731 {
5732 pcre_uchar *hc;
5733 pcre_uchar *this_hwm = cd->hwm;
5734
5735 *code++ = OP_BRAZERO + repeat_type;
5736
5737 /* All but the final copy start a new nesting, maintaining the
5738 chain of brackets outstanding. */
5739
5740 if (i != 0)
5741 {
5742 int offset;
5743 *code++ = OP_BRA;
5744 offset = (bralink == NULL)? 0 : (int)(code - bralink);
5745 bralink = code;
5746 PUTINC(code, 0, offset);
5747 }
5748
5749 memcpy(code, previous, IN_UCHARS(len));
5750
5751 /* Ensure there is enough workspace for forward references before
5752 copying them. */
5753
5754 while (cd->hwm > cd->start_workspace + cd->workspace_size -
5755 WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5756 {
5757 int save_offset = save_hwm - cd->start_workspace;
5758 int this_offset = this_hwm - cd->start_workspace;
5759 *errorcodeptr = expand_workspace(cd);
5760 if (*errorcodeptr != 0) goto FAILED;
5761 save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5762 this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5763 }
5764
5765 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5766 {
5767 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
5768 cd->hwm += LINK_SIZE;
5769 }
5770 save_hwm = this_hwm;
5771 code += len;
5772 }
5773
5774 /* Now chain through the pending brackets, and fill in their length
5775 fields (which are holding the chain links pro tem). */
5776
5777 while (bralink != NULL)
5778 {
5779 int oldlinkoffset;
5780 int offset = (int)(code - bralink + 1);
5781 pcre_uchar *bra = code - offset;
5782 oldlinkoffset = GET(bra, 1);
5783 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
5784 *code++ = OP_KET;
5785 PUTINC(code, 0, offset);
5786 PUT(bra, 1, offset);
5787 }
5788 }
5789
5790 /* If the maximum is unlimited, set a repeater in the final copy. For
5791 ONCE brackets, that's all we need to do. However, possessively repeated
5792 ONCE brackets can be converted into non-capturing brackets, as the
5793 behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
5794 deal with possessive ONCEs specially.
5795
5796 Otherwise, when we are doing the actual compile phase, check to see
5797 whether this group is one that could match an empty string. If so,
5798 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
5799 that runtime checking can be done. [This check is also applied to ONCE
5800 groups at runtime, but in a different way.]
5801
5802 Then, if the quantifier was possessive and the bracket is not a
5803 conditional, we convert the BRA code to the POS form, and the KET code to
5804 KETRPOS. (It turns out to be convenient at runtime to detect this kind of
5805 subpattern at both the start and at the end.) The use of special opcodes
5806 makes it possible to reduce greatly the stack usage in pcre_exec(). If
5807 the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
5808
5809 Then, if the minimum number of matches is 1 or 0, cancel the possessive
5810 flag so that the default action below, of wrapping everything inside
5811 atomic brackets, does not happen. When the minimum is greater than 1,
5812 there will be earlier copies of the group, and so we still have to wrap
5813 the whole thing. */
5814
5815 else
5816 {
5817 pcre_uchar *ketcode = code - 1 - LINK_SIZE;
5818 pcre_uchar *bracode = ketcode - GET(ketcode, 1);
5819
5820 /* Convert possessive ONCE brackets to non-capturing */
5821
5822 if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
5823 possessive_quantifier) *bracode = OP_BRA;
5824
5825 /* For non-possessive ONCE brackets, all we need to do is to
5826 set the KET. */
5827
5828 if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
5829 *ketcode = OP_KETRMAX + repeat_type;
5830
5831 /* Handle non-ONCE brackets and possessive ONCEs (which have been
5832 converted to non-capturing above). */
5833
5834 else
5835 {
5836 /* In the compile phase, check for empty string matching. */
5837
5838 if (lengthptr == NULL)
5839 {
5840 pcre_uchar *scode = bracode;
5841 do
5842 {
5843 if (could_be_empty_branch(scode, ketcode, utf, cd, NULL))
5844 {
5845 *bracode += OP_SBRA - OP_BRA;
5846 break;
5847 }
5848 scode += GET(scode, 1);
5849 }
5850 while (*scode == OP_ALT);
5851 }
5852
5853 /* Handle possessive quantifiers. */
5854
5855 if (possessive_quantifier)
5856 {
5857 /* For COND brackets, we wrap the whole thing in a possessively
5858 repeated non-capturing bracket, because we have not invented POS
5859 versions of the COND opcodes. Because we are moving code along, we
5860 must ensure that any pending recursive references are updated. */
5861
5862 if (*bracode == OP_COND || *bracode == OP_SCOND)
5863 {
5864 int nlen = (int)(code - bracode);
5865 *code = OP_END;
5866 adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm);
5867 memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
5868 code += 1 + LINK_SIZE;
5869 nlen += 1 + LINK_SIZE;
5870 *bracode = OP_BRAPOS;
5871 *code++ = OP_KETRPOS;
5872 PUTINC(code, 0, nlen);
5873 PUT(bracode, 1, nlen);
5874 }
5875
5876 /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
5877
5878 else
5879 {
5880 *bracode += 1; /* Switch to xxxPOS opcodes */
5881 *ketcode = OP_KETRPOS;
5882 }
5883
5884 /* If the minimum is zero, mark it as possessive, then unset the
5885 possessive flag when the minimum is 0 or 1. */
5886
5887 if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
5888 if (repeat_min < 2) possessive_quantifier = FALSE;
5889 }
5890
5891 /* Non-possessive quantifier */
5892
5893 else *ketcode = OP_KETRMAX + repeat_type;
5894 }
5895 }
5896 }
5897
5898 /* If previous is OP_FAIL, it was generated by an empty class [] in
5899 JavaScript mode. The other ways in which OP_FAIL can be generated, that is
5900 by (*FAIL) or (?!), set previous to NULL, which gives a "nothing to repeat"
5901 error above. We can just ignore the repeat in the JS case. */
5902
5903 else if (*previous == OP_FAIL) goto END_REPEAT;
5904
5905 /* Else there's some kind of shambles */
5906
5907 else
5908 {
5909 *errorcodeptr = ERR11;
5910 goto FAILED;
5911 }
5912
5913 /* If the character following a repeat is '+', or if certain optimization
5914 tests above succeeded, possessive_quantifier is TRUE. For some opcodes,
5915 there are special alternative opcodes for this case. For anything else, we
5916 wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'
5917 notation is just syntactic sugar, taken from Sun's Java package, but the
5918 special opcodes can optimize it.
5919
5920 Some (but not all) possessively repeated subpatterns have already been
5921 completely handled in the code just above. For them, possessive_quantifier
5922 is always FALSE at this stage.
5923
5924 Note that the repeated item starts at tempcode, not at previous, which
5925 might be the first part of a string whose (former) last char we repeated.
5926
5927 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
5928 an 'upto' may follow. We skip over an 'exact' item, and then test the
5929 length of what remains before proceeding. */
5930
5931 if (possessive_quantifier)
5932 {
5933 int len;
5934
5935 if (*tempcode == OP_TYPEEXACT)
5936 tempcode += PRIV(OP_lengths)[*tempcode] +
5937 ((tempcode[1 + IMM2_SIZE] == OP_PROP
5938 || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
5939
5940 else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
5941 {
5942 tempcode += PRIV(OP_lengths)[*tempcode];
5943 #ifdef SUPPORT_UTF
5944 if (utf && HAS_EXTRALEN(tempcode[-1]))
5945 tempcode += GET_EXTRALEN(tempcode[-1]);
5946 #endif
5947 }
5948
5949 len = (int)(code - tempcode);
5950 if (len > 0) switch (*tempcode)
5951 {
5952 case OP_STAR: *tempcode = OP_POSSTAR; break;
5953 case OP_PLUS: *tempcode = OP_POSPLUS; break;
5954 case OP_QUERY: *tempcode = OP_POSQUERY; break;
5955 case OP_UPTO: *tempcode = OP_POSUPTO; break;
5956
5957 case OP_STARI: *tempcode = OP_POSSTARI; break;
5958 case OP_PLUSI: *tempcode = OP_POSPLUSI; break;
5959 case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
5960 case OP_UPTOI: *tempcode = OP_POSUPTOI; break;
5961
5962 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
5963 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
5964 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
5965 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
5966
5967 case OP_NOTSTARI: *tempcode = OP_NOTPOSSTARI; break;
5968 case OP_NOTPLUSI: *tempcode = OP_NOTPOSPLUSI; break;
5969 case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
5970 case OP_NOTUPTOI: *tempcode = OP_NOTPOSUPTOI; break;
5971
5972 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
5973 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
5974 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
5975 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
5976
5977 /* Because we are moving code along, we must ensure that any
5978 pending recursive references are updated. */
5979
5980 default:
5981 *code = OP_END;
5982 adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
5983 memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
5984 code += 1 + LINK_SIZE;
5985 len += 1 + LINK_SIZE;
5986 tempcode[0] = OP_ONCE;
5987 *code++ = OP_KET;
5988 PUTINC(code, 0, len);
5989 PUT(tempcode, 1, len);
5990 break;
5991 }
5992 }
5993
5994 /* In all cases we no longer have a previous item. We also set the
5995 "follows varying string" flag for subsequently encountered reqchars if
5996 it isn't already set and we have just passed a varying length item. */
5997
5998 END_REPEAT:
5999 previous = NULL;
6000 cd->req_varyopt |= reqvary;
6001 break;
6002
6003
6004 /* ===================================================================*/
6005 /* Start of nested parenthesized sub-expression, or comment or lookahead or
6006 lookbehind or option setting or condition or all the other extended
6007 parenthesis forms. */
6008
6009 case CHAR_LEFT_PARENTHESIS:
6010 newoptions = options;
6011 skipbytes = 0;
6012 bravalue = OP_CBRA;
6013 save_hwm = cd->hwm;
6014 reset_bracount = FALSE;
6015
6016 /* First deal with various "verbs" that can be introduced by '*'. */
6017
6018 ptr++;
6019 if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
6020 || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
6021 {
6022 int i, namelen;
6023 int arglen = 0;
6024 const char *vn = verbnames;
6025 const pcre_uchar *name = ptr + 1;
6026 const pcre_uchar *arg = NULL;
6027 previous = NULL;
6028 ptr++;
6029 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
6030 namelen = (int)(ptr - name);
6031
6032 /* It appears that Perl allows any characters whatsoever, other than
6033 a closing parenthesis, to appear in arguments, so we no longer insist on
6034 letters, digits, and underscores. */
6035
6036 if (*ptr == CHAR_COLON)
6037 {
6038 arg = ++ptr;
6039 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6040 arglen = (int)(ptr - arg);
6041 if ((unsigned int)arglen > MAX_MARK)
6042 {
6043 *errorcodeptr = ERR75;
6044 goto FAILED;
6045 }
6046 }
6047
6048 if (*ptr != CHAR_RIGHT_PARENTHESIS)
6049 {
6050 *errorcodeptr = ERR60;
6051 goto FAILED;
6052 }
6053
6054 /* Scan the table of verb names */
6055
6056 for (i = 0; i < verbcount; i++)
6057 {
6058 if (namelen == verbs[i].len &&
6059 STRNCMP_UC_C8(name, vn, namelen) == 0)
6060 {
6061 int setverb;
6062
6063 /* Check for open captures before ACCEPT and convert it to
6064 ASSERT_ACCEPT if in an assertion. */
6065
6066 if (verbs[i].op == OP_ACCEPT)
6067 {
6068 open_capitem *oc;
6069 if (arglen != 0)
6070 {
6071 *errorcodeptr = ERR59;
6072 goto FAILED;
6073 }
6074 cd->had_accept = TRUE;
6075 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6076 {
6077 *code++ = OP_CLOSE;
6078 PUT2INC(code, 0, oc->number);
6079 }
6080 setverb = *code++ =
6081 (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6082
6083 /* Do not set firstchar after *ACCEPT */
6084 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
6085 }
6086
6087 /* Handle other cases with/without an argument */
6088
6089 else if (arglen == 0)
6090 {
6091 if (verbs[i].op < 0) /* Argument is mandatory */
6092 {
6093 *errorcodeptr = ERR66;
6094 goto FAILED;
6095 }
6096 setverb = *code++ = verbs[i].op;
6097 }
6098
6099 else
6100 {
6101 if (verbs[i].op_arg < 0) /* Argument is forbidden */
6102 {
6103 *errorcodeptr = ERR59;
6104 goto FAILED;
6105 }
6106 setverb = *code++ = verbs[i].op_arg;
6107 *code++ = arglen;
6108 memcpy(code, arg, IN_UCHARS(arglen));
6109 code += arglen;
6110 *code++ = 0;
6111 }
6112
6113 switch (setverb)
6114 {
6115 case OP_THEN:
6116 case OP_THEN_ARG:
6117 cd->external_flags |= PCRE_HASTHEN;
6118 break;
6119
6120 case OP_PRUNE:
6121 case OP_PRUNE_ARG:
6122 case OP_SKIP:
6123 case OP_SKIP_ARG:
6124 cd->had_pruneorskip = TRUE;
6125 break;
6126 }
6127
6128 break; /* Found verb, exit loop */
6129 }
6130
6131 vn += verbs[i].len + 1;
6132 }
6133
6134 if (i < verbcount) continue; /* Successfully handled a verb */
6135 *errorcodeptr = ERR60; /* Verb not recognized */
6136 goto FAILED;
6137 }
6138
6139 /* Deal with the extended parentheses; all are introduced by '?', and the
6140 appearance of any of them means that this is not a capturing group. */
6141
6142 else if (*ptr == CHAR_QUESTION_MARK)
6143 {
6144 int i, set, unset, namelen;
6145 int *optset;
6146 const pcre_uchar *name;
6147 pcre_uchar *slot;
6148
6149 switch (*(++ptr))
6150 {
6151 case CHAR_NUMBER_SIGN: /* Comment; skip to ket */
6152 ptr++;
6153 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6154 if (*ptr == CHAR_NULL)
6155 {
6156 *errorcodeptr = ERR18;
6157 goto FAILED;
6158 }
6159 continue;
6160
6161
6162 /* ------------------------------------------------------------ */
6163 case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
6164 reset_bracount = TRUE;
6165 /* Fall through */
6166
6167 /* ------------------------------------------------------------ */
6168 case CHAR_COLON: /* Non-capturing bracket */
6169 bravalue = OP_BRA;
6170 ptr++;
6171 break;
6172
6173
6174 /* ------------------------------------------------------------ */
6175 case CHAR_LEFT_PARENTHESIS:
6176 bravalue = OP_COND; /* Conditional group */
6177 tempptr = ptr;
6178
6179 /* A condition can be an assertion, a number (referring to a numbered
6180 group), a name (referring to a named group), or 'R', referring to
6181 recursion. R<digits> and R&name are also permitted for recursion tests.
6182
6183 There are several syntaxes for testing a named group: (?(name)) is used
6184 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
6185
6186 There are two unfortunate ambiguities, caused by history. (a) 'R' can
6187 be the recursive thing or the name 'R' (and similarly for 'R' followed
6188 by digits), and (b) a number could be a name that consists of digits.
6189 In both cases, we look for a name first; if not found, we try the other
6190 cases.
6191
6192 For compatibility with auto-callouts, we allow a callout to be
6193 specified before a condition that is an assertion. First, check for the
6194 syntax of a callout; if found, adjust the temporary pointer that is
6195 used to check for an assertion condition. That's all that is needed! */
6196
6197 if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
6198 {
6199 for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
6200 if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
6201 tempptr += i + 1;
6202 }
6203
6204 /* For conditions that are assertions, check the syntax, and then exit
6205 the switch. This will take control down to where bracketed groups,
6206 including assertions, are processed. */
6207
6208 if (tempptr[1] == CHAR_QUESTION_MARK &&
6209 (tempptr[2] == CHAR_EQUALS_SIGN ||
6210 tempptr[2] == CHAR_EXCLAMATION_MARK ||
6211 tempptr[2] == CHAR_LESS_THAN_SIGN))
6212 break;
6213
6214 /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6215 need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6216
6217 code[1+LINK_SIZE] = OP_CREF;
6218 skipbytes = 1+IMM2_SIZE;
6219 refsign = -1;
6220
6221 /* Check for a test for recursion in a named group. */
6222
6223 if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
6224 {
6225 terminator = -1;
6226 ptr += 2;
6227 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
6228 }
6229
6230 /* Check for a test for a named group's having been set, using the Perl
6231 syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6232 syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). As names may
6233 consist entirely of digits, there is scope for ambiguity. */
6234
6235 else if (ptr[1] == CHAR_LESS_THAN_SIGN)
6236 {
6237 terminator = CHAR_GREATER_THAN_SIGN;
6238 ptr++;
6239 }
6240 else if (ptr[1] == CHAR_APOSTROPHE)
6241 {
6242 terminator = CHAR_APOSTROPHE;
6243 ptr++;
6244 }
6245 else
6246 {
6247 terminator = CHAR_NULL;
6248 if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
6249 }
6250
6251 /* When a name is one of a number of duplicates, a different opcode is
6252 used and it needs more memory. Unfortunately we cannot tell whether a
6253 name is a duplicate in the first pass, so we have to allow for more
6254 memory except when we know it is a relative numerical reference. */
6255
6256 if (refsign < 0 && lengthptr != NULL) *lengthptr += IMM2_SIZE;
6257
6258 /* We now expect to read a name (possibly all digits); anything else
6259 is an error. In the case of all digits, also get it as a number. */
6260
6261 if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)
6262 {
6263 ptr += 1; /* To get the right offset */
6264 *errorcodeptr = ERR28;
6265 goto FAILED;
6266 }
6267
6268 recno = 0;
6269 name = ++ptr;
6270 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
6271 {
6272 if (recno >= 0)
6273 recno = (IS_DIGIT(*ptr))? recno * 10 + (int)(*ptr - CHAR_0) : -1;
6274 ptr++;
6275 }
6276 namelen = (int)(ptr - name);
6277
6278 /* Check the terminator */
6279
6280 if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6281 *ptr++ != CHAR_RIGHT_PARENTHESIS)
6282 {
6283 ptr--; /* Error offset */
6284 *errorcodeptr = ERR26;
6285 goto FAILED;
6286 }
6287
6288 /* Do no further checking in the pre-compile phase. */
6289
6290 if (lengthptr != NULL) break;
6291
6292 /* In the real compile we do the work of looking for the actual
6293 reference. If the string started with "+" or "-" we require the rest to
6294 be digits, in which case recno will be set. */
6295
6296 if (refsign > 0)
6297 {
6298 if (recno <= 0)
6299 {
6300 *errorcodeptr = ERR58;
6301 goto FAILED;
6302 }
6303 recno = (refsign == CHAR_MINUS)?
6304 cd->bracount - recno + 1 : recno +cd->bracount;
6305 if (recno <= 0 || recno > cd->final_bracount)
6306 {
6307 *errorcodeptr = ERR15;
6308 goto FAILED;
6309 }
6310 PUT2(code, 2+LINK_SIZE, recno);
6311 break;
6312 }
6313
6314 /* Otherwise (did not start with "+" or "-"), start by looking for the
6315 name. */
6316
6317 slot = cd->name_table;
6318 for (i = 0; i < cd->names_found; i++)
6319 {
6320 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
6321 slot += cd->name_entry_size;
6322 }
6323
6324 /* Found the named subpattern. If the name is duplicated, add one to
6325 the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6326 appropriate data values. Otherwise, just insert the unique subpattern
6327 number. */
6328
6329 if (i < cd->names_found)
6330 {
6331 int offset = i++;
6332 int count = 1;
6333 recno = GET2(slot, 0); /* Number from first found */
6334 for (; i < cd->names_found; i++)
6335 {
6336 slot += cd->name_entry_size;
6337 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0) break;
6338 count++;
6339 }
6340 if (count > 1)
6341 {
6342 PUT2(code, 2+LINK_SIZE, offset);
6343 PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6344 skipbytes += IMM2_SIZE;
6345 code[1+LINK_SIZE]++;
6346 }
6347 else /* Not a duplicated name */
6348 {
6349 PUT2(code, 2+LINK_SIZE, recno);
6350 }
6351 }
6352
6353 /* If terminator == CHAR_NULL it means that the name followed directly
6354 after the opening parenthesis [e.g. (?(abc)...] and in this case there
6355 are some further alternatives to try. For the cases where terminator !=
6356 0 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
6357 now checked all the possibilities, so give an error. */
6358
6359 else if (terminator != CHAR_NULL)
6360 {
6361 *errorcodeptr = ERR15;
6362 goto FAILED;
6363 }
6364
6365 /* Check for (?(R) for recursion. Allow digits after R to specify a
6366 specific group number. */
6367
6368 else if (*name == CHAR_R)
6369 {
6370 recno = 0;
6371 for (i = 1; i < namelen; i++)
6372 {
6373 if (!IS_DIGIT(name[i]))
6374 {
6375 *errorcodeptr = ERR15;
6376 goto FAILED;
6377 }
6378 recno = recno * 10 + name[i] - CHAR_0;
6379 }
6380 if (recno == 0) recno = RREF_ANY;
6381 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
6382 PUT2(code, 2+LINK_SIZE, recno);
6383 }
6384
6385 /* Similarly, check for the (?(DEFINE) "condition", which is always
6386 false. */
6387
6388 else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
6389 {
6390 code[1+LINK_SIZE] = OP_DEF;
6391 skipbytes = 1;
6392 }
6393
6394 /* Check for the "name" actually being a subpattern number. We are
6395 in the second pass here, so final_bracount is set. */
6396
6397 else if (recno > 0 && recno <= cd->final_bracount)
6398 {
6399 PUT2(code, 2+LINK_SIZE, recno);
6400 }
6401
6402 /* Either an unidentified subpattern, or a reference to (?(0) */
6403
6404 else
6405 {
6406 *errorcodeptr = (recno == 0)? ERR35: ERR15;
6407 goto FAILED;
6408 }
6409 break;
6410
6411
6412 /* ------------------------------------------------------------ */
6413 case CHAR_EQUALS_SIGN: /* Positive lookahead */
6414 bravalue = OP_ASSERT;
6415 cd->assert_depth += 1;
6416 ptr++;
6417 break;
6418
6419
6420 /* ------------------------------------------------------------ */
6421 case CHAR_EXCLAMATION_MARK: /* Negative lookahead */
6422 ptr++;
6423 if (*ptr == CHAR_RIGHT_PARENTHESIS) /* Optimize (?!) */
6424 {
6425 *code++ = OP_FAIL;
6426 previous = NULL;
6427 continue;
6428 }
6429 bravalue = OP_ASSERT_NOT;
6430 cd->assert_depth += 1;
6431 break;
6432
6433
6434 /* ------------------------------------------------------------ */
6435 case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */
6436 switch (ptr[1])
6437 {
6438 case CHAR_EQUALS_SIGN: /* Positive lookbehind */
6439 bravalue = OP_ASSERTBACK;
6440 cd->assert_depth += 1;
6441 ptr += 2;
6442 break;
6443
6444 case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */
6445 bravalue = OP_ASSERTBACK_NOT;
6446 cd->assert_depth += 1;
6447 ptr += 2;
6448 break;
6449
6450 default: /* Could be name define, else bad */
6451 if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
6452 goto DEFINE_NAME;
6453 ptr++; /* Correct offset for error */
6454 *errorcodeptr = ERR24;
6455 goto FAILED;
6456 }
6457 break;
6458
6459
6460 /* ------------------------------------------------------------ */
6461 case CHAR_GREATER_THAN_SIGN: /* One-time brackets */
6462 bravalue = OP_ONCE;
6463 ptr++;
6464 break;
6465
6466
6467 /* ------------------------------------------------------------ */
6468 case CHAR_C: /* Callout - may be followed by digits; */
6469 previous_callout = code; /* Save for later completion */
6470 after_manual_callout = 1; /* Skip one item before completing */
6471 *code++ = OP_CALLOUT;
6472 {
6473 int n = 0;
6474 ptr++;
6475 while(IS_DIGIT(*ptr))
6476 n = n * 10 + *ptr++ - CHAR_0;
6477 if (*ptr != CHAR_RIGHT_PARENTHESIS)
6478 {
6479 *errorcodeptr = ERR39;
6480 goto FAILED;
6481 }
6482 if (n > 255)
6483 {
6484 *errorcodeptr = ERR38;
6485 goto FAILED;
6486 }
6487 *code++ = n;
6488 PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
6489 PUT(code, LINK_SIZE, 0); /* Default length */
6490 code += 2 * LINK_SIZE;
6491 }
6492 previous = NULL;
6493 continue;
6494
6495
6496 /* ------------------------------------------------------------ */
6497 case CHAR_P: /* Python-style named subpattern handling */
6498 if (*(++ptr) == CHAR_EQUALS_SIGN ||
6499 *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */
6500 {
6501 is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
6502 terminator = CHAR_RIGHT_PARENTHESIS;
6503 goto NAMED_REF_OR_RECURSE;
6504 }
6505 else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */
6506 {
6507 *errorcodeptr = ERR41;
6508 goto FAILED;
6509 }
6510 /* Fall through to handle (?P< as (?< is handled */
6511
6512
6513 /* ------------------------------------------------------------ */
6514 DEFINE_NAME: /* Come here from (?< handling */
6515 case CHAR_APOSTROPHE:
6516 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
6517 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6518 name = ++ptr;
6519
6520 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6521 namelen = (int)(ptr - name);
6522
6523 /* In the pre-compile phase, do a syntax check, remember the longest
6524 name, and then remember the group in a vector, expanding it if
6525 necessary. Duplicates for the same number are skipped; other duplicates
6526 are checked for validity. In the actual compile, there is nothing to
6527 do. */
6528
6529 if (lengthptr != NULL)
6530 {
6531 named_group *ng;
6532 pcre_uint32 number = cd->bracount + 1;
6533
6534 if (*ptr != (pcre_uchar)terminator)
6535 {
6536 *errorcodeptr = ERR42;
6537 goto FAILED;
6538 }
6539
6540 if (cd->names_found >= MAX_NAME_COUNT)
6541 {
6542 *errorcodeptr = ERR49;
6543 goto FAILED;
6544 }
6545
6546 if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
6547 {
6548 cd->name_entry_size = namelen + IMM2_SIZE + 1;
6549 if (namelen > MAX_NAME_SIZE)
6550 {
6551 *errorcodeptr = ERR48;
6552 goto FAILED;
6553 }
6554 }
6555
6556 /* Scan the list to check for duplicates. For duplicate names, if the
6557 number is the same, break the loop, which causes the name to be
6558 discarded; otherwise, if DUPNAMES is not set, give an error.
6559 If it is set, allow the name with a different number, but continue
6560 scanning in case this is a duplicate with the same number. For
6561 non-duplicate names, give an error if the number is duplicated. */
6562
6563 ng = cd->named_groups;
6564 for (i = 0; i < cd->names_found; i++, ng++)
6565 {
6566 if (namelen == ng->length &&
6567 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
6568 {
6569 if (ng->number == number) break;
6570 if ((options & PCRE_DUPNAMES) == 0)
6571 {
6572 *errorcodeptr = ERR43;
6573 goto FAILED;
6574 }
6575 cd->dupnames = TRUE; /* Duplicate names exist */
6576 }
6577 else if (ng->number == number)
6578 {
6579 *errorcodeptr = ERR65;
6580 goto FAILED;
6581 }
6582 }
6583
6584 if (i >= cd->names_found) /* Not a duplicate with same number */
6585 {
6586 /* Increase the list size if necessary */
6587
6588 if (cd->names_found >= cd->named_group_list_size)
6589 {
6590 int newsize = cd->named_group_list_size * 2;
6591 named_group *newspace = (PUBL(malloc))
6592 (newsize * sizeof(named_group));
6593
6594 if (newspace == NULL)
6595 {
6596 *errorcodeptr = ERR21;
6597 goto FAILED;
6598 }
6599
6600 memcpy(newspace, cd->named_groups,
6601 cd->named_group_list_size * sizeof(named_group));
6602 if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
6603 (PUBL(free))((void *)cd->named_groups);
6604 cd->named_groups = newspace;
6605 cd->named_group_list_size = newsize;
6606 }
6607
6608 cd->named_groups[cd->names_found].name = name;
6609 cd->named_groups[cd->names_found].length = namelen;
6610 cd->named_groups[cd->names_found].number = number;
6611 cd->names_found++;
6612 }
6613 }
6614
6615 ptr++; /* Move past > or ' in both passes. */
6616 goto NUMBERED_GROUP;
6617
6618
6619 /* ------------------------------------------------------------ */
6620 case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */
6621 terminator = CHAR_RIGHT_PARENTHESIS;
6622 is_recurse = TRUE;
6623 /* Fall through */
6624
6625 /* We come here from the Python syntax above that handles both
6626 references (?P=name) and recursion (?P>name), as well as falling
6627 through from the Perl recursion syntax (?&name). We also come here from
6628 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
6629 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
6630
6631 NAMED_REF_OR_RECURSE:
6632 name = ++ptr;
6633 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6634 namelen = (int)(ptr - name);
6635
6636 /* In the pre-compile phase, do a syntax check. We used to just set
6637 a dummy reference number, because it was not used in the first pass.
6638 However, with the change of recursive back references to be atomic,
6639 we have to look for the number so that this state can be identified, as
6640 otherwise the incorrect length is computed. If it's not a backwards
6641 reference, the dummy number will do. */
6642
6643 if (lengthptr != NULL)
6644 {
6645 named_group *ng;
6646
6647 if (namelen == 0)
6648 {
6649 *errorcodeptr = ERR62;
6650 goto FAILED;
6651 }
6652 if (*ptr != (pcre_uchar)terminator)
6653 {
6654 *errorcodeptr = ERR42;
6655 goto FAILED;
6656 }
6657 if (namelen > MAX_NAME_SIZE)