/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1376 - (show annotations)
Sat Oct 12 18:02:11 2013 UTC (5 years, 11 months ago) by ph10
File MIME type: text/plain
File size: 295562 byte(s)
Add U+0085 and U+180E to what \s matches in UCP mode, to match Perl.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2013 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
57 is also used by pcretest. PCRE_DEBUG is not defined when building a production
58 library. We do not need to select pcre16_printint.c specially, because the
59 COMPILE_PCREx macro will already be appropriately set. */
60
61 #ifdef PCRE_DEBUG
62 /* pcre_printint.c should not include any headers */
63 #define PCRE_INCLUDED
64 #include "pcre_printint.c"
65 #undef PCRE_INCLUDED
66 #endif
67
68
/* Macro for setting individual bits in class bitmaps: bit number b is set in
the bitmap a, i.e. element b/8 of a receives bit b&7. Both the bitmap argument
and the whole expansion are parenthesized, so that a may be any pointer
expression (for example "map + n") and the macro can be used safely inside a
larger expression. Note that b is evaluated twice and must therefore be free of
side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)&7)))
72
73 /* Maximum length value to check against when making sure that the integer that
74 holds the compiled pattern length does not overflow. We make it a bit less than
75 INT_MAX to allow for adding in group terminating bytes, so that we don't have
76 to check them every time. */
77
78 #define OFLOW_MAX (INT_MAX - 20) /* 20 below INT_MAX: headroom for group terminators */
79
80 /* Forward declarations to allow mutual recursion */
81
/* Forward declaration only; the body of add_list_to_class() is not in this
part of the file. It returns an int and takes six (unnamed) parameters. */
82 static int
83 add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
84 const pcre_uint32 *, unsigned int);
85
/* Forward declaration only; the body of compile_regex() is not in this part of
the file. It returns a BOOL and takes fifteen (unnamed) parameters. */
86 static BOOL
87 compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
88 pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
89 compile_data *, int *);
90
91
92
93 /*************************************************
94 * Code parameters and static tables *
95 *************************************************/
96
97 /* This value specifies the size of stack workspace that is used during the
98 first pre-compile phase that determines how much memory is required. The regex
99 is partly compiled into this space, but the compiled parts are discarded as
100 soon as they can be, so that hopefully there will never be an overrun. The code
101 does, however, check for an overrun. The largest amount I've seen used is 218,
102 so this number is very generous.
103
104 The same workspace is used during the second, actual compile phase for
105 remembering forward references to groups so that they can be filled in at the
106 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
107 is 4 there is plenty of room for most patterns. However, the memory can get
108 filled up by repetitions of forward references, for example patterns like
109 /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
110 that the workspace is expanded using malloc() in this situation. The value
111 below is therefore a minimum, and we put a maximum on it for safety. The
112 minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
113 kicks in at the same number of forward references in all cases. */
114
115 #define COMPILE_WORK_SIZE (2048*LINK_SIZE) /* minimum workspace size, scaled by LINK_SIZE */
116 #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE) /* safety ceiling: 100 times the minimum */
117
118 /* This value determines the size of the initial vector that is used for
119 remembering named groups during the pre-compile. It is allocated on the stack,
120 but if it is too small, it is expanded using malloc(), in a similar way to the
121 workspace. The value is the number of slots in the list. */
122
123 #define NAMED_GROUP_LIST_SIZE 20 /* initial number of slots in the named-group list */
124
125 /* The overrun tests check for a slightly smaller size so that they detect the
126 overrun before it actually does run off the end of the data block. */
127
128 #define WORK_SIZE_SAFETY_MARGIN (100) /* amount by which the overrun tests undershoot the real size */
129
130 /* Private flags added to firstchar and reqchar. */
131
/* REQ_CASELESS and REQ_VARY are single-bit flags (bits 0 and 1); REQ_UNSET and
REQ_NONE are two distinct negative values. */
132 #define REQ_CASELESS (1 << 0) /* Indicates caselessness */
133 #define REQ_VARY (1 << 1) /* Reqchar followed non-literal item */
134 /* Negative values for the firstchar and reqchar flags */
135 #define REQ_UNSET (-2) /* NOTE(review): presumably "nothing recorded yet" - confirm at the points of use */
136 #define REQ_NONE (-1) /* NOTE(review): presumably "known to have no such char" - confirm at the points of use */
137
138 /* Repeated character flags. */
139
/* Flag value 0x10000000 (bit 28) of type long: the char contains its length.
The "long" suffix is written as an upper-case L because a lower-case l is
easily misread as the digit 1. */
#define UTF_LENGTH 0x10000000L
141
142 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
143 are simple data values; negative values are for special things like \d and so
144 on. Zero means further processing is needed (for things like \x), or the escape
145 is invalid. */
146
147 #ifndef EBCDIC
148
149 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
150 in UTF-8 mode. */
151
/* Entries are listed two per line in ASCII code order, starting at '0' and
ending at 'z'. The comment on each line names the character(s) whose escape
that line defines. */
152 static const short int escapes[] = {
153 0, 0, /* 0 1 */
154 0, 0, /* 2 3 */
155 0, 0, /* 4 5 */
156 0, 0, /* 6 7 */
157 0, 0, /* 8 9 */
158 CHAR_COLON, CHAR_SEMICOLON, /* : ; */
159 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, /* < = */
160 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK, /* > ? */
161 CHAR_COMMERCIAL_AT, -ESC_A, /* @ A */
162 -ESC_B, -ESC_C, /* B C */
163 -ESC_D, -ESC_E, /* D E */
164 0, -ESC_G, /* F G */
165 -ESC_H, 0, /* H I */
166 0, -ESC_K, /* J K */
167 0, 0, /* L M */
168 -ESC_N, 0, /* N O */
169 -ESC_P, -ESC_Q, /* P Q */
170 -ESC_R, -ESC_S, /* R S */
171 0, 0, /* T U */
172 -ESC_V, -ESC_W, /* V W */
173 -ESC_X, 0, /* X Y */
174 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, /* Z [ */
175 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, /* backslash ] */
176 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, /* ^ _ */
177 CHAR_GRAVE_ACCENT, 7, /* ` a */
178 -ESC_b, 0, /* b c */
179 -ESC_d, ESC_e, /* d e */
180 ESC_f, 0, /* f g */
181 -ESC_h, 0, /* h i */
182 0, -ESC_k, /* j k */
183 0, 0, /* l m */
184 ESC_n, 0, /* n o */
185 -ESC_p, 0, /* p q */
186 ESC_r, -ESC_s, /* r s */
187 ESC_tee, 0, /* t u */
188 -ESC_v, -ESC_w, /* v w */
189 0, 0, /* x y */
190 -ESC_z /* z */
191 };
192
193 #else
194
195 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
196
/* EBCDIC variant of the escapes table. Each row holds eight entries and is
labelled with the hexadecimal code of its first entry; the first row starts at
0x48 and the last row ends at 0xFF. */
197 static const short int escapes[] = {
198 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
199 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
200 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
201 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
202 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
203 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
204 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
205 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
206 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
207 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
208 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
209 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
210 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
211 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
212 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
213 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
214 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
215 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
216 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
217 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
218 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
219 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
220 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
221 };
222 #endif
223
224
225 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
226 searched linearly. Put all the names into a single string, in order to reduce
227 the number of relocations when a shared library is dynamically linked. The
228 string is built from string macros so that it works in UTF-8 mode on EBCDIC
229 platforms. */
230
/* Describes one (*VERB): the length of its name and the opcode to use when the
verb appears without, respectively with, an argument. */
231 typedef struct verbitem {
232 int len; /* Length of verb name */
233 int op; /* Op when no arg, or -1 if arg mandatory */
234 int op_arg; /* Op when arg present, or -1 if not allowed */
235 } verbitem;
236
/* All verb names concatenated into one string, in the same order as the
entries of verbs[]. NOTE(review): the STRING_xxx0 macros are defined elsewhere
and presumably each expand to the name followed by a NUL - confirm. */
237 static const char verbnames[] =
238 "\0" /* Empty name is a shorthand for MARK */
239 STRING_MARK0
240 STRING_ACCEPT0
241 STRING_COMMIT0
242 STRING_F0
243 STRING_FAIL0
244 STRING_PRUNE0
245 STRING_SKIP0
246 STRING_THEN;
247
/* One entry per name in verbnames[], in the same order. Each entry is
{ name length, op without argument, op with argument }. */
248 static const verbitem verbs[] = {
249 { 0, -1, OP_MARK }, /* (empty name, shorthand for MARK) */
250 { 4, -1, OP_MARK }, /* MARK */
251 { 6, OP_ACCEPT, -1 }, /* ACCEPT */
252 { 6, OP_COMMIT, -1 }, /* COMMIT */
253 { 1, OP_FAIL, -1 }, /* F */
254 { 4, OP_FAIL, -1 }, /* FAIL */
255 { 5, OP_PRUNE, OP_PRUNE_ARG }, /* PRUNE */
256 { 4, OP_SKIP, OP_SKIP_ARG }, /* SKIP */
257 { 4, OP_THEN, OP_THEN_ARG } /* THEN */
258 };
259
260 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
261
262
263 /* Tables of names of POSIX character classes and their lengths. The names are
264 now all in a single string, to reduce the number of relocations when a shared
265 library is dynamically loaded. The list of lengths is terminated by a zero
266 length entry. The first three must be alpha, lower, upper, as this is assumed
267 for handling case independence. */
268
/* The fourteen POSIX class names concatenated into one string. NOTE(review):
the STRING_xxx0 macros are defined elsewhere and presumably each expand to the
name followed by a NUL - confirm. */
269 static const char posix_names[] =
270 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
271 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
272 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
273 STRING_word0 STRING_xdigit;
274
/* Lengths of the names in posix_names, in the same order: twelve names of
length 5 (alpha ... space), then word (4) and xdigit (6), then the terminating
zero entry. */
275 static const pcre_uint8 posix_name_lengths[] = {
276 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
277
278 /* Table of class bit maps for each POSIX class. Each class is formed from a
279 base map, with an optional addition or removal of another map. Then, for some
280 classes, there is some additional tweaking: for [:blank:] the vertical space
281 characters are removed, and for [:alpha:] and [:alnum:] the underscore
282 character is removed. The triples in the table consist of the base map offset,
283 second map offset or -1 if no second map, and a non-negative value for map
284 addition or a negative value for map subtraction (if there are two maps). The
285 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
286 remove vertical space characters, 2 => remove underscore. */
287
/* Three ints per class, one class per line, in the same order as posix_names:
base map offset, second map offset (or -1), and the add/subtract-and-tweak
code. */
288 static const int posix_class_maps[] = {
289 cbit_word, cbit_digit, -2, /* alpha */
290 cbit_lower, -1, 0, /* lower */
291 cbit_upper, -1, 0, /* upper */
292 cbit_word, -1, 2, /* alnum - word without underscore */
293 cbit_print, cbit_cntrl, 0, /* ascii */
294 cbit_space, -1, 1, /* blank - a GNU extension */
295 cbit_cntrl, -1, 0, /* cntrl */
296 cbit_digit, -1, 0, /* digit */
297 cbit_graph, -1, 0, /* graph */
298 cbit_print, -1, 0, /* print */
299 cbit_punct, -1, 0, /* punct */
300 cbit_space, -1, 0, /* space */
301 cbit_word, -1, 0, /* word - a Perl extension */
302 cbit_xdigit,-1, 0 /* xdigit */
303 };
304
305 /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
306 substitutes must be in the order of the names, defined above, and there are
307 both positive and negative cases. NULL means no substitute. */
308
309 #ifdef SUPPORT_UCP
/* Zero-terminated pattern fragments, spelled out one character at a time. The
trailing comment on each definition shows the text it spells, assuming each
CHAR_x macro denotes the character its name suggests. */
310 static const pcre_uchar string_PNd[] = { /* \P{Nd} */
311 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
312 CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
313 static const pcre_uchar string_pNd[] = { /* \p{Nd} */
314 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
315 CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
316 static const pcre_uchar string_PXsp[] = { /* \P{Xsp} */
317 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
318 CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
319 static const pcre_uchar string_pXsp[] = { /* \p{Xsp} */
320 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
321 CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
322 static const pcre_uchar string_PXwd[] = { /* \P{Xwd} */
323 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
324 CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
325 static const pcre_uchar string_pXwd[] = { /* \p{Xwd} */
326 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
327 CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
328
/* Replacement pattern text for \D \d \S \s \W \w, in that order (negated form
first in each pair). */
329 static const pcre_uchar *substitutes[] = {
330 string_PNd, /* \D */
331 string_pNd, /* \d */
332 string_PXsp, /* \S */ /* NOTE: Xsp is Perl space */
333 string_pXsp, /* \s */
334 string_PXwd, /* \W */
335 string_pXwd /* \w */
336 };
337
/* Zero-terminated pattern fragments used for the POSIX classes, spelled out
one character at a time. The trailing comment on each definition shows the text
it spells, assuming each CHAR_x macro denotes the character its name
suggests. */
338 static const pcre_uchar string_pL[] = { /* \p{L} */
339 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
340 CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
341 static const pcre_uchar string_pLl[] = { /* \p{Ll} */
342 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
343 CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
344 static const pcre_uchar string_pLu[] = { /* \p{Lu} */
345 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
346 CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
347 static const pcre_uchar string_pXan[] = { /* \p{Xan} */
348 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
349 CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
350 static const pcre_uchar string_h[] = { /* \h */
351 CHAR_BACKSLASH, CHAR_h, '\0' };
352 static const pcre_uchar string_pXps[] = { /* \p{Xps} */
353 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
354 CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
355 static const pcre_uchar string_PL[] = { /* \P{L} */
356 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
357 CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
358 static const pcre_uchar string_PLl[] = { /* \P{Ll} */
359 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
360 CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
361 static const pcre_uchar string_PLu[] = { /* \P{Lu} */
362 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
363 CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
364 static const pcre_uchar string_PXan[] = { /* \P{Xan} */
365 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
366 CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
367 static const pcre_uchar string_H[] = { /* \H */
368 CHAR_BACKSLASH, CHAR_H, '\0' };
369 static const pcre_uchar string_PXps[] = { /* \P{Xps} */
370 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
371 CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
372
/* Replacement pattern text for each POSIX class: fourteen positive entries in
the order of posix_names, followed by the fourteen negated entries in the same
order. NULL marks a class that has no substitute. */
373 static const pcre_uchar *posix_substitutes[] = {
374 string_pL, /* alpha */
375 string_pLl, /* lower */
376 string_pLu, /* upper */
377 string_pXan, /* alnum */
378 NULL, /* ascii */
379 string_h, /* blank */
380 NULL, /* cntrl */
381 string_pNd, /* digit */
382 NULL, /* graph */
383 NULL, /* print */
384 NULL, /* punct */
385 string_pXps, /* space */ /* NOTE: Xps is POSIX space */
386 string_pXwd, /* word */
387 NULL, /* xdigit */
388 /* Negated cases */
389 string_PL, /* ^alpha */
390 string_PLl, /* ^lower */
391 string_PLu, /* ^upper */
392 string_PXan, /* ^alnum */
393 NULL, /* ^ascii */
394 string_H, /* ^blank */
395 NULL, /* ^cntrl */
396 string_PNd, /* ^digit */
397 NULL, /* ^graph */
398 NULL, /* ^print */
399 NULL, /* ^punct */
400 string_PXps, /* ^space */ /* NOTE: Xps is POSIX space */
401 string_PXwd, /* ^word */
402 NULL /* ^xdigit */
403 };
/* Number of entries in posix_substitutes[], computed from the array itself. */
#define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(posix_substitutes[0]))
405 #endif
406
/* Two-level stringification. STRING() turns its argument into a string literal
exactly as written; XSTRING() lets the argument be macro-expanded first and
then stringifies the result. */
#define STRING(tok) #tok
#define XSTRING(mac) STRING(mac)
409
410 /* The texts of compile-time error messages. These are "char *" because they
411 are passed to the outside world. Do not ever re-use any error number, because
412 they are documented. Always add a new error instead. Messages marked DEAD below
413 are no longer used. This used to be a table of strings, but in order to reduce
414 the number of relocations needed when a shared library is loaded dynamically,
415 it is now one long string. We cannot use a table of offsets, because the
416 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
417 simply count through to the one we want - this isn't a performance issue
418 because these strings are used only when there is a compilation error.
419
420 Each substring ends with \0 to insert a null character. This includes the final
421 substring, so that the whole string ends with \0\0, which can be detected when
422 counting through. */
423
/* The messages are stored back to back in one string, each terminated by a
null character. A message is identified by its position in the sequence; the
numeric comments give the index of the message that follows them. */
424 static const char error_texts[] =
425 "no error\0"
426 "\\ at end of pattern\0"
427 "\\c at end of pattern\0"
428 "unrecognized character follows \\\0"
429 "numbers out of order in {} quantifier\0"
430 /* 5 */
431 "number too big in {} quantifier\0"
432 "missing terminating ] for character class\0"
433 "invalid escape sequence in character class\0"
434 "range out of order in character class\0"
435 "nothing to repeat\0"
436 /* 10 */
437 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
438 "internal error: unexpected repeat\0"
439 "unrecognized character after (? or (?-\0"
440 "POSIX named classes are supported only within a class\0"
441 "missing )\0"
442 /* 15 */
443 "reference to non-existent subpattern\0"
444 "erroffset passed as NULL\0"
445 "unknown option bit(s) set\0"
446 "missing ) after comment\0"
447 "parentheses nested too deeply\0" /** DEAD **/
448 /* 20 */
449 "regular expression is too large\0"
450 "failed to get memory\0"
451 "unmatched parentheses\0"
452 "internal error: code overflow\0"
453 "unrecognized character after (?<\0"
454 /* 25 */
455 "lookbehind assertion is not fixed length\0"
456 "malformed number or name after (?(\0"
457 "conditional group contains more than two branches\0"
458 "assertion expected after (?(\0"
459 "(?R or (?[+-]digits must be followed by )\0"
460 /* 30 */
461 "unknown POSIX class name\0"
462 "POSIX collating elements are not supported\0"
463 "this version of PCRE is compiled without UTF support\0"
464 "spare error\0" /** DEAD **/
465 "character value in \\x{} or \\o{} is too large\0"
466 /* 35 */
467 "invalid condition (?(0)\0"
468 "\\C not allowed in lookbehind assertion\0"
469 "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
470 "number after (?C is > 255\0"
471 "closing ) for (?C expected\0"
472 /* 40 */
473 "recursive call could loop indefinitely\0"
474 "unrecognized character after (?P\0"
475 "syntax error in subpattern name (missing terminator)\0"
476 "two named subpatterns have the same name\0"
477 "invalid UTF-8 string\0"
478 /* 45 */
479 "support for \\P, \\p, and \\X has not been compiled\0"
480 "malformed \\P or \\p sequence\0"
481 "unknown property name after \\P or \\p\0"
482 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
483 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
484 /* 50 */
485 "repeated subpattern is too long\0" /** DEAD **/
486 "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
487 "internal error: overran compiling workspace\0"
488 "internal error: previously-checked referenced subpattern not found\0"
489 "DEFINE group contains more than one branch\0"
490 /* 55 */
491 "repeating a DEFINE group is not allowed\0" /** DEAD **/
492 "inconsistent NEWLINE options\0"
493 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
494 "a numbered reference must not be zero\0"
495 "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
496 /* 60 */
497 "(*VERB) not recognized or malformed\0"
498 "number is too big\0"
499 "subpattern name expected\0"
500 "digit expected after (?+\0"
501 "] is an invalid data character in JavaScript compatibility mode\0"
502 /* 65 */
503 "different names for subpatterns of the same number are not allowed\0"
504 "(*MARK) must have an argument\0"
505 "this version of PCRE is not compiled with Unicode property support\0"
506 "\\c must be followed by an ASCII character\0"
507 "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
508 /* 70 */
509 "internal error: unknown opcode in find_fixedlength()\0"
510 "\\N is not supported in a class\0"
511 "too many forward references\0"
512 "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
513 "invalid UTF-16 string\0"
514 /* 75 */
515 "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
516 "character value in \\u.... sequence is too large\0"
517 "invalid UTF-32 string\0"
518 "setting UTF is disabled by the application\0"
519 "non-hex character in \\x{} (closing brace missing?)\0"
520 /* 80 */
521 "non-octal character in \\o{} (closing brace missing?)\0"
522 "missing opening brace after \\o\0"
523 ;
524
525 /* Table to identify digits and hex digits. This is used when compiling
526 patterns. Note that the tables in chartables are dependent on the locale, and
527 may mark arbitrary characters as digits - but the PCRE compiling code expects
528 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
529 a private table here. It costs 256 bytes, but it is a lot faster than doing
530 character value tests (at least in some simple cases I timed), and in some
531 applications one wants PCRE to compile efficiently as well as match
532 efficiently.
533
534 For convenience, we use the same bit definitions as in chartables:
535
536 0x04 decimal digit
537 0x08 hexadecimal digit
538
539 Then we can use ctype_digit and ctype_xdigit in the code. */
540
541 /* Using a simple comparison for decimal numbers rather than a memory read
542 is much faster, and the resulting code is simpler (the compiler turns it
543 into a subtraction and unsigned comparison). */
544
/* Yields 1 when x lies in the range CHAR_0 to CHAR_9 inclusive and 0
otherwise. The argument may be evaluated twice, so it must be free of side
effects. */
#define IS_DIGIT(x) (!((x) < CHAR_0 || (x) > CHAR_9))
546
547 #ifndef EBCDIC
548
549 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
550 UTF-8 mode. */
551
/* 256 entries, eight per row, indexed by character code. 0x0c marks a
character that is both a decimal and a hexadecimal digit (0x04 | 0x08); 0x08
marks one that is a hexadecimal digit only. */
552 static const pcre_uint8 digitab[] =
553 {
554 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
555 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
556 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
557 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
558 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* space - ' */
559 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
560 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
561 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
562 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
563 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
564 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
565 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
566 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
567 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
568 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
569 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
570 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
571 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
572 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
573 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
574 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
575 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
576 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
577 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
578 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
579 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
580 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
581 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
582 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
583 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
584 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
585 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
586
587 #else
588
589 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
590
/* EBCDIC variant: 256 entries, eight per row, indexed by character code. 0x0c
marks a character that is both a decimal and a hexadecimal digit (0x04 | 0x08);
0x08 marks one that is a hexadecimal digit only. The last column of the row
comments gives the hexadecimal code of every second row. */
591 static const pcre_uint8 digitab[] =
592 {
593 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
594 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
595 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
596 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
597 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
598 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
599 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
600 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* space - 71 40 */
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
603 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
604 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
605 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
606 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
607 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
608 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
609 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
610 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
611 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
612 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
613 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
614 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
615 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
616 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
617 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
618 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
619 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
620 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
621 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
622 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
623 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
624 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
625
/* 256 entries, eight per row, indexed by EBCDIC character code. Only compiled
when EBCDIC is defined. NOTE(review): the bit values presumably follow the
ctype flag definitions used by the chartables (0x04 decimal digit and 0x08
hexadecimal digit are stated for digitab; the meanings of 0x01, 0x02, 0x10 and
0x80 are not visible here) - confirm against pcre_internal.h. */
626 static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
627 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
628 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
629 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
630 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
631 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
632 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
633 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
634 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
635 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* space - 71 */
636 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
637 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
638 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
639 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
640 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
641 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
642 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
643 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
644 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
645 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
646 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
647 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
648 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
649 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
650 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
651 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
652 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
653 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
654 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
655 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
656 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
657 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
658 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
659 #endif
660
661
662 /* This table is used to check whether auto-possessification is possible
663 between adjacent character-type opcodes. The left-hand (repeated) opcode is
664 used to select the row, and the right-hand opcode is used to select the column.
665 A value of 1 means that auto-possessification is OK. For example, the second
666 value in the first row means that \D+\d can be turned into \D++\d.
667
668 The Unicode property types (\P and \p) have to be present to fill out the table
669 because of what their opcode values are, but the table values should always be
670 zero because property types are handled separately in the code. The last four
671 columns apply to items that cannot be repeated, so there is no need to have
672 rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
673 *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
674
/* Table dimensions, derived from the opcode numbering: one row per opcode from
FIRST_AUTOTAB_OP to LAST_AUTOTAB_LEFT_OP, and one column per opcode from
FIRST_AUTOTAB_OP to LAST_AUTOTAB_RIGHT_OP. */
675 #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
676 #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
677
/* The initializer below supplies 17 rows of 21 values each. The column heading
names the right-hand item and the comment at the end of each row names the
left-hand (repeated) item. */
678 static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
679 /* \D \d \S \s \W \w . .+ \C \P \p \R \H \h \V \v \X \Z \z $ $M */
680 { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \D */
681 { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \d */
682 { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \S */
683 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \s */
684 { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \W */
685 { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \w */
686 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* . */
687 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* .+ */
688 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \C */
689 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \P */
690 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \p */
691 { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \R */
692 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \H */
693 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \h */
694 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \V */
695 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 }, /* \v */
696 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 } /* \X */
697 };
698
699
700 /* This table is used to check whether auto-possessification is possible
701 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
702 left-hand (repeated) opcode is used to select the row, and the right-hand
703 opcode is used to select the column. The values are as follows:
704
705 0 Always return FALSE (never auto-possessify)
706 1 Character groups are distinct (possessify if both are OP_PROP)
707 2 Check character categories in the same group (general or particular)
708 3 TRUE if the two opcodes are not the same (PROP vs NOTPROP)
709
710 4 Check left general category vs right particular category
711 5 Check right general category vs left particular category
712
713 6 Left alphanum vs right general category
714 7 Left space vs right general category
715 8 Left word vs right general category
716
717 9 Right alphanum vs left general category
718 10 Right space vs left general category
719 11 Right word vs left general category
720
721 12 Left alphanum vs right particular category
722 13 Left space vs right particular category
723 14 Left word vs right particular category
724
725 15 Right alphanum vs left particular category
726 16 Right space vs left particular category
727 17 Right word vs left particular category
728 */
729
730 static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
731 /* ANY LAMP GC PC SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
732 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_ANY */
733 { 0, 3, 0, 0, 0, 3, 1, 1, 0, 0, 0 }, /* PT_LAMP */
734 { 0, 0, 2, 4, 0, 9, 10, 10, 11, 0, 0 }, /* PT_GC */
735 { 0, 0, 5, 2, 0, 15, 16, 16, 17, 0, 0 }, /* PT_PC */
736 { 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, /* PT_SC */
737 { 0, 3, 6, 12, 0, 3, 1, 1, 0, 0, 0 }, /* PT_ALNUM */
738 { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_SPACE */
739 { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_PXSPACE */
740 { 0, 0, 8, 14, 0, 0, 1, 1, 3, 0, 0 }, /* PT_WORD */
741 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */
742 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 } /* PT_UCNC */
743 };
744
745 /* This table is used to check whether auto-possessification is possible
746 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
747 specifies a general category and the other specifies a particular category. The
748 row is selected by the general category and the column by the particular
749 category. The value is 1 if the particular category is not part of the general
750 category. */
751
752 static const pcre_uint8 catposstab[7][30] = {
753 /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
754 { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* C */
755 { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* L */
756 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* M */
757 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* N */
758 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 }, /* P */
759 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 }, /* S */
760 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 } /* Z */
761 };
762
763 /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
764 a general or particular category. The properties in each row are those
765 that apply to the character set in question. Duplication means that a little
766 unnecessary work is done when checking, but this keeps things much simpler
767 because they can all use the same code. For more details see the comment where
768 this table is used.
769
770 Note: SPACE and PXSPACE used to be different because Perl excluded VT from
771 "space", but from Perl 5.18 it's included, so both categories are treated the
772 same here. */
773
774 static const pcre_uint8 posspropstab[3][4] = {
775 { ucp_L, ucp_N, ucp_N, ucp_Nl }, /* ALNUM, 3rd and 4th values redundant */
776 { ucp_Z, ucp_Z, ucp_C, ucp_Cc }, /* SPACE and PXSPACE, 2nd value redundant */
777 { ucp_L, ucp_N, ucp_P, ucp_Po } /* WORD */
778 };
779
780
781
782 /*************************************************
783 * Find an error text *
784 *************************************************/
785
786 /* The error texts are now all in one long string, to save on relocations. As
787 some of the text is of unknown length, we can't use a table of offsets.
788 Instead, just count through the strings. This is not a performance issue
789 because it happens only when there has been a compilation error.
790
791 Argument: the error number
792 Returns: pointer to the error string
793 */
794
795 static const char *
796 find_error_text(int n)
797 {
798 const char *s = error_texts;
799 for (; n > 0; n--)
800 {
801 while (*s++ != CHAR_NULL) {};
802 if (*s == CHAR_NULL) return "Error text not found (please report)";
803 }
804 return s;
805 }
806
807
808
809 /*************************************************
810 * Expand the workspace *
811 *************************************************/
812
813 /* This function is called during the second compiling phase, if the number of
814 forward references fills the existing workspace, which is originally a block on
815 the stack. A larger block is obtained from malloc() unless the ultimate limit
816 has been reached or the increase will be rather small.
817
818 Argument: pointer to the compile data block
819 Returns: 0 if all went well, else an error number
820 */
821
822 static int
823 expand_workspace(compile_data *cd)
824 {
825 pcre_uchar *newspace;
826 int newsize = cd->workspace_size * 2;
827
828 if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
829 if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
830 newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
831 return ERR72;
832
833 newspace = (PUBL(malloc))(IN_UCHARS(newsize));
834 if (newspace == NULL) return ERR21;
835 memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
836 cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
837 if (cd->workspace_size > COMPILE_WORK_SIZE)
838 (PUBL(free))((void *)cd->start_workspace);
839 cd->start_workspace = newspace;
840 cd->workspace_size = newsize;
841 return 0;
842 }
843
844
845
846 /*************************************************
847 * Check for counted repeat *
848 *************************************************/
849
850 /* This function is called when a '{' is encountered in a place where it might
851 start a quantifier. It looks ahead to see if it really is a quantifier or not.
852 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
853 where the ddds are digits.
854
855 Arguments:
856 p pointer to the first char after '{'
857
858 Returns: TRUE or FALSE
859 */
860
861 static BOOL
862 is_counted_repeat(const pcre_uchar *p)
863 {
864 if (!IS_DIGIT(*p)) return FALSE;
865 p++;
866 while (IS_DIGIT(*p)) p++;
867 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
868
869 if (*p++ != CHAR_COMMA) return FALSE;
870 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
871
872 if (!IS_DIGIT(*p)) return FALSE;
873 p++;
874 while (IS_DIGIT(*p)) p++;
875
876 return (*p == CHAR_RIGHT_CURLY_BRACKET);
877 }
878
879
880
881 /*************************************************
882 * Handle escapes *
883 *************************************************/
884
885 /* This function is called when a \ has been encountered. It either returns a
886 positive value for a simple escape such as \n, or 0 for a data character which
887 will be placed in chptr. A backreference to group n is returned as negative n.
888 When UTF-8 is enabled, a positive value greater than 255 may be returned in
889 chptr. On entry, ptr is pointing at the \. On exit, it is on the final
890 character of the escape sequence.
891
892 Arguments:
893 ptrptr points to the pattern position pointer
894 chptr points to a returned data character
895 errorcodeptr points to the errorcode variable
896 bracount number of previous extracting brackets
897 options the options bits
898 isclass TRUE if inside a character class
899
900 Returns: zero => a data character
901 positive => a special escape sequence
902 negative => a back reference
903 on error, errorcodeptr is set
904 */
/* Implementation notes, derived from the code below:

(1) The first lookup uses the escapes[] table. An entry greater than zero is
    treated as a literal character value: it replaces c (and so ends up in
    *chptr) and the function returns zero. A negative entry is a negated
    escape code, which is returned as a positive value.

(2) *ptrptr and *chptr are written unconditionally just before returning, so
    they are updated even when *errorcodeptr has been set.

(3) The only options bits tested here are PCRE_UTF8, PCRE_JAVASCRIPT_COMPAT,
    PCRE_EXTRA and PCRE_UCP. */
905
906 static int
907 check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
908 int bracount, int options, BOOL isclass)
909 {
910 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
911 BOOL utf = (options & PCRE_UTF8) != 0;
912 const pcre_uchar *ptr = *ptrptr + 1;
913 pcre_uint32 c;
914 int escape = 0;
915 int i;
916
917 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
918 ptr--; /* Set pointer back to the last byte */
919
920 /* If backslash is at the end of the pattern, it's an error. */
921
922 if (c == CHAR_NULL) *errorcodeptr = ERR1;
923
924 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
925 in a table. A non-zero result is something that can be returned immediately.
926 Otherwise further processing may be required. */
927
928 #ifndef EBCDIC /* ASCII/UTF-8 coding */
929 /* Not alphanumeric */
930 else if (c < CHAR_0 || c > CHAR_z) {}
931 else if ((i = escapes[c - CHAR_0]) != 0)
932 { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
933
934 #else /* EBCDIC coding */
935 /* Not alphanumeric */
936 else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
937 else if ((i = escapes[c - 0x48]) != 0) { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
938 #endif
939
940 /* Escapes that need further processing, or are illegal. */
/* NOTE: this branch is reached only after the table lookup above has assigned
i and found it to be zero. The "while (i++ < 2 ...)" loops in the octal (case
CHAR_0) and Perl-style \xhh code below depend on i still being zero when they
start; no path that reaches them modifies i in between. */
941
942 else
943 {
944 const pcre_uchar *oldptr;
945 BOOL braced, negated, overflow;
946 int s;
947
948 switch (c)
949 {
950 /* A number of Perl escapes are not handled by PCRE. We give an explicit
951 error. */
952
953 case CHAR_l:
954 case CHAR_L:
955 *errorcodeptr = ERR37;
956 break;
957
958 case CHAR_u:
959 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
960 {
961 /* In JavaScript, \u must be followed by four hexadecimal numbers.
962 Otherwise it is a lowercase u letter. */
963 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
964 && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
965 && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
966 && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
967 {
968 c = 0;
969 for (i = 0; i < 4; ++i)
970 {
971 register pcre_uint32 cc = *(++ptr);
972 #ifndef EBCDIC /* ASCII/UTF-8 coding */
973 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
974 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
975 #else /* EBCDIC coding */
976 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
977 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
978 #endif
979 }
980
/* Reject a value that is too large for the current code unit width and mode;
in a UTF mode also reject the surrogate range 0xd800-0xdfff. */
981 #if defined COMPILE_PCRE8
982 if (c > (utf ? 0x10ffffU : 0xffU))
983 #elif defined COMPILE_PCRE16
984 if (c > (utf ? 0x10ffffU : 0xffffU))
985 #elif defined COMPILE_PCRE32
986 if (utf && c > 0x10ffffU)
987 #endif
988 {
989 *errorcodeptr = ERR76;
990 }
991 else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
992 }
993 }
994 else
995 *errorcodeptr = ERR37;
996 break;
997
998 case CHAR_U:
999 /* In JavaScript, \U is an uppercase U letter. */
1000 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
1001 break;
1002
1003 /* In a character class, \g is just a literal "g". Outside a character
1004 class, \g must be followed by one of a number of specific things:
1005
1006 (1) A number, either plain or braced. If positive, it is an absolute
1007 backreference. If negative, it is a relative backreference. This is a Perl
1008 5.10 feature.
1009
1010 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1011 is part of Perl's movement towards a unified syntax for back references. As
1012 this is synonymous with \k{name}, we fudge it up by pretending it really
1013 was \k.
1014
1015 (3) For Oniguruma compatibility we also support \g followed by a name or a
1016 number either in angle brackets or in single quotes. However, these are
1017 (possibly recursive) subroutine calls, _not_ backreferences. Just return
1018 the ESC_g code (cf \k). */
1019
1020 case CHAR_g:
1021 if (isclass) break;
1022 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
1023 {
1024 escape = ESC_g;
1025 break;
1026 }
1027
1028 /* Handle the Perl-compatible cases */
1029
1030 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1031 {
/* Scan the braced text: if anything other than '-' or a digit appears before
the closing brace, treat the item as a name and return ESC_k. ptr is left
pointing at the 'g' in that case. */
1032 const pcre_uchar *p;
1033 for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
1034 if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
1035 if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
1036 {
1037 escape = ESC_k;
1038 break;
1039 }
1040 braced = TRUE;
1041 ptr++;
1042 }
1043 else braced = FALSE;
1044
1045 if (ptr[1] == CHAR_MINUS)
1046 {
1047 negated = TRUE;
1048 ptr++;
1049 }
1050 else negated = FALSE;
1051
1052 /* The integer range is limited by the machine's int representation. */
/* The guard below is tested before each multiplication: while s is no greater
than INT_MAX/10 - 1, s * 10 + 9 cannot exceed INT_MAX. */
1053 s = 0;
1054 overflow = FALSE;
1055 while (IS_DIGIT(ptr[1]))
1056 {
1057 if (s > INT_MAX / 10 - 1) /* Integer overflow */
1058 {
1059 overflow = TRUE;
1060 break;
1061 }
1062 s = s * 10 + (int)(*(++ptr) - CHAR_0);
1063 }
1064 if (overflow) /* Integer overflow */
1065 {
1066 while (IS_DIGIT(ptr[1]))
1067 ptr++;
1068 *errorcodeptr = ERR61;
1069 break;
1070 }
1071
1072 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
1073 {
1074 *errorcodeptr = ERR57;
1075 break;
1076 }
1077
1078 if (s == 0)
1079 {
1080 *errorcodeptr = ERR58;
1081 break;
1082 }
1083
/* A relative reference counts back from the number of groups opened so far:
-1 maps to group bracount, -2 to bracount - 1, and so on. */
1084 if (negated)
1085 {
1086 if (s > bracount)
1087 {
1088 *errorcodeptr = ERR15;
1089 break;
1090 }
1091 s = bracount - (s - 1);
1092 }
1093
1094 escape = -s;
1095 break;
1096
1097 /* The handling of escape sequences consisting of a string of digits
1098 starting with one that is not zero is not straightforward. Perl has changed
1099 over the years. Nowadays \g{} for backreferences and \o{} for octal are
1100 recommended to avoid the ambiguities in the old syntax.
1101
1102 Outside a character class, the digits are read as a decimal number. If the
1103 number is less than 8 (used to be 10), or if there are that many previous
1104 extracting left brackets, then it is a back reference. Otherwise, up to
1105 three octal digits are read to form an escaped byte. Thus \123 is likely to
1106 be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1107 the octal value is greater than 377, the least significant 8 bits are
1108 taken. \8 and \9 are treated as the literal characters 8 and 9.
1109
1110 Inside a character class, \ followed by a digit is always either a literal
1111 8 or 9 or an octal number. */
1112
1113 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1114 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1115
1116 if (!isclass)
1117 {
1118 oldptr = ptr;
1119 /* The integer range is limited by the machine's int representation. */
1120 s = (int)(c -CHAR_0);
1121 overflow = FALSE;
1122 while (IS_DIGIT(ptr[1]))
1123 {
1124 if (s > INT_MAX / 10 - 1) /* Integer overflow */
1125 {
1126 overflow = TRUE;
1127 break;
1128 }
1129 s = s * 10 + (int)(*(++ptr) - CHAR_0);
1130 }
1131 if (overflow) /* Integer overflow */
1132 {
1133 while (IS_DIGIT(ptr[1]))
1134 ptr++;
1135 *errorcodeptr = ERR61;
1136 break;
1137 }
1138 if (s < 8 || s <= bracount) /* Check for back reference */
1139 {
1140 escape = -s;
1141 break;
1142 }
1143 ptr = oldptr; /* Put the pointer back and fall through */
1144 }
1145
1146 /* Handle a digit following \ when the number is not a back reference. If
1147 the first digit is 8 or 9, Perl used to generate a binary zero byte and
1148 then treat the digit as a following literal. At least by Perl 5.18 this
1149 changed so as not to insert the binary zero. */
1150
1151 if ((c = *ptr) >= CHAR_8) break;
1152
1153 /* Fall through with a digit less than 8 */
1154
1155 /* \0 always starts an octal number, but we may drop through to here with a
1156 larger first octal digit. The original code used just to take the least
1157 significant 8 bits of octal numbers (I think this is what early Perls used
1158 to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1159 but no more than 3 octal digits. */
1160
/* i is zero on arrival here (directly, or by falling through from the case
above), so the loop reads at most two digits after the first one. */
1161 case CHAR_0:
1162 c -= CHAR_0;
1163 while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1164 c = c * 8 + *(++ptr) - CHAR_0;
1165 #ifdef COMPILE_PCRE8
1166 if (!utf && c > 0xff) *errorcodeptr = ERR51;
1167 #endif
1168 break;
1169
1170 /* \o is a relatively new Perl feature, supporting a more general way of
1171 specifying character codes in octal. The only supported form is \o{ddd}. */
1172
1173 case CHAR_o:
1174 if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
1175 {
1176 ptr += 2;
1177 c = 0;
1178 overflow = FALSE;
1179 while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
1180 {
1181 register pcre_uint32 cc = *ptr++;
1182 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1183 #ifdef COMPILE_PCRE32
1184 if (c >= 0x20000000l) { overflow = TRUE; break; }
1185 #endif
1186 c = (c << 3) + cc - CHAR_0 ;
1187 #if defined COMPILE_PCRE8
1188 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1189 #elif defined COMPILE_PCRE16
1190 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1191 #elif defined COMPILE_PCRE32
1192 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1193 #endif
1194 }
1195 if (overflow)
1196 {
1197 while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1198 *errorcodeptr = ERR34;
1199 }
1200 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1201 {
1202 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1203 }
1204 else *errorcodeptr = ERR80;
1205 }
1206 break;
1207
1208 /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
1209 numbers. Otherwise it is a lowercase x letter. */
1210
1211 case CHAR_x:
1212 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1213 {
1214 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1215 && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1216 {
1217 c = 0;
1218 for (i = 0; i < 2; ++i)
1219 {
1220 register pcre_uint32 cc = *(++ptr);
1221 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1222 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1223 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1224 #else /* EBCDIC coding */
1225 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1226 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1227 #endif
1228 }
1229 }
1230 } /* End JavaScript handling */
1231
1232 /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1233 greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1234 digits. If not, { used to be treated as a data character. However, Perl
1235 seems to read hex digits up to the first non-such, and ignore the rest, so
1236 that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1237 now gives an error. */
1238
1239 else
1240 {
1241 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1242 {
1243 ptr += 2;
1244 c = 0;
1245 overflow = FALSE;
1246 while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
1247 {
1248 register pcre_uint32 cc = *ptr++;
1249 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1250
1251 #ifdef COMPILE_PCRE32
1252 if (c >= 0x10000000l) { overflow = TRUE; break; }
1253 #endif
1254
1255 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1256 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1257 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1258 #else /* EBCDIC coding */
1259 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1260 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1261 #endif
1262
1263 #if defined COMPILE_PCRE8
1264 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1265 #elif defined COMPILE_PCRE16
1266 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1267 #elif defined COMPILE_PCRE32
1268 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1269 #endif
1270 }
1271
1272 if (overflow)
1273 {
1274 while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
1275 *errorcodeptr = ERR34;
1276 }
1277
1278 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1279 {
1280 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1281 }
1282
1283 /* If the sequence of hex digits does not end with '}', give an error.
1284 We used just to recognize this construct and fall through to the normal
1285 \x handling, but nowadays Perl gives an error, which seems much more
1286 sensible, so we do too. */
1287
1288 else *errorcodeptr = ERR79;
1289 } /* End of \x{} processing */
1290
1291 /* Read a single-byte hex-defined char (up to two hex digits after \x) */
1292
1293 else
1294 {
/* i is zero on arrival here, so the i++ < 2 test limits the loop to two hex
digits. */
1295 c = 0;
1296 while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1297 {
1298 pcre_uint32 cc; /* Some compilers don't like */
1299 cc = *(++ptr); /* ++ in initializers */
1300 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1301 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1302 c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1303 #else /* EBCDIC coding */
1304 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
1305 c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1306 #endif
1307 }
1308 } /* End of \xdd handling */
1309 } /* End of Perl-style \x handling */
1310 break;
1311
1312 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1313 An error is given if the byte following \c is not an ASCII character. This
1314 coding is ASCII-specific, but then the whole concept of \cx is
1315 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1316
1317 case CHAR_c:
1318 c = *(++ptr);
1319 if (c == CHAR_NULL)
1320 {
1321 *errorcodeptr = ERR2;
1322 break;
1323 }
1324 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1325 if (c > 127) /* Excludes all non-ASCII in either mode */
1326 {
1327 *errorcodeptr = ERR68;
1328 break;
1329 }
1330 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1331 c ^= 0x40;
1332 #else /* EBCDIC coding */
1333 if (c >= CHAR_a && c <= CHAR_z) c += 64;
1334 c ^= 0xC0;
1335 #endif
1336 break;
1337
1338 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1339 other alphanumeric following \ is an error if PCRE_EXTRA was set;
1340 otherwise, for Perl compatibility, it is a literal. This code looks a bit
1341 odd, but there used to be some cases other than the default, and there may
1342 be again in future, so I haven't "optimized" it. */
1343
1344 default:
1345 if ((options & PCRE_EXTRA) != 0) switch(c)
1346 {
1347 default:
1348 *errorcodeptr = ERR3;
1349 break;
1350 }
1351 break;
1352 }
1353 }
1354
1355 /* Perl supports \N{name} for character names, as well as plain \N for "not
1356 newline". PCRE does not support \N{name}. However, it does support
1357 quantification such as \N{2,3}. */
1358
1359 if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1360 !is_counted_repeat(ptr+2))
1361 *errorcodeptr = ERR37;
1362
1363 /* If PCRE_UCP is set, we change the values for \d etc. */
1364
1365 if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
1366 escape += (ESC_DU - ESC_D);
1367
1368 /* Set the pointer to the final character before returning. */
1369
1370 *ptrptr = ptr;
1371 *chptr = c;
1372 return escape;
1373 }
1374
1375
1376
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Called after \P or \p has been seen, and only when PCRE has been built with
Unicode property support. On entry *ptrptr addresses the P or p; on exit it
addresses the last character of the escape sequence.

The property name is either the single character that follows, or a name
enclosed in {}, which may begin with ^ to request negation. The name is looked
up in the table of property names by binary chop.

Argument:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  ptypeptr       points to an unsigned int that is set to the type value
  pdataptr       points to an unsigned int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         TRUE if the type value was found, or FALSE for an invalid type
                   (ERR46 for a malformed sequence, ERR47 for an unknown name)
*/

static BOOL
get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
  unsigned int *pdataptr, int *errorcodeptr)
{
const pcre_uchar *ptr = *ptrptr;
pcre_uchar name[32];
pcre_uchar c;
int lo, hi;

c = *(++ptr);
if (c == CHAR_NULL) goto ERROR_RETURN;

*negptr = FALSE;

/* The short form: exactly one character names the property. */

if (c != CHAR_LEFT_CURLY_BRACKET)
  {
  name[0] = c;
  name[1] = 0;
  }

/* The braced form: an optional ^ and then a name that must be terminated by }
before the name buffer (less its terminator) is full. */

else
  {
  const int maxlen = (int)(sizeof(name) / sizeof(pcre_uchar)) - 1;
  int len = 0;

  if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
    {
    *negptr = TRUE;
    ptr++;
    }

  while (len < maxlen)
    {
    c = *(++ptr);
    if (c == CHAR_NULL) goto ERROR_RETURN;
    if (c == CHAR_RIGHT_CURLY_BRACKET) break;
    name[len++] = c;
    }

  if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
  name[len] = 0;
  }

/* The sequence is well formed, so pass back the updated position. It stays
the same whether or not the name turns out to be known. */

*ptrptr = ptr;

/* Binary chop over the table of recognized property names. */

lo = 0;
hi = PRIV(utt_size);

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[mid].name_offset);

  if (cmp > 0)
    lo = mid + 1;
  else if (cmp < 0)
    hi = mid;
  else
    {
    *ptypeptr = PRIV(utt)[mid].type;
    *pdataptr = PRIV(utt)[mid].value;
    return TRUE;
    }
  }

/* No such property name. */

*errorcodeptr = ERR47;
return FALSE;

/* Malformed sequence: report the position reached. */

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return FALSE;
}
#endif
1471
1472
1473
1474 /*************************************************
1475 * Read repeat counts *
1476 *************************************************/
1477
1478 /* Read an item of the form {n,m} and return the values. This is called only
1479 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1480 so the syntax is guaranteed to be correct, but we need to check the values.
1481
1482 Arguments:
1483 p pointer to first char after '{'
1484 minp pointer to int for min
1485 maxp pointer to int for max
1486 returned as -1 if no max
1487 errorcodeptr points to error code variable
1488
1489 Returns: pointer to '}' on success;
1490 current ptr on error, with errorcodeptr set non-zero
1491 */
1492
1493 static const pcre_uchar *
1494 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1495 {
1496 int min = 0;
1497 int max = -1;
1498
1499 /* Read the minimum value and do a paranoid check: a negative value indicates
1500 an integer overflow. */
1501
1502 while (IS_DIGIT(*p)) min = min * 10 + (int)(*p++ - CHAR_0);
1503 if (min < 0 || min > 65535)
1504 {
1505 *errorcodeptr = ERR5;
1506 return p;
1507 }
1508
1509 /* Read the maximum value if there is one, and again do a paranoid on its size.
1510 Also, max must not be less than min. */
1511
1512 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1513 {
1514 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1515 {
1516 max = 0;
1517 while(IS_DIGIT(*p)) max = max * 10 + (int)(*p++ - CHAR_0);
1518 if (max < 0 || max > 65535)
1519 {
1520 *errorcodeptr = ERR5;
1521 return p;
1522 }
1523 if (max < min)
1524 {
1525 *errorcodeptr = ERR4;
1526 return p;
1527 }
1528 }
1529 }
1530
1531 /* Fill in the required variables, and pass back the pointer to the terminating
1532 '}'. */
1533
1534 *minp = min;
1535 *maxp = max;
1536 return p;
1537 }
1538
1539
1540
1541 /*************************************************
1542 * Find first significant op code *
1543 *************************************************/
1544
1545 /* This is called by several functions that scan a compiled expression looking
1546 for a fixed first character, or an anchoring op code etc. It skips over things
1547 that do not influence this. For some calls, it makes sense to skip negative
1548 forward and all backward assertions, and also the \b assertion; for others it
1549 does not.
1550
1551 Arguments:
1552 code pointer to the start of the group
1553 skipassert TRUE if certain assertions are to be skipped
1554
1555 Returns: pointer to the first significant opcode
1556 */
1557
1558 static const pcre_uchar*
1559 first_significant_code(const pcre_uchar *code, BOOL skipassert)
1560 {
1561 for (;;)
1562 {
1563 switch ((int)*code)
1564 {
1565 case OP_ASSERT_NOT:
1566 case OP_ASSERTBACK:
1567 case OP_ASSERTBACK_NOT:
1568 if (!skipassert) return code;
1569 do code += GET(code, 1); while (*code == OP_ALT);
1570 code += PRIV(OP_lengths)[*code];
1571 break;
1572
1573 case OP_WORD_BOUNDARY:
1574 case OP_NOT_WORD_BOUNDARY:
1575 if (!skipassert) return code;
1576 /* Fall through */
1577
1578 case OP_CALLOUT:
1579 case OP_CREF:
1580 case OP_DNCREF:
1581 case OP_RREF:
1582 case OP_DNRREF:
1583 case OP_DEF:
1584 code += PRIV(OP_lengths)[*code];
1585 break;
1586
1587 default:
1588 return code;
1589 }
1590 }
1591 /* Control never reaches here */
1592 }
1593
1594
1595
1596 /*************************************************
1597 * Find the fixed length of a branch *
1598 *************************************************/
1599
1600 /* Scan a branch and compute the fixed length of subject that will match it,
1601 if the length is fixed. This is needed for dealing with backward assertions.
1602 In UTF8 mode, the result is in characters rather than bytes. The branch is
1603 temporarily terminated with OP_END when this function is called.
1604
1605 This function is called when a backward assertion is encountered, so that if it
1606 fails, the error message can point to the correct place in the pattern.
1607 However, we cannot do this when the assertion contains subroutine calls,
1608 because they can be forward references. We solve this by remembering this case
1609 and doing the check at the end; a flag specifies which mode we are running in.
1610
1611 Arguments:
1612 code points to the start of the pattern (the bracket)
1613 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
1614 atend TRUE if called when the pattern is complete
1615 cd the "compile data" structure
1616
1617 Returns: the fixed length,
1618 or -1 if there is no fixed length,
1619 or -2 if \C was encountered (in UTF-8 mode only)
1620 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1621 or -4 if an unknown opcode was encountered (internal error)
1622 */
1623
1624 static int
1625 find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
1626 {
1627 int length = -1;
1628
1629 register int branchlength = 0;
1630 register pcre_uchar *cc = code + 1 + LINK_SIZE;
1631
1632 /* Scan along the opcodes for this branch. If we get to the end of the
1633 branch, check the length against that of the other branches. */
1634
1635 for (;;)
1636 {
1637 int d;
1638 pcre_uchar *ce, *cs;
1639 register pcre_uchar op = *cc;
1640
1641 switch (op)
1642 {
1643 /* We only need to continue for OP_CBRA (normal capturing bracket) and
1644 OP_BRA (normal non-capturing bracket) because the other variants of these
1645 opcodes are all concerned with unlimited repeated groups, which of course
1646 are not of fixed length. */
1647
1648 case OP_CBRA:
1649 case OP_BRA:
1650 case OP_ONCE:
1651 case OP_ONCE_NC:
1652 case OP_COND:
1653 d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
1654 if (d < 0) return d;
1655 branchlength += d;
1656 do cc += GET(cc, 1); while (*cc == OP_ALT);
1657 cc += 1 + LINK_SIZE;
1658 break;
1659
1660 /* Reached end of a branch; if it's a ket it is the end of a nested call.
1661 If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1662 an ALT. If it is END it's the end of the outer call. All can be handled by
1663 the same code. Note that we must not include the OP_KETRxxx opcodes here,
1664 because they all imply an unlimited repeat. */
1665
1666 case OP_ALT:
1667 case OP_KET:
1668 case OP_END:
1669 case OP_ACCEPT:
1670 case OP_ASSERT_ACCEPT:
1671 if (length < 0) length = branchlength;
1672 else if (length != branchlength) return -1;
1673 if (*cc != OP_ALT) return length;
1674 cc += 1 + LINK_SIZE;
1675 branchlength = 0;
1676 break;
1677
1678 /* A true recursion implies not fixed length, but a subroutine call may
1679 be OK. If the subroutine is a forward reference, we can't deal with
1680 it until the end of the pattern, so return -3. */
1681
1682 case OP_RECURSE:
1683 if (!atend) return -3;
1684 cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1685 do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1686 if (cc > cs && cc < ce) return -1; /* Recursion */
1687 d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
1688 if (d < 0) return d;
1689 branchlength += d;
1690 cc += 1 + LINK_SIZE;
1691 break;
1692
1693 /* Skip over assertive subpatterns */
1694
1695 case OP_ASSERT:
1696 case OP_ASSERT_NOT:
1697 case OP_ASSERTBACK:
1698 case OP_ASSERTBACK_NOT:
1699 do cc += GET(cc, 1); while (*cc == OP_ALT);
1700 cc += PRIV(OP_lengths)[*cc];
1701 break;
1702
1703 /* Skip over things that don't match chars */
1704
1705 case OP_MARK:
1706 case OP_PRUNE_ARG:
1707 case OP_SKIP_ARG:
1708 case OP_THEN_ARG:
1709 cc += cc[1] + PRIV(OP_lengths)[*cc];
1710 break;
1711
1712 case OP_CALLOUT:
1713 case OP_CIRC:
1714 case OP_CIRCM:
1715 case OP_CLOSE:
1716 case OP_COMMIT:
1717 case OP_CREF:
1718 case OP_DEF:
1719 case OP_DNCREF:
1720 case OP_DNRREF:
1721 case OP_DOLL:
1722 case OP_DOLLM:
1723 case OP_EOD:
1724 case OP_EODN:
1725 case OP_FAIL:
1726 case OP_NOT_WORD_BOUNDARY:
1727 case OP_PRUNE:
1728 case OP_REVERSE:
1729 case OP_RREF:
1730 case OP_SET_SOM:
1731 case OP_SKIP:
1732 case OP_SOD:
1733 case OP_SOM:
1734 case OP_THEN:
1735 case OP_WORD_BOUNDARY:
1736 cc += PRIV(OP_lengths)[*cc];
1737 break;
1738
1739 /* Handle literal characters */
1740
1741 case OP_CHAR:
1742 case OP_CHARI:
1743 case OP_NOT:
1744 case OP_NOTI:
1745 branchlength++;
1746 cc += 2;
1747 #ifdef SUPPORT_UTF
1748 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1749 #endif
1750 break;
1751
1752 /* Handle exact repetitions. The count is already in characters, but we
1753 need to skip over a multibyte character in UTF8 mode. */
1754
1755 case OP_EXACT:
1756 case OP_EXACTI:
1757 case OP_NOTEXACT:
1758 case OP_NOTEXACTI:
1759 branchlength += (int)GET2(cc,1);
1760 cc += 2 + IMM2_SIZE;
1761 #ifdef SUPPORT_UTF
1762 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1763 #endif
1764 break;
1765
1766 case OP_TYPEEXACT:
1767 branchlength += GET2(cc,1);
1768 if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1769 cc += 2;
1770 cc += 1 + IMM2_SIZE + 1;
1771 break;
1772
1773 /* Handle single-char matchers */
1774
1775 case OP_PROP:
1776 case OP_NOTPROP:
1777 cc += 2;
1778 /* Fall through */
1779
1780 case OP_HSPACE:
1781 case OP_VSPACE:
1782 case OP_NOT_HSPACE:
1783 case OP_NOT_VSPACE:
1784 case OP_NOT_DIGIT:
1785 case OP_DIGIT:
1786 case OP_NOT_WHITESPACE:
1787 case OP_WHITESPACE:
1788 case OP_NOT_WORDCHAR:
1789 case OP_WORDCHAR:
1790 case OP_ANY:
1791 case OP_ALLANY:
1792 branchlength++;
1793 cc++;
1794 break;
1795
1796 /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1797 otherwise \C is coded as OP_ALLANY. */
1798
1799 case OP_ANYBYTE:
1800 return -2;
1801
1802 /* Check a class for variable quantification */
1803
1804 case OP_CLASS:
1805 case OP_NCLASS:
1806 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1807 case OP_XCLASS:
1808 /* The original code caused an unsigned overflow in 64 bit systems,
1809 so now we use a conditional statement. */
1810 if (op == OP_XCLASS)
1811 cc += GET(cc, 1);
1812 else
1813 cc += PRIV(OP_lengths)[OP_CLASS];
1814 #else
1815 cc += PRIV(OP_lengths)[OP_CLASS];
1816 #endif
1817
1818 switch (*cc)
1819 {
1820 case OP_CRPLUS:
1821 case OP_CRMINPLUS:
1822 case OP_CRSTAR:
1823 case OP_CRMINSTAR:
1824 case OP_CRQUERY:
1825 case OP_CRMINQUERY:
1826 return -1;
1827
1828 case OP_CRRANGE:
1829 case OP_CRMINRANGE:
1830 if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1831 branchlength += (int)GET2(cc,1);
1832 cc += 1 + 2 * IMM2_SIZE;
1833 break;
1834
1835 default:
1836 branchlength++;
1837 }
1838 break;
1839
1840 /* Anything else is variable length */
1841
1842 case OP_ANYNL:
1843 case OP_BRAMINZERO:
1844 case OP_BRAPOS:
1845 case OP_BRAPOSZERO:
1846 case OP_BRAZERO:
1847 case OP_CBRAPOS:
1848 case OP_EXTUNI:
1849 case OP_KETRMAX:
1850 case OP_KETRMIN:
1851 case OP_KETRPOS:
1852 case OP_MINPLUS:
1853 case OP_MINPLUSI:
1854 case OP_MINQUERY:
1855 case OP_MINQUERYI:
1856 case OP_MINSTAR:
1857 case OP_MINSTARI:
1858 case OP_MINUPTO:
1859 case OP_MINUPTOI:
1860 case OP_NOTMINPLUS:
1861 case OP_NOTMINPLUSI:
1862 case OP_NOTMINQUERY:
1863 case OP_NOTMINQUERYI:
1864 case OP_NOTMINSTAR:
1865 case OP_NOTMINSTARI:
1866 case OP_NOTMINUPTO:
1867 case OP_NOTMINUPTOI:
1868 case OP_NOTPLUS:
1869 case OP_NOTPLUSI:
1870 case OP_NOTPOSPLUS:
1871 case OP_NOTPOSPLUSI:
1872 case OP_NOTPOSQUERY:
1873 case OP_NOTPOSQUERYI:
1874 case OP_NOTPOSSTAR:
1875 case OP_NOTPOSSTARI:
1876 case OP_NOTPOSUPTO:
1877 case OP_NOTPOSUPTOI:
1878 case OP_NOTQUERY:
1879 case OP_NOTQUERYI:
1880 case OP_NOTSTAR:
1881 case OP_NOTSTARI:
1882 case OP_NOTUPTO:
1883 case OP_NOTUPTOI:
1884 case OP_PLUS:
1885 case OP_PLUSI:
1886 case OP_POSPLUS:
1887 case OP_POSPLUSI:
1888 case OP_POSQUERY:
1889 case OP_POSQUERYI:
1890 case OP_POSSTAR:
1891 case OP_POSSTARI:
1892 case OP_POSUPTO:
1893 case OP_POSUPTOI:
1894 case OP_QUERY:
1895 case OP_QUERYI:
1896 case OP_REF:
1897 case OP_REFI:
1898 case OP_DNREF:
1899 case OP_DNREFI:
1900 case OP_SBRA:
1901 case OP_SBRAPOS:
1902 case OP_SCBRA:
1903 case OP_SCBRAPOS:
1904 case OP_SCOND:
1905 case OP_SKIPZERO:
1906 case OP_STAR:
1907 case OP_STARI:
1908 case OP_TYPEMINPLUS:
1909 case OP_TYPEMINQUERY:
1910 case OP_TYPEMINSTAR:
1911 case OP_TYPEMINUPTO:
1912 case OP_TYPEPLUS:
1913 case OP_TYPEPOSPLUS:
1914 case OP_TYPEPOSQUERY:
1915 case OP_TYPEPOSSTAR:
1916 case OP_TYPEPOSUPTO:
1917 case OP_TYPEQUERY:
1918 case OP_TYPESTAR:
1919 case OP_TYPEUPTO:
1920 case OP_UPTO:
1921 case OP_UPTOI:
1922 return -1;
1923
1924 /* Catch unrecognized opcodes so that when new ones are added they
1925 are not forgotten, as has happened in the past. */
1926
1927 default:
1928 return -4;
1929 }
1930 }
1931 /* Control never gets here */
1932 }
1933
1934
1935
1936 /*************************************************
1937 * Scan compiled regex for specific bracket *
1938 *************************************************/
1939
1940 /* This little function scans through a compiled pattern until it finds a
1941 capturing bracket with the given number, or, if the number is negative, an
1942 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1943 so that it can be called from pcre_study() when finding the minimum matching
1944 length.
1945
1946 Arguments:
1947 code points to start of expression
1948 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
1949 number the required bracket number or negative to find a lookbehind
1950
1951 Returns: pointer to the opcode for the bracket, or NULL if not found
1952 */
1953
1954 const pcre_uchar *
1955 PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
1956 {
1957 for (;;)
1958 {
1959 register pcre_uchar c = *code;
1960
1961 if (c == OP_END) return NULL;
1962
1963 /* XCLASS is used for classes that cannot be represented just by a bit
1964 map. This includes negated single high-valued characters. The length in
1965 the table is zero; the actual length is stored in the compiled code. */
1966
1967 if (c == OP_XCLASS) code += GET(code, 1);
1968
1969 /* Handle recursion */
1970
1971 else if (c == OP_REVERSE)
1972 {
1973 if (number < 0) return (pcre_uchar *)code;
1974 code += PRIV(OP_lengths)[c];
1975 }
1976
1977 /* Handle capturing bracket */
1978
1979 else if (c == OP_CBRA || c == OP_SCBRA ||
1980 c == OP_CBRAPOS || c == OP_SCBRAPOS)
1981 {
1982 int n = (int)GET2(code, 1+LINK_SIZE);
1983 if (n == number) return (pcre_uchar *)code;
1984 code += PRIV(OP_lengths)[c];
1985 }
1986
1987 /* Otherwise, we can get the item's length from the table, except that for
1988 repeated character types, we have to test for \p and \P, which have an extra
1989 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1990 must add in its length. */
1991
1992 else
1993 {
1994 switch(c)
1995 {
1996 case OP_TYPESTAR:
1997 case OP_TYPEMINSTAR:
1998 case OP_TYPEPLUS:
1999 case OP_TYPEMINPLUS:
2000 case OP_TYPEQUERY:
2001 case OP_TYPEMINQUERY:
2002 case OP_TYPEPOSSTAR:
2003 case OP_TYPEPOSPLUS:
2004 case OP_TYPEPOSQUERY:
2005 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2006 break;
2007
2008 case OP_TYPEUPTO:
2009 case OP_TYPEMINUPTO:
2010 case OP_TYPEEXACT:
2011 case OP_TYPEPOSUPTO:
2012 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2013 code += 2;
2014 break;
2015
2016 case OP_MARK:
2017 case OP_PRUNE_ARG:
2018 case OP_SKIP_ARG:
2019 case OP_THEN_ARG:
2020 code += code[1];
2021 break;
2022 }
2023
2024 /* Add in the fixed length from the table */
2025
2026 code += PRIV(OP_lengths)[c];
2027
2028 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2029 a multi-byte character. The length in the table is a minimum, so we have to
2030 arrange to skip the extra bytes. */
2031
2032 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2033 if (utf) switch(c)
2034 {
2035 case OP_CHAR:
2036 case OP_CHARI:
2037 case OP_EXACT:
2038 case OP_EXACTI:
2039 case OP_UPTO:
2040 case OP_UPTOI:
2041 case OP_MINUPTO:
2042 case OP_MINUPTOI:
2043 case OP_POSUPTO:
2044 case OP_POSUPTOI:
2045 case OP_STAR:
2046 case OP_STARI:
2047 case OP_MINSTAR:
2048 case OP_MINSTARI:
2049 case OP_POSSTAR:
2050 case OP_POSSTARI:
2051 case OP_PLUS:
2052 case OP_PLUSI:
2053 case OP_MINPLUS:
2054 case OP_MINPLUSI:
2055 case OP_POSPLUS:
2056 case OP_POSPLUSI:
2057 case OP_QUERY:
2058 case OP_QUERYI:
2059 case OP_MINQUERY:
2060 case OP_MINQUERYI:
2061 case OP_POSQUERY:
2062 case OP_POSQUERYI:
2063 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2064 break;
2065 }
2066 #else
2067 (void)(utf); /* Keep compiler happy by referencing function argument */
2068 #endif
2069 }
2070 }
2071 }
2072
2073
2074
2075 /*************************************************
2076 * Scan compiled regex for recursion reference *
2077 *************************************************/
2078
2079 /* This little function scans through a compiled pattern until it finds an
2080 instance of OP_RECURSE.
2081
2082 Arguments:
2083 code points to start of expression
2084 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2085
2086 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
2087 */
2088
2089 static const pcre_uchar *
2090 find_recurse(const pcre_uchar *code, BOOL utf)
2091 {
2092 for (;;)
2093 {
2094 register pcre_uchar c = *code;
2095 if (c == OP_END) return NULL;
2096 if (c == OP_RECURSE) return code;
2097
2098 /* XCLASS is used for classes that cannot be represented just by a bit
2099 map. This includes negated single high-valued characters. The length in
2100 the table is zero; the actual length is stored in the compiled code. */
2101
2102 if (c == OP_XCLASS) code += GET(code, 1);
2103
2104 /* Otherwise, we can get the item's length from the table, except that for
2105 repeated character types, we have to test for \p and \P, which have an extra
2106 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2107 must add in its length. */
2108
2109 else
2110 {
2111 switch(c)
2112 {
2113 case OP_TYPESTAR:
2114 case OP_TYPEMINSTAR:
2115 case OP_TYPEPLUS:
2116 case OP_TYPEMINPLUS:
2117 case OP_TYPEQUERY:
2118 case OP_TYPEMINQUERY:
2119 case OP_TYPEPOSSTAR:
2120 case OP_TYPEPOSPLUS:
2121 case OP_TYPEPOSQUERY:
2122 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2123 break;
2124
2125 case OP_TYPEPOSUPTO:
2126 case OP_TYPEUPTO:
2127 case OP_TYPEMINUPTO:
2128 case OP_TYPEEXACT:
2129 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2130 code += 2;
2131 break;
2132
2133 case OP_MARK:
2134 case OP_PRUNE_ARG:
2135 case OP_SKIP_ARG:
2136 case OP_THEN_ARG:
2137 code += code[1];
2138 break;
2139 }
2140
2141 /* Add in the fixed length from the table */
2142
2143 code += PRIV(OP_lengths)[c];
2144
2145 /* In UTF-8 mode, opcodes that are followed by a character may be followed
2146 by a multi-byte character. The length in the table is a minimum, so we have
2147 to arrange to skip the extra bytes. */
2148
2149 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2150 if (utf) switch(c)
2151 {
2152 case OP_CHAR:
2153 case OP_CHARI:
2154 case OP_NOT:
2155 case OP_NOTI:
2156 case OP_EXACT:
2157 case OP_EXACTI:
2158 case OP_NOTEXACT:
2159 case OP_NOTEXACTI:
2160 case OP_UPTO:
2161 case OP_UPTOI:
2162 case OP_NOTUPTO:
2163 case OP_NOTUPTOI:
2164 case OP_MINUPTO:
2165 case OP_MINUPTOI:
2166 case OP_NOTMINUPTO:
2167 case OP_NOTMINUPTOI:
2168 case OP_POSUPTO:
2169 case OP_POSUPTOI:
2170 case OP_NOTPOSUPTO:
2171 case OP_NOTPOSUPTOI:
2172 case OP_STAR:
2173 case OP_STARI:
2174 case OP_NOTSTAR:
2175 case OP_NOTSTARI:
2176 case OP_MINSTAR:
2177 case OP_MINSTARI:
2178 case OP_NOTMINSTAR:
2179 case OP_NOTMINSTARI:
2180 case OP_POSSTAR:
2181 case OP_POSSTARI:
2182 case OP_NOTPOSSTAR:
2183 case OP_NOTPOSSTARI:
2184 case OP_PLUS:
2185 case OP_PLUSI:
2186 case OP_NOTPLUS:
2187 case OP_NOTPLUSI:
2188 case OP_MINPLUS:
2189 case OP_MINPLUSI:
2190 case OP_NOTMINPLUS:
2191 case OP_NOTMINPLUSI:
2192 case OP_POSPLUS:
2193 case OP_POSPLUSI:
2194 case OP_NOTPOSPLUS:
2195 case OP_NOTPOSPLUSI:
2196 case OP_QUERY:
2197 case OP_QUERYI:
2198 case OP_NOTQUERY:
2199 case OP_NOTQUERYI:
2200 case OP_MINQUERY:
2201 case OP_MINQUERYI:
2202 case OP_NOTMINQUERY:
2203 case OP_NOTMINQUERYI:
2204 case OP_POSQUERY:
2205 case OP_POSQUERYI:
2206 case OP_NOTPOSQUERY:
2207 case OP_NOTPOSQUERYI:
2208 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2209 break;
2210 }
2211 #else
2212 (void)(utf); /* Keep compiler happy by referencing function argument */
2213 #endif
2214 }
2215 }
2216 }
2217
2218
2219
2220 /*************************************************
2221 * Scan compiled branch for non-emptiness *
2222 *************************************************/
2223
2224 /* This function scans through a branch of a compiled pattern to see whether it
2225 can match the empty string or not. It is called from could_be_empty()
2226 below and from compile_branch() when checking for an unlimited repeat of a
2227 group that can match nothing. Note that first_significant_code() skips over
2228 backward and negative forward assertions when its final argument is TRUE. If we
2229 hit an unclosed bracket, we return "empty" - this means we've struck an inner
2230 bracket whose current branch will already have been scanned.
2231
2232 Arguments:
2233 code points to start of search
2234 endcode points to where to stop
2235 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2236 cd contains pointers to tables etc.
2237 recurses chain of recurse_check to catch mutual recursion
2238
2239 Returns: TRUE if what is matched could be empty
2240 */
2241
2242 typedef struct recurse_check {
2243 struct recurse_check *prev;
2244 const pcre_uchar *group;
2245 } recurse_check;
2246
2247 static BOOL
2248 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2249 BOOL utf, compile_data *cd, recurse_check *recurses)
2250 {
2251 register pcre_uchar c;
2252 recurse_check this_recurse;
2253
2254 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2255 code < endcode;
2256 code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2257 {
2258 const pcre_uchar *ccode;
2259
2260 c = *code;
2261
2262 /* Skip over forward assertions; the other assertions are skipped by
2263 first_significant_code() with a TRUE final argument. */
2264
2265 if (c == OP_ASSERT)
2266 {
2267 do code += GET(code, 1); while (*code == OP_ALT);
2268 c = *code;
2269 continue;
2270 }
2271
2272 /* For a recursion/subroutine call, if its end has been reached, which
2273 implies a backward reference subroutine call, we can scan it. If it's a
2274 forward reference subroutine call, we can't. To detect forward reference
2275 we have to scan up the list that is kept in the workspace. This function is
2276 called only when doing the real compile, not during the pre-compile that
2277 measures the size of the compiled pattern. */
2278
2279 if (c == OP_RECURSE)
2280 {
2281 const pcre_uchar *scode = cd->start_code + GET(code, 1);
2282 BOOL empty_branch;
2283
2284 /* Test for forward reference or uncompleted reference. This is disabled
2285 when called to scan a completed pattern by setting cd->start_workspace to
2286 NULL. */
2287
2288 if (cd->start_workspace != NULL)
2289 {
2290 const pcre_uchar *tcode;
2291 for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2292 if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2293 if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
2294 }
2295
2296 /* If we are scanning a completed pattern, there are no forward references
2297 and all groups are complete. We need to detect whether this is a recursive
2298 call, as otherwise there will be an infinite loop. If it is a recursion,
2299 just skip over it. Simple recursions are easily detected. For mutual
2300 recursions we keep a chain on the stack. */
2301
2302 else
2303 {
2304 recurse_check *r = recurses;
2305 const pcre_uchar *endgroup = scode;
2306
2307 do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2308 if (code >= scode && code <= endgroup) continue; /* Simple recursion */
2309
2310 for (r = recurses; r != NULL; r = r->prev)
2311 if (r->group == scode) break;
2312 if (r != NULL) continue; /* Mutual recursion */
2313 }
2314
2315 /* Completed reference; scan the referenced group, remembering it on the
2316 stack chain to detect mutual recursions. */
2317
2318 empty_branch = FALSE;
2319 this_recurse.prev = recurses;
2320 this_recurse.group = scode;
2321
2322 do
2323 {
2324 if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
2325 {
2326 empty_branch = TRUE;
2327 break;
2328 }
2329 scode += GET(scode, 1);
2330 }
2331 while (*scode == OP_ALT);
2332
2333 if (!empty_branch) return FALSE; /* All branches are non-empty */
2334 continue;
2335 }
2336
2337 /* Groups with zero repeats can of course be empty; skip them. */
2338
2339 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2340 c == OP_BRAPOSZERO)
2341 {
2342 code += PRIV(OP_lengths)[c];
2343 do code += GET(code, 1); while (*code == OP_ALT);
2344 c = *code;
2345 continue;
2346 }
2347
2348 /* A nested group that is already marked as "could be empty" can just be
2349 skipped. */
2350
2351 if (c == OP_SBRA || c == OP_SBRAPOS ||
2352 c == OP_SCBRA || c == OP_SCBRAPOS)
2353 {
2354 do code += GET(code, 1); while (*code == OP_ALT);
2355 c = *code;
2356 continue;
2357 }
2358
2359 /* For other groups, scan the branches. */
2360
2361 if (c == OP_BRA || c == OP_BRAPOS ||
2362 c == OP_CBRA || c == OP_CBRAPOS ||
2363 c == OP_ONCE || c == OP_ONCE_NC ||
2364 c == OP_COND)
2365 {
2366 BOOL empty_branch;
2367 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
2368
2369 /* If a conditional group has only one branch, there is a second, implied,
2370 empty branch, so just skip over the conditional, because it could be empty.
2371 Otherwise, scan the individual branches of the group. */
2372
2373 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2374 code += GET(code, 1);
2375 else
2376 {
2377 empty_branch = FALSE;
2378 do
2379 {
2380 if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd, NULL))
2381 empty_branch = TRUE;
2382 code += GET(code, 1);
2383 }
2384 while (*code == OP_ALT);
2385 if (!empty_branch) return FALSE; /* All branches are non-empty */
2386 }
2387
2388 c = *code;
2389 continue;
2390 }
2391
2392 /* Handle the other opcodes */
2393
2394 switch (c)
2395 {
2396 /* Check for quantifiers after a class. XCLASS is used for classes that
2397 cannot be represented just by a bit map. This includes negated single
2398 high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2399 actual length is stored in the compiled code, so we must update "code"
2400 here. */
2401
2402 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2403 case OP_XCLASS:
2404 ccode = code += GET(code, 1);
2405 goto CHECK_CLASS_REPEAT;
2406 #endif
2407
2408 case OP_CLASS:
2409 case OP_NCLASS:
2410 ccode = code + PRIV(OP_lengths)[OP_CLASS];
2411
2412 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2413 CHECK_CLASS_REPEAT:
2414 #endif
2415
2416 switch (*ccode)
2417 {
2418 case OP_CRSTAR: /* These could be empty; continue */
2419 case OP_CRMINSTAR:
2420 case OP_CRQUERY:
2421 case OP_CRMINQUERY:
2422 break;
2423
2424 default: /* Non-repeat => class must match */
2425 case OP_CRPLUS: /* These repeats aren't empty */
2426 case OP_CRMINPLUS:
2427 return FALSE;
2428
2429 case OP_CRRANGE:
2430 case OP_CRMINRANGE:
2431 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2432 break;
2433 }
2434 break;
2435
2436 /* Opcodes that must match a character */
2437
2438 case OP_ANY:
2439 case OP_ALLANY:
2440 case OP_ANYBYTE:
2441
2442 case OP_PROP:
2443 case OP_NOTPROP:
2444 case OP_ANYNL:
2445
2446 case OP_NOT_HSPACE:
2447 case OP_HSPACE:
2448 case OP_NOT_VSPACE:
2449 case OP_VSPACE:
2450 case OP_EXTUNI:
2451
2452 case OP_NOT_DIGIT:
2453 case OP_DIGIT:
2454 case OP_NOT_WHITESPACE:
2455 case OP_WHITESPACE:
2456 case OP_NOT_WORDCHAR:
2457 case OP_WORDCHAR:
2458
2459 case OP_CHAR:
2460 case OP_CHARI:
2461 case OP_NOT:
2462 case OP_NOTI:
2463
2464 case OP_PLUS:
2465 case OP_PLUSI:
2466 case OP_MINPLUS:
2467 case OP_MINPLUSI:
2468
2469 case OP_NOTPLUS:
2470 case OP_NOTPLUSI:
2471 case OP_NOTMINPLUS:
2472 case OP_NOTMINPLUSI:
2473
2474 case OP_POSPLUS:
2475 case OP_POSPLUSI:
2476 case OP_NOTPOSPLUS:
2477 case OP_NOTPOSPLUSI:
2478
2479 case OP_EXACT:
2480 case OP_EXACTI:
2481 case OP_NOTEXACT:
2482 case OP_NOTEXACTI:
2483
2484 case OP_TYPEPLUS:
2485 case OP_TYPEMINPLUS:
2486 case OP_TYPEPOSPLUS:
2487 case OP_TYPEEXACT:
2488
2489 return FALSE;
2490
2491 /* These are going to continue, as they may be empty, but we have to
2492 fudge the length for the \p and \P cases. */
2493
2494 case OP_TYPESTAR:
2495 case OP_TYPEMINSTAR:
2496 case OP_TYPEPOSSTAR:
2497 case OP_TYPEQUERY:
2498 case OP_TYPEMINQUERY:
2499 case OP_TYPEPOSQUERY:
2500 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2501 break;
2502
2503 /* Same for these */
2504
2505 case OP_TYPEUPTO:
2506 case OP_TYPEMINUPTO:
2507 case OP_TYPEPOSUPTO:
2508 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2509 code += 2;
2510 break;
2511
2512 /* End of branch */
2513
2514 case OP_KET:
2515 case OP_KETRMAX:
2516 case OP_KETRMIN:
2517 case OP_KETRPOS:
2518 case OP_ALT:
2519 return TRUE;
2520
2521 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2522 MINUPTO, and POSUPTO and their caseless and negative versions may be
2523 followed by a multibyte character. */
2524
2525 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2526 case OP_STAR:
2527 case OP_STARI:
2528 case OP_NOTSTAR:
2529 case OP_NOTSTARI:
2530
2531 case OP_MINSTAR:
2532 case OP_MINSTARI:
2533 case OP_NOTMINSTAR:
2534 case OP_NOTMINSTARI:
2535
2536 case OP_POSSTAR:
2537 case OP_POSSTARI:
2538 case OP_NOTPOSSTAR:
2539 case OP_NOTPOSSTARI:
2540
2541 case OP_QUERY:
2542 case OP_QUERYI:
2543 case OP_NOTQUERY:
2544 case OP_NOTQUERYI:
2545
2546 case OP_MINQUERY:
2547 case OP_MINQUERYI:
2548 case OP_NOTMINQUERY:
2549 case OP_NOTMINQUERYI:
2550
2551 case OP_POSQUERY:
2552 case OP_POSQUERYI:
2553 case OP_NOTPOSQUERY:
2554 case OP_NOTPOSQUERYI:
2555
2556 if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2557 break;
2558
2559 case OP_UPTO:
2560 case OP_UPTOI:
2561 case OP_NOTUPTO:
2562 case OP_NOTUPTOI:
2563
2564 case OP_MINUPTO:
2565 case OP_MINUPTOI:
2566 case OP_NOTMINUPTO:
2567 case OP_NOTMINUPTOI:
2568
2569 case OP_POSUPTO:
2570 case OP_POSUPTOI:
2571 case OP_NOTPOSUPTO:
2572 case OP_NOTPOSUPTOI:
2573
2574 if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2575 break;
2576 #endif
2577
2578 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2579 string. */
2580
2581 case OP_MARK:
2582 case OP_PRUNE_ARG:
2583 case OP_SKIP_ARG:
2584 case OP_THEN_ARG:
2585 code += code[1];
2586 break;
2587
2588 /* None of the remaining opcodes are required to match a character. */
2589
2590 default:
2591 break;
2592 }
2593 }
2594
2595 return TRUE;
2596 }
2597
2598
2599
2600 /*************************************************
2601 * Scan compiled regex for non-emptiness *
2602 *************************************************/
2603
2604 /* This function is called to check for left recursive calls. We want to check
2605 the current branch of the current pattern to see if it could match the empty
2606 string. If it could, we must look outwards for branches at other levels,
2607 stopping when we pass beyond the bracket which is the subject of the recursion.
2608 This function is called only during the real compile, not during the
2609 pre-compile.
2610
2611 Arguments:
2612 code points to start of the recursion
2613 endcode points to where to stop (current RECURSE item)
2614 bcptr points to the chain of current (unclosed) branch starts
2615 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2616 cd pointers to tables etc
2617
2618 Returns: TRUE if what is matched could be empty
2619 */
2620
2621 static BOOL
2622 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2623 branch_chain *bcptr, BOOL utf, compile_data *cd)
2624 {
2625 while (bcptr != NULL && bcptr->current_branch >= code)
2626 {
2627 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL))
2628 return FALSE;
2629 bcptr = bcptr->outer;
2630 }
2631 return TRUE;
2632 }
2633
2634
2635
2636 /*************************************************
2637 * Base opcode of repeated opcodes *
2638 *************************************************/
2639
2640 /* Returns the base opcode for repeated single character type opcodes. If the
2641 opcode is not a repeated character type, it returns with the original value.
2642
2643 Arguments: c opcode
2644 Returns: base opcode for the type
2645 */
2646
2647 static pcre_uchar
2648 get_repeat_base(pcre_uchar c)
2649 {
2650 return (c > OP_TYPEPOSUPTO)? c :
2651 (c >= OP_TYPESTAR)? OP_TYPESTAR :
2652 (c >= OP_NOTSTARI)? OP_NOTSTARI :
2653 (c >= OP_NOTSTAR)? OP_NOTSTAR :
2654 (c >= OP_STARI)? OP_STARI :
2655 OP_STAR;
2656 }
2657
2658
2659
2660 #ifdef SUPPORT_UCP
2661 /*************************************************
2662 * Check a character and a property *
2663 *************************************************/
2664
2665 /* This function is called by check_auto_possessive() when a property item
2666 is adjacent to a fixed character.
2667
2668 Arguments:
2669 c the character
2670 ptype the property type
2671 pdata the data for the type
2672 negated TRUE if it's a negated property (\P or \p{^)
2673
2674 Returns: TRUE if auto-possessifying is OK
2675 */
2676
2677 static BOOL
2678 check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
2679 BOOL negated)
2680 {
2681 const pcre_uint32 *p;
2682 const ucd_record *prop = GET_UCD(c);
2683
2684 switch(ptype)
2685 {
2686 case PT_LAMP:
2687 return (prop->chartype == ucp_Lu ||
2688 prop->chartype == ucp_Ll ||
2689 prop->chartype == ucp_Lt) == negated;
2690
2691 case PT_GC:
2692 return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2693
2694 case PT_PC:
2695 return (pdata == prop->chartype) == negated;
2696
2697 case PT_SC:
2698 return (pdata == prop->script) == negated;
2699
2700 /* These are specials */
2701
2702 case PT_ALNUM:
2703 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2704 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2705
2706 /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
2707 means that Perl space and POSIX space are now identical. PCRE was changed
2708 at release 8.34. */
2709
2710 case PT_SPACE: /* Perl space */
2711 case PT_PXSPACE: /* POSIX space */
2712 switch(c)
2713 {
2714 HSPACE_CASES:
2715 VSPACE_CASES:
2716 return negated;
2717
2718 default:
2719 return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
2720 }
2721 break; /* Control never reaches here */
2722
2723 case PT_WORD:
2724 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2725 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2726 c == CHAR_UNDERSCORE) == negated;
2727
2728 case PT_CLIST:
2729 p = PRIV(ucd_caseless_sets) + prop->caseset;
2730 for (;;)
2731 {
2732 if (c < *p) return !negated;
2733 if (c == *p++) return negated;
2734 }
2735 break; /* Control never reaches here */
2736 }
2737
2738 return FALSE;
2739 }
2740 #endif /* SUPPORT_UCP */
2741
2742
2743
2744 /*************************************************
2745 * Fill the character property list *
2746 *************************************************/
2747
2748 /* Checks whether the code points to an opcode that can take part in auto-
2749 possessification, and if so, fills a list with its properties.
2750
2751 Arguments:
2752 code points to start of expression
2753 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2754 fcc points to case-flipping table
2755 list points to output list
2756 list[0] will be filled with the opcode
2757 list[1] will be non-zero if this opcode
2758 can match an empty character string
2759 list[2..7] depends on the opcode
2760
2761 Returns: points to the start of the next opcode if *code is accepted
2762 NULL if *code is not accepted
2763 */
2764
2765 static const pcre_uchar *
2766 get_chr_property_list(const pcre_uchar *code, BOOL utf,
2767 const pcre_uint8 *fcc, pcre_uint32 *list)
2768 {
2769 pcre_uchar c = *code;
2770 const pcre_uchar *end;
2771 const pcre_uint32 *clist_src;
2772 pcre_uint32 *clist_dest;
2773 pcre_uint32 chr;
2774 pcre_uchar base;
2775
2776 list[0] = c;
2777 list[1] = FALSE;
2778 code++;
2779
2780 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
2781 {
2782 base = get_repeat_base(c);
2783 c -= (base - OP_STAR);
2784
2785 if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
2786 code += IMM2_SIZE;
2787
2788 list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
2789
2790 switch(base)
2791 {
2792 case OP_STAR:
2793 list[0] = OP_CHAR;
2794 break;
2795
2796 case OP_STARI:
2797 list[0] = OP_CHARI;
2798 break;
2799
2800 case OP_NOTSTAR:
2801 list[0] = OP_NOT;
2802 break;
2803
2804 case OP_NOTSTARI:
2805 list[0] = OP_NOTI;
2806 break;
2807
2808 case OP_TYPESTAR:
2809 list[0] = *code;
2810 code++;
2811 break;
2812 }
2813 c = list[0];
2814 }
2815
2816 switch(c)
2817 {
2818 case OP_NOT_DIGIT:
2819 case OP_DIGIT:
2820 case OP_NOT_WHITESPACE:
2821 case OP_WHITESPACE:
2822 case OP_NOT_WORDCHAR:
2823 case OP_WORDCHAR:
2824 case OP_ANY:
2825 case OP_ALLANY:
2826 case OP_ANYNL:
2827 case OP_NOT_HSPACE:
2828 case OP_HSPACE:
2829 case OP_NOT_VSPACE:
2830 case OP_VSPACE:
2831 case OP_EXTUNI:
2832 case OP_EODN:
2833 case OP_EOD:
2834 case OP_DOLL:
2835 case OP_DOLLM:
2836 return code;
2837
2838 case OP_CHAR:
2839 case OP_NOT:
2840 GETCHARINCTEST(chr, code);
2841 list[2] = chr;
2842 list[3] = NOTACHAR;
2843 return code;
2844
2845 case OP_CHARI:
2846 case OP_NOTI:
2847 list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
2848 GETCHARINCTEST(chr, code);
2849 list[2] = chr;
2850
2851 #ifdef SUPPORT_UCP
2852 if (chr < 128 || (chr < 256 && !utf))
2853 list[3] = fcc[chr];
2854 else
2855 list[3] = UCD_OTHERCASE(chr);
2856 #elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
2857 list[3] = (chr < 256) ? fcc[chr] : chr;
2858 #else
2859 list[3] = fcc[chr];
2860 #endif
2861
2862 /* The othercase might be the same value. */
2863
2864 if (chr == list[3])
2865 list[3] = NOTACHAR;
2866 else
2867 list[4] = NOTACHAR;
2868 return code;
2869
2870 #ifdef SUPPORT_UCP
2871 case OP_PROP:
2872 case OP_NOTPROP:
2873 if (code[0] != PT_CLIST)
2874 {
2875 list[2] = code[0];
2876 list[3] = code[1];
2877 return code + 2;
2878 }
2879
2880 /* Convert only if we have enough space. */
2881
2882 clist_src = PRIV(ucd_caseless_sets) + code[1];
2883 clist_dest = list + 2;
2884 code += 2;
2885
2886 do {
2887 if (clist_dest >= list + 8)
2888 {
2889 /* Early return if there is not enough space. This should never
2890 happen, since all clists are shorter than 5 character now. */
2891 list[2] = code[0];
2892 list[3] = code[1];
2893 return code;
2894 }
2895 *clist_dest++ = *clist_src;
2896 }
2897 while(*clist_src++ != NOTACHAR);
2898
  /* All characters are stored. The terminating NOTACHAR
  is copied from the clist itself. */
2901
2902 list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
2903 return code;
2904 #endif
2905
2906 case OP_NCLASS:
2907 case OP_CLASS:
2908 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2909 case OP_XCLASS:
2910
2911 if (c == OP_XCLASS)
2912 end = code + GET(code, 0);
2913 else
2914 #endif
2915 end = code + 32 / sizeof(pcre_uchar);
2916
2917 switch(*end)
2918 {
2919 case OP_CRSTAR:
2920 case OP_CRMINSTAR:
2921 case OP_CRQUERY:
2922 case OP_CRMINQUERY:
2923 list[1] = TRUE;
2924 end++;
2925 break;
2926
2927 case OP_CRRANGE:
2928 case OP_CRMINRANGE:
2929 list[1] = (GET2(end, 1) == 0);
2930 end += 1 + 2 * IMM2_SIZE;
2931 break;
2932 }
2933 list[2] = end - code;
2934 return end;
2935 }
2936 return NULL; /* Opcode not accepted */
2937 }
2938
2939
2940
2941 /*************************************************
2942 * Scan further character sets for match *
2943 *************************************************/
2944
2945 /* Checks whether the base and the current opcode have a common character, in
2946 which case the base cannot be possessified.
2947
2948 Arguments:
2949 code points to the byte code
2950 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2951 cd static compile data
2952 base_list the data list of the base opcode
2953
2954 Returns: TRUE if the auto-possessification is possible
2955 */
2956
2957 static BOOL
2958 compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
2959 const pcre_uint32* base_list)
2960 {
2961 pcre_uchar c;
2962 pcre_uint32 list[8];
2963 const pcre_uint32* chr_ptr;
2964 const pcre_uint32* ochr_ptr;
2965 const pcre_uint32* list_ptr;
2966 const pcre_uchar *next_code;
2967 pcre_uint32 chr;
2968
2969 /* Note: the base_list[1] contains whether the current opcode has greedy
2970 (represented by a non-zero value) quantifier. This is a different from
2971 other character type lists, which stores here that the character iterator
2972 matches to an empty string (also represented by a non-zero value). */
2973
2974 for(;;)
2975 {
2976 c = *code;
2977
2978 /* Skip over callouts */
2979
2980 if (c == OP_CALLOUT)
2981 {
2982 code += PRIV(OP_lengths)[c];
2983 continue;
2984 }
2985
2986 if (c == OP_ALT)
2987 {
2988 do code += GET(code, 1); while (*code == OP_ALT);
2989 c = *code;
2990 }
2991
2992 switch(c)
2993 {
2994 case OP_END:
2995 case OP_KETRPOS:
2996 /* TRUE only in greedy case. The non-greedy case could be replaced by
2997 an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
2998 uses more memory, which we cannot get at this stage.) */
2999
3000 return base_list[1] != 0;
3001
3002 case OP_KET:
3003 /* If the bracket is capturing, and referenced by an OP_RECURSE, or
3004 it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3005 cannot be converted to a possessive form. */
3006
3007 if (base_list[1] == 0) return FALSE;
3008
3009 switch(*(code - GET(code, 1)))
3010 {
3011 case OP_ASSERT:
3012 case OP_ASSERT_NOT:
3013 case OP_ASSERTBACK:
3014 case OP_ASSERTBACK_NOT:
3015 case OP_ONCE:
3016 case OP_ONCE_NC:
3017 /* Atomic sub-patterns and assertions can always auto-possessify their
3018 last iterator. */
3019 return TRUE;
3020 }
3021
3022 code += PRIV(OP_lengths)[c];
3023 continue;
3024
3025 case OP_ONCE:
3026 case OP_ONCE_NC:
3027 case OP_BRA:
3028 case OP_CBRA:
3029 next_code = code;
3030 do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3031
3032 /* We do not support repeated brackets, because they can lead to
3033 infinite recursion. */
3034
3035 if (*next_code != OP_KET) return FALSE;
3036
3037 next_code = code + GET(code, 1);
3038 code += PRIV(OP_lengths)[c];
3039
3040 while (*next_code == OP_ALT)
3041 {
3042 if (!compare_opcodes(code, utf, cd, base_list)) return FALSE;
3043 code = next_code + 1 + LINK_SIZE;
3044 next_code += GET(next_code, 1);
3045 }
3046 continue;
3047
3048 case OP_BRAZERO:
3049 case OP_BRAMINZERO:
3050
3051 next_code = code + 1;
3052 if (*next_code != OP_BRA && *next_code != OP_CBRA)
3053 return FALSE;
3054
3055 do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3056
3057 /* We do not support repeated brackets, because they can lead to
3058 infinite recursion. */
3059 if (*next_code != OP_KET) return FALSE;
3060
3061 /* The bracket content will be checked by the
3062 OP_BRA/OP_CBRA case above. */
3063 next_code += 1 + LINK_SIZE;
3064 if (!compare_opcodes(next_code, utf, cd, base_list)) return FALSE;
3065
3066 code += PRIV(OP_lengths)[c];
3067 continue;
3068 }
3069
3070 /* Check for a supported opcode, and load its properties. */
3071
3072 code = get_chr_property_list(code, utf, cd->fcc, list);
3073 if (code == NULL) return FALSE; /* Unsupported */
3074
3075 /* If either opcode is a small character list, set pointers for comparing
3076 characters from that list with another list, or with a property. */
3077
3078 if (base_list[0] == OP_CHAR)
3079 {
3080 chr_ptr = base_list + 2;
3081 list_ptr = list;
3082 }
3083 else if (list[0] == OP_CHAR)
3084 {
3085 chr_ptr = list + 2;
3086 list_ptr = base_list;
3087 }
3088
3089 /* Some property combinations also acceptable. Unicode property opcodes are
3090 processed specially; the rest can be handled with a lookup table. */
3091
3092 else
3093 {
3094 pcre_uint32 leftop, rightop;
3095
3096 if (list[1] != 0) return FALSE; /* Must match at least one character */
3097 leftop = base_list[0];
3098 rightop = list[0];
3099
3100 #ifdef SUPPORT_UCP
3101 if (leftop == OP_PROP || leftop == OP_NOTPROP)
3102 {
3103 if (rightop == OP_EOD) return TRUE;
3104 if (rightop == OP_PROP || rightop == OP_NOTPROP)
3105 {
3106 int n;
3107 const pcre_uint8 *p;
3108 BOOL same = leftop == rightop;
3109 BOOL lisprop = leftop == OP_PROP;
3110 BOOL risprop = rightop == OP_PROP;
3111 BOOL bothprop = lisprop && risprop;
3112
3113 /* There's a table that specifies how each combination is to be
3114 processed:
3115 0 Always return FALSE (never auto-possessify)
3116 1 Character groups are distinct (possessify if both are OP_PROP)
3117 2 Check character categories in the same group (general or particular)
3118 3 Return TRUE if the two opcodes are not the same
3119 ... see comments below
3120 */
3121
3122 n = propposstab[base_list[2]][list[2]];
3123 switch(n)
3124 {
3125 case 0: return FALSE;
3126 case 1: return bothprop;
3127 case 2: return (base_list[3] == list[3]) != same;
3128 case 3: return !same;
3129
3130 case 4: /* Left general category, right particular category */
3131 return risprop && catposstab[base_list[3]][list[3]] == same;
3132
3133 case 5: /* Right general category, left particular category */
3134 return lisprop && catposstab[list[3]][base_list[3]] == same;
3135
3136 /* This code is logically tricky. Think hard before fiddling with it.
3137 The posspropstab table has four entries per row. Each row relates to
3138 one of PCRE's special properties such as ALNUM or SPACE or WORD.
3139 Only WORD actually needs all four entries, but using repeats for the
3140 others means they can all use the same code below.
3141
3142 The first two entries in each row are Unicode general categories, and
3143 apply always, because all the characters they include are part of the
3144 PCRE character set. The third and fourth entries are a general and a
3145 particular category, respectively, that include one or more relevant
3146 characters. One or the other is used, depending on whether the check
3147 is for a general or a particular category. However, in both cases the
3148 category contains more characters than the specials that are defined
3149 for the property being tested against. Therefore, it cannot be used
3150 in a NOTPROP case.
3151
3152 Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3153 Underscore is covered by ucp_P or ucp_Po. */
3154
3155 case 6: /* Left alphanum vs right general category */
3156 case 7: /* Left space vs right general category */
3157 case 8: /* Left word vs right general category */
3158 p = posspropstab[n-6];
3159 return risprop && lisprop ==
3160 (list[3] != p[0] &&
3161 list[3] != p[1] &&
3162 (list[3] != p[2] || !lisprop));
3163
3164 case 9: /* Right alphanum vs left general category */
3165 case 10: /* Right space vs left general category */
3166 case 11: /* Right word vs left general category */
3167 p = posspropstab[n-9];
3168 return lisprop && risprop ==
3169 (base_list[3] != p[0] &&
3170 base_list[3] != p[1] &&
3171 (base_list[3] != p[2] || !risprop));
3172
3173 case 12: /* Left alphanum vs right particular category */
3174 case 13: /* Left space vs right particular category */
3175 case 14: /* Left word vs right particular category */
3176 p = posspropstab[n-12];
3177 return risprop && lisprop ==
3178 (catposstab[p[0]][list[3]] &&
3179 catposstab[p[1]][list[3]] &&
3180 (list[3] != p[3] || !lisprop));
3181
3182 case 15: /* Right alphanum vs left particular category */
3183 case 16: /* Right space vs left particular category */
3184 case 17: /* Right word vs left particular category */
3185 p = posspropstab[n-15];
3186 return lisprop && risprop ==
3187 (catposstab[p[0]][base_list[3]] &&
3188 catposstab[p[1]][base_list[3]] &&
3189 (base_list[3] != p[3] || !risprop));
3190 }
3191 }
3192 return FALSE;
3193 }
3194
3195 else
3196 #endif /* SUPPORT_UCP */
3197
3198 return leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3199 rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3200 autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3201 }
3202
3203 /* Control reaches here only if one of the items is a small character list.
3204 All characters are checked against the other side. */
3205
3206 do
3207 {
3208 chr = *chr_ptr;
3209
3210 switch(list_ptr[0])
3211 {
3212 case OP_CHAR:
3213 ochr_ptr = list_ptr + 2;
3214 do
3215 {
3216 if (chr == *ochr_ptr) return FALSE;
3217 ochr_ptr++;
3218 }
3219 while(*ochr_ptr != NOTACHAR);
3220 break;
3221
3222 case OP_NOT:
3223 ochr_ptr = list_ptr + 2;
3224 do
3225 {
3226 if (chr == *ochr_ptr)
3227 break;
3228 ochr_ptr++;
3229 }
3230 while(*ochr_ptr != NOTACHAR);
3231 if (*ochr_ptr == NOTACHAR) return FALSE; /* Not found */
3232 break;
3233
3234 /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
3235 set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3236
3237 case OP_DIGIT:
3238 if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
3239 break;
3240
3241 case OP_NOT_DIGIT:
3242 if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
3243 break;
3244
3245 case OP_WHITESPACE:
3246 if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
3247 break;
3248
3249 case OP_NOT_WHITESPACE:
3250 if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
3251 break;
3252
3253 case OP_WORDCHAR:
3254 if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
3255 break;
3256
3257 case OP_NOT_WORDCHAR:
3258 if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
3259 break;
3260
3261 case OP_HSPACE:
3262 switch(chr)
3263 {
3264 HSPACE_CASES: return FALSE;
3265 default: break;
3266 }
3267 break;
3268
3269 case OP_NOT_HSPACE:
3270 switch(chr)
3271 {
3272 HSPACE_CASES: break;
3273 default: return FALSE;
3274 }
3275 break;
3276
3277 case OP_ANYNL:
3278 case OP_VSPACE:
3279 switch(chr)
3280 {
3281 VSPACE_CASES: return FALSE;
3282 default: break;
3283 }
3284 break;
3285
3286 case OP_NOT_VSPACE:
3287 switch(chr)
3288 {
3289 VSPACE_CASES: break;
3290 default: return FALSE;
3291 }
3292 break;
3293
3294 case OP_DOLL:
3295 case OP_EODN:
3296 switch (chr)
3297 {
3298 case CHAR_CR:
3299 case CHAR_LF:
3300 case CHAR_VT:
3301 case CHAR_FF:
3302 case CHAR_NEL:
3303 #ifndef EBCDIC
3304 case 0x2028:
3305 case 0x2029:
3306 #endif /* Not EBCDIC */
3307 return FALSE;
3308 }
3309 break;
3310
3311 case OP_EOD: /* Can always possessify before \z */
3312 break;
3313
3314 case OP_PROP:
3315 case OP_NOTPROP:
3316 if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3317 list_ptr[0] == OP_NOTPROP))
3318 return FALSE;
3319 break;
3320
3321 /* The class comparisons work only when the class is the second item
3322 of the pair, because there are at present no possessive forms of the
3323 class opcodes. Note also that the "code" variable that is used below
3324 points after the second item, and that the pointer for the first item
3325 is not available, so even if there were possessive forms of the class
3326 opcodes, the correct comparison could not be done. */
3327
3328 case OP_NCLASS:
3329 if (chr > 255) return FALSE;
3330 /* Fall through */
3331
3332 case OP_CLASS:
3333 if (list_ptr != list) return FALSE; /* Class is first opcode */
3334 if (chr > 255) break;
3335 if ((((pcre_uint8 *)(code - list_ptr[2]))[chr >> 3] & (1 << (chr & 7))) != 0)
3336 return FALSE;
3337 break;
3338
3339 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3340 case OP_XCLASS:
3341 if (list_ptr != list) return FALSE; /* Class is first opcode */
3342 if (PRIV(xclass)(chr, code - list_ptr[2] + LINK_SIZE, utf))
3343 return FALSE;
3344 break;
3345 #endif
3346
3347 default:
3348 return FALSE;
3349 }
3350
3351 chr_ptr++;
3352 }
3353 while(*chr_ptr != NOTACHAR);
3354
3355 /* At least one character must be matched from this opcode. */
3356
3357 if (list[1] == 0) return TRUE;
3358 }
3359
3360 return FALSE;
3361 }
3362
3363
3364
3365 /*************************************************
3366 * Scan compiled regex for auto-possession *
3367 *************************************************/
3368
3369 /* Replaces single character iterations with their possessive alternatives
3370 if appropriate. This function modifies the compiled opcode!
3371
3372 Arguments:
3373 code points to start of the byte code
3374 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3375 cd static compile data
3376
3377 Returns: nothing
3378 */
3379
3380 static void
3381 auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
3382 {
3383 register pcre_uchar c;
3384 const pcre_uchar *end;
3385 pcre_uint32 list[8];
3386
3387 for (;;)
3388 {
3389 c = *code;
3390
3391 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
3392 {
3393 c -= get_repeat_base(c) - OP_STAR;
3394 end = (c <= OP_MINUPTO) ?
3395 get_chr_property_list(code, utf, cd->fcc, list) : NULL;
3396 list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3397
3398 if (end != NULL && compare_opcodes(end, utf, cd, list))
3399 {
3400 switch(c)
3401 {
3402 case OP_STAR:
3403 *code += OP_POSSTAR - OP_STAR;
3404 break;
3405
3406 case OP_MINSTAR:
3407 *code += OP_POSSTAR - OP_MINSTAR;
3408 break;
3409
3410 case OP_PLUS:
3411 *code += OP_POSPLUS - OP_PLUS;
3412 break;
3413
3414 case OP_MINPLUS:
3415 *code += OP_POSPLUS - OP_MINPLUS;
3416 break;
3417
3418 case OP_QUERY:
3419 *code += OP_POSQUERY - OP_QUERY;
3420 break;
3421
3422 case OP_MINQUERY:
3423 *code += OP_POSQUERY - OP_MINQUERY;
3424 break;
3425
3426 case OP_UPTO:
3427 *code += OP_POSUPTO - OP_UPTO;
3428 break;
3429
3430 case OP_MINUPTO:
3431 *code += OP_MINUPTO - OP_UPTO;
3432 break;
3433 }
3434 }
3435 c = *code;
3436 }
3437
3438 switch(c)
3439 {
3440 case OP_END:
3441 return;
3442
3443 case OP_TYPESTAR:
3444 case OP_TYPEMINSTAR:
3445 case OP_TYPEPLUS:
3446 case OP_TYPEMINPLUS:
3447 case OP_TYPEQUERY:
3448 case OP_TYPEMINQUERY:
3449 case OP_TYPEPOSSTAR:
3450 case OP_TYPEPOSPLUS:
3451 case OP_TYPEPOSQUERY:
3452 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
3453 break;
3454
3455 case OP_TYPEUPTO:
3456 case OP_TYPEMINUPTO:
3457 case OP_TYPEEXACT:
3458 case OP_TYPEPOSUPTO:
3459 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
3460 code += 2;
3461 break;
3462
3463 case OP_XCLASS:
3464 code += GET(code, 1);
3465 break;
3466
3467 case OP_MARK:
3468 case OP_PRUNE_ARG:
3469 case OP_SKIP_ARG:
3470 case OP_THEN_ARG:
3471 code += code[1];
3472 break;
3473 }
3474
3475 /* Add in the fixed length from the table */
3476
3477 code += PRIV(OP_lengths)[c];
3478
3479 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
3480 a multi-byte character. The length in the table is a minimum, so we have to
3481 arrange to skip the extra bytes. */
3482
3483 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
3484 if (utf) switch(c)
3485 {
3486 case OP_CHAR:
3487 case OP_CHARI:
3488 case OP_NOT:
3489 case OP_NOTI:
3490 case OP_STAR:
3491 case OP_MINSTAR:
3492 case OP_PLUS:
3493 case OP_MINPLUS:
3494 case OP_QUERY:
3495 case OP_MINQUERY:
3496 case OP_UPTO:
3497 case OP_MINUPTO:
3498 case OP_EXACT:
3499 case OP_POSSTAR:
3500 case OP_POSPLUS:
3501 case OP_POSQUERY:
3502 case OP_POSUPTO:
3503 case OP_STARI:
3504 case OP_MINSTARI:
3505 case OP_PLUSI:
3506 case OP_MINPLUSI:
3507 case OP_QUERYI:
3508 case OP_MINQUERYI:
3509 case OP_UPTOI:
3510 case OP_MINUPTOI:
3511 case OP_EXACTI:
3512 case OP_POSSTARI:
3513 case OP_POSPLUSI:
3514 case OP_POSQUERYI:
3515 case OP_POSUPTOI:
3516 case OP_NOTSTAR:
3517 case OP_NOTMINSTAR:
3518 case OP_NOTPLUS:
3519 case OP_NOTMINPLUS:
3520 case OP_NOTQUERY:
3521 case OP_NOTMINQUERY:
3522 case OP_NOTUPTO:
3523 case OP_NOTMINUPTO:
3524 case OP_NOTEXACT:
3525 case OP_NOTPOSSTAR:
3526 case OP_NOTPOSPLUS:
3527 case OP_NOTPOSQUERY:
3528 case OP_NOTPOSUPTO:
3529 case OP_NOTSTARI:
3530 case OP_NOTMINSTARI:
3531 case OP_NOTPLUSI:
3532 case OP_NOTMINPLUSI:
3533 case OP_NOTQUERYI:
3534 case OP_NOTMINQUERYI:
3535 case OP_NOTUPTOI:
3536 case OP_NOTMINUPTOI:
3537 case OP_NOTEXACTI:
3538 case OP_NOTPOSSTARI:
3539 case OP_NOTPOSPLUSI:
3540 case OP_NOTPOSQUERYI:
3541 case OP_NOTPOSUPTOI:
3542 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
3543 break;
3544 }
3545 #else
3546 (void)(utf); /* Keep compiler happy by referencing function argument */
3547 #endif
3548 }
3549 }
3550
3551
3552
3553 /*************************************************
3554 * Check for POSIX class syntax *
3555 *************************************************/
3556
3557 /* This function is called when the sequence "[:" or "[." or "[=" is
3558 encountered in a character class. It checks whether this is followed by a
3559 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
3560 reach an unescaped ']' without the special preceding character, return FALSE.
3561
3562 Originally, this function only recognized a sequence of letters between the
3563 terminators, but it seems that Perl recognizes any sequence of characters,
3564 though of course unknown POSIX names are subsequently rejected. Perl gives an
3565 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
3566 didn't consider this to be a POSIX class. Likewise for [:1234:].
3567
3568 The problem in trying to be exactly like Perl is in the handling of escapes. We
3569 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
3570 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
3571 below handles the special case of \], but does not try to do any other escape
3572 processing. This makes it different from Perl for cases such as [:l\ower:]
3573 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
3574 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
3575 I think.
3576
3577 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
3578 It seems that the appearance of a nested POSIX class supersedes an apparent
3579 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
3580 a digit.
3581
3582 In Perl, unescaped square brackets may also appear as part of class names. For
3583 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
3584 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
3585 seem right at all. PCRE does not allow closing square brackets in POSIX class
3586 names.
3587
3588 Arguments:
3589 ptr pointer to the initial [
3590 endptr where to return the end pointer
3591
3592 Returns: TRUE or FALSE
3593 */
3594
3595 static BOOL
3596 check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
3597 {
3598 pcre_uchar terminator; /* Don't combine these lines; the Solaris cc */
3599 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
3600 for (++ptr; *ptr != CHAR_NULL; ptr++)
3601 {
3602 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3603 ptr++;
3604 else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
3605 else
3606 {
3607 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3608 {
3609 *endptr = ptr;
3610 return TRUE;
3611 }
3612 if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
3613 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3614 ptr[1] == CHAR_EQUALS_SIGN) &&
3615 check_posix_syntax(ptr, endptr))
3616 return FALSE;
3617 }
3618 }
3619 return FALSE;
3620 }
3621
3622
3623
3624
3625 /*************************************************
3626 * Check POSIX class name *
3627 *************************************************/
3628
3629 /* This function is called to check the name given in a POSIX-style class entry
3630 such as [:alnum:].
3631
3632 Arguments:
3633 ptr points to the first letter
3634 len the length of the name
3635
3636 Returns: a value representing the name, or -1 if unknown
3637 */
3638
3639 static int
3640 check_posix_name(const pcre_uchar *ptr, int len)
3641 {
3642 const char *pn = posix_names;
3643 register int yield = 0;
3644 while (posix_name_lengths[yield] != 0)
3645 {
3646 if (len == posix_name_lengths[yield] &&
3647 STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
3648 pn += posix_name_lengths[yield] + 1;
3649 yield++;
3650 }
3651 return -1;
3652 }
3653
3654
3655 /*************************************************
3656 * Adjust OP_RECURSE items in repeated group *
3657 *************************************************/
3658
3659 /* OP_RECURSE items contain an offset from the start of the regex to the group
3660 that is referenced. This means that groups can be replicated for fixed
3661 repetition simply by copying (because the recursion is allowed to refer to
3662 earlier groups that are outside the current group). However, when a group is
3663 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
3664 inserted before it, after it has been compiled. This means that any OP_RECURSE
3665 items within it that refer to the group itself or any contained groups have to
3666 have their offsets adjusted. That one of the jobs of this function. Before it
3667 is called, the partially compiled regex must be temporarily terminated with
3668 OP_END.
3669
3670 This function has been extended with the possibility of forward references for
3671 recursions and subroutine calls. It must also check the list of such references
3672 for the group we are dealing with. If it finds that one of the recursions in
3673 the current group is on this list, it adjusts the offset in the list, not the
3674 value in the reference (which is a group number).
3675
3676 Arguments:
3677 group points to the start of the group
3678 adjust the amount by which the group is to be moved
3679 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3680 cd contains pointers to tables etc.
3681 save_hwm the hwm forward reference pointer at the start of the group
3682
3683 Returns: nothing
3684 */
3685
3686 static void
3687 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
3688 pcre_uchar *save_hwm)
3689 {
3690 pcre_uchar *ptr = group;
3691
3692 while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
3693 {
3694 int offset;
3695 pcre_uchar *hc;
3696
3697 /* See if this recursion is on the forward reference list. If so, adjust the
3698 reference. */
3699
3700 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
3701 {
3702 offset = (int)GET(hc, 0);
3703 if (cd->start_code + offset == ptr + 1)
3704 {
3705 PUT(hc, 0, offset + adjust);
3706 break;
3707 }
3708 }
3709
3710 /* Otherwise, adjust the recursion offset if it's after the start of this
3711 group. */
3712
3713 if (hc >= cd->hwm)
3714 {
3715 offset = (int)GET(ptr, 1);
3716 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
3717 }
3718
3719 ptr += 1 + LINK_SIZE;
3720 }
3721 }
3722
3723
3724
3725 /*************************************************
3726 * Insert an automatic callout point *
3727 *************************************************/
3728
3729 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
3730 callout points before each pattern item.
3731
3732 Arguments:
3733 code current code pointer
3734 ptr current pattern pointer
3735 cd pointers to tables etc
3736
3737 Returns: new code pointer
3738 */
3739
3740 static pcre_uchar *
3741 auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
3742 {
3743 *code++ = OP_CALLOUT;
3744 *code++ = 255;
3745 PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
3746 PUT(code, LINK_SIZE, 0); /* Default length */
3747 return code + 2 * LINK_SIZE;
3748 }
3749
3750
3751
3752 /*************************************************
3753 * Complete a callout item *
3754 *************************************************/
3755
3756 /* A callout item contains the length of the next item in the pattern, which
3757 we can't fill in till after we have reached the relevant point. This is used
3758 for both automatic and manual callouts.
3759
3760 Arguments:
3761 previous_callout points to previous callout item
3762 ptr current pattern pointer
3763 cd pointers to tables etc
3764
3765 Returns: nothing
3766 */
3767
3768 static void
3769 complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
3770 {
3771 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
3772 PUT(previous_callout, 2 + LINK_SIZE, length);
3773 }
3774
3775
3776
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It searches up the characters, looking for ranges of
characters in the "other" case. Each call returns the next one, updating the
start address. A character with multiple other cases is returned on its own
with a special return value.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        -1 when no more
               0 when a range is returned
              >0 the CASESET offset for char with multiple other cases
                in this case, ocptr contains the original
*/

static int
get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
  pcre_uint32 *odptr)
{
pcre_uint32 ch = *cptr;
pcre_uint32 first_other = 0;
pcre_uint32 expected;

/* Skip characters that are their own other case. Stop at the first one that
has a different other case; return at once for one that belongs to a caseless
set. */

while (ch <= d)
  {
  unsigned int caseset = UCD_CASESET(ch);

  if (caseset != 0)
    {
    *ocptr = ch;          /* Character that has the set */
    *cptr = ch + 1;       /* Rest of input range */
    return (int)caseset;
    }

  first_other = UCD_OTHERCASE(ch);
  if (first_other != ch) break;
  ch++;
  }

if (ch > d) return -1;    /* Reached end of range */

/* Extend the other-case range for as long as successive input characters map
to successive other-case characters. */

*ocptr = first_other;
expected = first_other + 1;
ch++;

while (ch <= d && UCD_OTHERCASE(ch) == expected)
  {
  expected++;
  ch++;
  }

*odptr = expected - 1;    /* End of othercase range */
*cptr = ch;               /* Rest of input range */
return 0;
}
#endif  /* SUPPORT_UCP */
3837
3838
3839
3840 /*************************************************
3841 * Add a character or range to a class *
3842 *************************************************/
3843
3844 /* This function packages up the logic of adding a character or range of
3845 characters to a class. The character values in the arguments will be within the
3846 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
3847 mutually recursive with the function immediately below.
3848
3849 Arguments:
3850 classbits the bit map for characters < 256
3851 uchardptr points to the pointer for extra data
3852 options the options word
3853 cd contains pointers to tables etc.
3854 start start of range character
3855 end end of range character
3856
3857 Returns: the number of < 256 characters added
3858 the pointer to extra data is updated
3859 */
3860
3861 static int
3862 add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
3863 compile_data *cd, pcre_uint32 start, pcre_uint32 end)
3864 {
3865 pcre_uint32 c;
3866 int n8 = 0;
3867
3868 /* If caseless matching is required, scan the range and process alternate
3869 cases. In Unicode, there are 8-bit characters that have alternate cases that
3870 are greater than 255 and vice-versa. Sometimes we can just extend the original
3871 range. */
3872
3873 if ((options & PCRE_CASELESS) != 0)
3874 {
3875 #ifdef SUPPORT_UCP
3876 if ((options & PCRE_UTF8) != 0)
3877 {
3878 int rc;
3879 pcre_uint32 oc, od;
3880
3881 options &= ~PCRE_CASELESS; /* Remove for recursive calls */
3882 c = start;
3883
3884 while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
3885 {
3886 /* Handle a single character that has more than one other case. */
3887
3888 if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
3889 PRIV(ucd_caseless_sets) + rc, oc);
3890
3891 /* Do nothing if the other case range is within the original range. */
3892
3893 else if (oc >= start && od <= end) continue;
3894
3895 /* Extend the original range if there is overlap, noting that if oc < c, we
3896 can't have od > end because a subrange is always shorter than the basic
3897 range. Otherwise, use a recursive call to add the additional range. */
3898
3899 else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
3900 else if (od > end && oc <= end + 1) end = od; /* Extend upwards */
3901 else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
3902 }
3903 }
3904 else
3905 #endif /* SUPPORT_UCP */
3906
3907 /* Not UTF-mode, or no UCP */
3908
3909 for (c = start; c <= end && c < 256; c++)
3910 {
3911 SETBIT(classbits, cd->fcc[c]);
3912 n8++;
3913 }
3914 }
3915
3916 /* Now handle the original range. Adjust the final value according to the bit
3917 length - this means that the same lists of (e.g.) horizontal spaces can be used
3918 in all cases. */
3919
3920 #if defined COMPILE_PCRE8
3921 #ifdef SUPPORT_UTF
3922 if ((options & PCRE_UTF8) == 0)
3923 #endif
3924 if (end > 0xff) end = 0xff;
3925
3926 #elif defined COMPILE_PCRE16
3927 #ifdef SUPPORT_UTF
3928 if ((options & PCRE_UTF16) == 0)
3929 #endif
3930 if (end > 0xffff) end = 0xffff;
3931
3932 #endif /* COMPILE_PCRE[8|16] */
3933
3934 /* If all characters are less than 256, use the bit map. Otherwise use extra
3935 data. */
3936
3937 if (end < 0x100)
3938 {
3939 for (c = start; c <= end; c++)
3940 {
3941 n8++;
3942 SETBIT(classbits, c);
3943 }
3944 }
3945
3946 else
3947 {
3948 pcre_uchar *uchardata = *uchardptr;
3949
3950 #ifdef SUPPORT_UTF
3951 if ((options & PCRE_UTF8) != 0) /* All UTFs use the same flag bit */
3952 {
3953 if (start < end)
3954 {
3955 *uchardata++ = XCL_RANGE;
3956 uchardata += PRIV(ord2utf)(start, uchardata);
3957 uchardata += PRIV(ord2utf)(end, uchardata);
3958 }
3959 else if (start == end)
3960 {
3961 *uchardata++ = XCL_SINGLE;
3962 uchardata += PRIV(ord2utf)(start, uchardata);
3963 }
3964 }
3965 else
3966 #endif /* SUPPORT_UTF */
3967
3968 /* Without UTF support, character values are constrained by the bit length,
3969 and can only be > 256 for 16-bit and 32-bit libraries. */
3970
3971 #ifdef COMPILE_PCRE8
3972 {}
3973 #else
3974 if (start < end)
3975 {
3976 *uchardata++ = XCL_RANGE;
3977 *uchardata++ = start;
3978 *uchardata++ = end;
3979 }
3980 else if (start == end)
3981 {
3982 *uchardata++ = XCL_SINGLE;
3983 *uchardata++ = start;
3984 }
3985 #endif
3986
3987 *uchardptr = uchardata; /* Update extra data pointer */
3988 }
3989
3990 return n8; /* Number of 8-bit characters */
3991 }
3992
3993
3994
3995
3996 /*************************************************
3997 * Add a list of characters to a class *
3998 *************************************************/
3999
4000 /* This function is used for adding a list of case-equivalent characters to a
4001 class, and also for adding a list of horizontal or vertical whitespace. If the
4002 list is in order (which it should be), ranges of characters are detected and
4003 handled appropriately. This function is mutually recursive with the function
4004 above.
4005
4006 Arguments:
4007 classbits the bit map for characters < 256
4008 uchardptr points to the pointer for extra data
4009 options the options word
4010 cd contains pointers to tables etc.
4011 p points to row of 32-bit values, terminated by NOTACHAR
4012 except character to omit; this is used when adding lists of
4013 case-equivalent characters to avoid including the one we
4014 already know about
4015
4016 Returns: the number of < 256 characters added
4017 the pointer to extra data is updated
4018 */
4019
4020 static int
4021 add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4022 compile_data *cd, const pcre_uint32 *p, unsigned int except)
4023 {
4024 int n8 = 0;
4025 while (p[0] < NOTACHAR)
4026 {
4027 int n = 0;
4028 if (p[0] != except)
4029 {
4030 while(p[n+1] == p[0] + n + 1) n++;
4031 n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
4032 }
4033 p += n + 1;
4034 }
4035 return n8;
4036 }
4037
4038
4039
4040 /*************************************************
4041 * Add characters not in a list to a class *
4042 *************************************************/
4043
4044 /* This function is used for adding the complement of a list of horizontal or
4045 vertical whitespace to a class. The list must be in order.
4046
4047 Arguments:
4048 classbits the bit map for characters < 256
4049 uchardptr points to the pointer for extra data
4050 options the options word
4051 cd contains pointers to tables etc.
4052 p points to row of 32-bit values, terminated by NOTACHAR
4053
4054 Returns: the number of < 256 characters added
4055 the pointer to extra data is updated
4056 */
4057
4058 static int
4059 add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
4060 int options, compile_data *cd, const pcre_uint32 *p)
4061 {
4062 BOOL utf = (options & PCRE_UTF8) != 0;
4063 int n8 = 0;
4064 if (p[0] > 0)
4065 n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
4066 while (p[0] < NOTACHAR)
4067 {
4068 while (p[1] == p[0] + 1) p++;
4069 n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
4070 (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
4071 p++;
4072 }
4073 return n8;
4074 }
4075
4076
4077
4078 /*************************************************
4079 * Compile one branch *
4080 *************************************************/
4081
4082 /* Scan the pattern, compiling it into a vector. If the options are
4083 changed during the branch, the pointer is used to change the external options
4084 bits. This function is used during the pre-compile phase when we are trying
4085 to find out the amount of memory needed, as well as during the real compile
4086 phase. The value of lengthptr distinguishes the two phases.
4087
4088 Arguments:
4089 optionsptr pointer to the option bits
4090 codeptr points to the pointer to the current code point
4091 ptrptr points to the current pattern pointer
4092 errorcodeptr points to error code variable
4093 firstcharptr place to put the first required character
4094 firstcharflagsptr place to put the first character flags, or a negative number
4095 reqcharptr place to put the last required character
4096 reqcharflagsptr place to put the last required character flags, or a negative number
4097 bcptr points to current branch chain
4098 cond_depth conditional nesting depth
4099 cd contains pointers to tables etc.
4100 lengthptr NULL during the real compile phase
4101 points to length accumulator during pre-compile phase
4102
4103 Returns: TRUE on success
4104 FALSE, with *errorcodeptr set non-zero on error
4105 */
4106
4107 static BOOL
4108 compile_branch(int *optionsptr, pcre_uchar **codeptr,
4109 const pcre_uchar **ptrptr, int *errorcodeptr,
4110 pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
4111 pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
4112 branch_chain *bcptr, int cond_depth,
4113 compile_data *cd, int *lengthptr)
4114 {
4115 int repeat_type, op_type;
4116 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
4117 int bravalue = 0;
4118 int greedy_default, greedy_non_default;
4119 pcre_uint32 firstchar, reqchar;
4120 pcre_int32 firstcharflags, reqcharflags;
4121 pcre_uint32 zeroreqchar, zerofirstchar;
4122 pcre_int32 zeroreqcharflags, zerofirstcharflags;
4123 pcre_int32 req_caseopt, reqvary, tempreqvary;
4124 int options = *optionsptr; /* May change dynamically */
4125 int after_manual_callout = 0;
4126 int length_prevgroup = 0;
4127 register pcre_uint32 c;
4128 int escape;
4129 register pcre_uchar *code = *codeptr;
4130 pcre_uchar *last_code = code;
4131 pcre_uchar *orig_code = code;
4132 pcre_uchar *tempcode;
4133 BOOL inescq = FALSE;
4134 BOOL groupsetfirstchar = FALSE;
4135 const pcre_uchar *ptr = *ptrptr;
4136 const pcre_uchar *tempptr;
4137 const pcre_uchar *nestptr = NULL;
4138 pcre_uchar *previous = NULL;
4139 pcre_uchar *previous_callout = NULL;
4140 pcre_uchar *save_hwm = NULL;
4141 pcre_uint8 classbits[32];
4142
4143 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
4144 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
4145 dynamically as we process the pattern. */
4146
4147 #ifdef SUPPORT_UTF
4148 /* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
4149 BOOL utf = (options & PCRE_UTF8) != 0;
4150 #ifndef COMPILE_PCRE32
4151 pcre_uchar utf_chars[6];
4152 #endif
4153 #else
4154 BOOL utf = FALSE;
4155 #endif
4156
4157 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
4158 class_uchardata always so that it can be passed to add_to_class() always,
4159 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
4160 alternative calls for the different cases. */
4161
4162 pcre_uchar *class_uchardata;
4163 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4164 BOOL xclass;
4165 pcre_uchar *class_uchardata_base;
4166 #endif
4167
4168 #ifdef PCRE_DEBUG
4169 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
4170 #endif
4171
4172 /* Set up the default and non-default settings for greediness */
4173
4174 greedy_default = ((options & PCRE_UNGREEDY) != 0);
4175 greedy_non_default = greedy_default ^ 1;
4176
4177 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
4178 matching encountered yet". It gets changed to REQ_NONE if we hit something that
4179 matches a non-fixed char first char; reqchar just remains unset if we never
4180 find one.
4181
4182 When we hit a repeat whose minimum is zero, we may have to adjust these values
4183 to take the zero repeat into account. This is implemented by setting them to
4184 zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
4185 item types that can be repeated set these backoff variables appropriately. */
4186
4187 firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
4188 firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
4189
4190 /* The variable req_caseopt contains either the REQ_CASELESS value
4191 or zero, according to the current setting of the caseless flag. The
4192 REQ_CASELESS leaves the lower 28 bit empty. It is added into the
4193 firstchar or reqchar variables to record the case status of the
4194 value. This is used only for ASCII characters. */
4195
4196 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
4197
4198 /* Switch on next character until the end of the branch */
4199
4200 for (;; ptr++)
4201 {
4202 BOOL negate_class;
4203 BOOL should_flip_negation;
4204 BOOL possessive_quantifier;
4205 BOOL is_quantifier;
4206 BOOL is_recurse;
4207 BOOL reset_bracount;
4208 int class_has_8bitchar;
4209 int class_one_char;
4210 int newoptions;
4211 int recno;
4212 int refsign;
4213 int skipbytes;
4214 pcre_uint32 subreqchar, subfirstchar;
4215 pcre_int32 subreqcharflags, subfirstcharflags;
4216 int terminator;
4217 unsigned int mclength;
4218 unsigned int tempbracount;
4219 pcre_uint32 ec;
4220 pcre_uchar mcbuffer[8];
4221
4222 /* Get next character in the pattern */
4223
4224 c = *ptr;
4225
4226 /* If we are at the end of a nested substitution, revert to the outer level
4227 string. Nesting only happens one level deep. */
4228
4229 if (c == CHAR_NULL && nestptr != NULL)
4230 {
4231 ptr = nestptr;
4232 nestptr = NULL;
4233 c = *ptr;
4234 }
4235
4236 /* If we are in the pre-compile phase, accumulate the length used for the
4237 previous cycle of this loop. */
4238
4239 if (lengthptr != NULL)
4240 {
4241 #ifdef PCRE_DEBUG
4242 if (code > cd->hwm) cd->hwm = code; /* High water info */
4243 #endif
4244 if (code > cd->start_workspace + cd->workspace_size -
4245 WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
4246 {
4247 *errorcodeptr = ERR52;
4248 goto FAILED;
4249 }
4250
4251 /* There is at least one situation where code goes backwards: this is the
4252 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
4253 the class is simply eliminated. However, it is created first, so we have to
4254 allow memory for it. Therefore, don't ever reduce the length at this point.
4255 */
4256
4257 if (code < last_code) code = last_code;
4258
4259 /* Paranoid check for integer overflow */
4260
4261 if (OFLOW_MAX - *lengthptr < code - last_code)
4262 {
4263 *errorcodeptr = ERR20;
4264 goto FAILED;
4265 }
4266
4267 *lengthptr += (int)(code - last_code);
4268 DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
4269 (int)(code - last_code), c, c));
4270
4271 /* If "previous" is set and it is not at the start of the work space, move
4272 it back to there, in order to avoid filling up the work space. Otherwise,
4273 if "previous" is NULL, reset the current code pointer to the start. */
4274
4275 if (previous != NULL)
4276 {
4277 if (previous > orig_code)
4278 {
4279 memmove(orig_code, previous, IN_UCHARS(code - previous));
4280 code -= previous - orig_code;
4281 previous = orig_code;
4282 }
4283 }
4284 else code = orig_code;
4285
4286 /* Remember where this code item starts so we can pick up the length
4287 next time round. */
4288
4289 last_code = code;
4290 }
4291
4292 /* In the real compile phase, just check the workspace used by the forward
4293 reference list. */
4294
4295 else if (cd->hwm > cd->start_workspace + cd->workspace_size -
4296 WORK_SIZE_SAFETY_MARGIN)
4297 {
4298 *errorcodeptr = ERR52;
4299 goto FAILED;
4300 }
4301
4302 /* If in \Q...\E, check for the end; if not, we have a literal */
4303
4304 if (inescq && c != CHAR_NULL)
4305 {
4306 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4307 {
4308 inescq = FALSE;
4309 ptr++;
4310 continue;
4311 }
4312 else
4313 {
4314 if (previous_callout != NULL)
4315 {
4316 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
4317 complete_callout(previous_callout, ptr, cd);
4318 previous_callout = NULL;
4319 }
4320 if ((options & PCRE_AUTO_CALLOUT) != 0)
4321 {
4322 previous_callout = code;
4323 code = auto_callout(code, ptr, cd);
4324 }
4325 goto NORMAL_CHAR;
4326 }
4327 }
4328
4329 is_quantifier =
4330 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4331 (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4332
4333 /* Fill in length of a previous callout, except when the next thing is a
4334 quantifier or when processing a property substitution string in UCP mode. */
4335
4336 if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4337 after_manual_callout-- <= 0)
4338 {
4339 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
4340 complete_callout(previous_callout, ptr, cd);
4341 previous_callout = NULL;
4342 }
4343
4344 /* In extended mode, skip white space and comments. */
4345
4346 if ((options & PCRE_EXTENDED) != 0)
4347 {
4348 if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;
4349 if (c == CHAR_NUMBER_SIGN)
4350 {
4351 ptr++;
4352 while (*ptr != CHAR_NULL)
4353 {
4354 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
4355 ptr++;
4356 #ifdef SUPPORT_UTF
4357 if (utf) FORWARDCHAR(ptr);
4358 #endif
4359 }
4360 if (*ptr != CHAR_NULL) continue;
4361
4362 /* Else fall through to handle end of string */
4363 c = 0;
4364 }
4365 }
4366
4367 /* No auto callout for quantifiers, or while processing property strings that
4368 are substituted for \w etc in UCP mode. */
4369
4370 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
4371 {
4372 previous_callout = code;
4373 code = auto_callout(code, ptr, cd);
4374 }
4375
4376 switch(c)
4377 {
4378 /* ===================================================================*/
4379 case 0: /* The branch terminates at string end */
4380 case CHAR_VERTICAL_LINE: /* or | or ) */
4381 case CHAR_RIGHT_PARENTHESIS:
4382 *firstcharptr = firstchar;
4383 *firstcharflagsptr = firstcharflags;
4384 *reqcharptr = reqchar;
4385 *reqcharflagsptr = reqcharflags;
4386 *codeptr = code;
4387 *ptrptr = ptr;
4388 if (lengthptr != NULL)
4389 {
4390 if (OFLOW_MAX - *lengthptr < code - last_code)
4391 {
4392 *errorcodeptr = ERR20;
4393 goto FAILED;
4394 }
4395 *lengthptr += (int)(code - last_code); /* To include callout length */
4396 DPRINTF((">> end branch\n"));
4397 }
4398 return TRUE;
4399
4400
4401 /* ===================================================================*/
4402 /* Handle single-character metacharacters. In multiline mode, ^ disables
4403 the setting of any following char as a first character. */
4404
4405 case CHAR_CIRCUMFLEX_ACCENT:
4406 previous = NULL;
4407 if ((options & PCRE_MULTILINE) != 0)
4408 {
4409 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4410 *code++ = OP_CIRCM;
4411 }
4412 else *code++ = OP_CIRC;
4413 break;
4414
4415 case CHAR_DOLLAR_SIGN:
4416 previous = NULL;
4417 *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
4418 break;
4419
4420 /* There can never be a first char if '.' is first, whatever happens about
4421 repeats. The value of reqchar doesn't change either. */
4422
4423 case CHAR_DOT:
4424 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4425 zerofirstchar = firstchar;
4426 zerofirstcharflags = firstcharflags;
4427 zeroreqchar = reqchar;
4428 zeroreqcharflags = reqcharflags;
4429 previous = code;
4430 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
4431 break;
4432
4433
4434 /* ===================================================================*/
4435 /* Character classes. If the included characters are all < 256, we build a
4436 32-byte bitmap of the permitted characters, except in the special case
4437 where there is only one such character. For negated classes, we build the
4438 map as usual, then invert it at the end. However, we use a different opcode
4439 so that data characters > 255 can be handled correctly.
4440
4441 If the class contains characters outside the 0-255 range, a different
4442 opcode is compiled. It may optionally have a bit map for characters < 256,
4443 but those above are explicitly listed afterwards. A flag byte tells
4444 whether the bitmap is present, and whether this is a negated class or not.
4445
4446 In JavaScript compatibility mode, an isolated ']' causes an error. In
4447 default (Perl) mode, it is treated as a data character. */
4448
4449 case CHAR_RIGHT_SQUARE_BRACKET:
4450 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4451 {
4452 *errorcodeptr = ERR64;
4453 goto FAILED;
4454 }
4455 goto NORMAL_CHAR;
4456
4457 case CHAR_LEFT_SQUARE_BRACKET:
4458 previous = code;
4459
4460 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
4461 they are encountered at the top level, so we'll do that too. */
4462
4463 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4464 ptr[1] == CHAR_EQUALS_SIGN) &&
4465 check_posix_syntax(ptr, &tempptr))
4466 {
4467 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
4468 goto FAILED;
4469 }
4470
4471 /* If the first character is '^', set the negation flag and skip it. Also,
4472 if the first few characters (either before or after ^) are \Q\E or \E we
4473 skip them too. This makes for compatibility with Perl. */
4474
4475 negate_class = FALSE;
4476 for (;;)
4477 {
4478 c = *(++ptr);
4479 if (c == CHAR_BACKSLASH)
4480 {
4481 if (ptr[1] == CHAR_E)
4482 ptr++;
4483 else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4484 ptr += 3;
4485 else
4486 break;
4487 }
4488 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4489 negate_class = TRUE;
4490 else break;
4491 }
4492
4493 /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
4494 an initial ']' is taken as a data character -- the code below handles
4495 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
4496 [^] must match any character, so generate OP_ALLANY. */
4497
4498 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4499 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4500 {
4501 *code++ = negate_class? OP_ALLANY : OP_FAIL;
4502 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4503 zerofirstchar = firstchar;
4504 zerofirstcharflags = firstcharflags;
4505 break;
4506 }
4507
4508 /* If a class contains a negative special such as \S, we need to flip the
4509 negation flag at the end, so that support for characters > 255 works
4510 correctly (they are all included in the class). */
4511
4512 should_flip_negation = FALSE;
4513
4514 /* For optimization purposes, we track some properties of the class:
4515 class_has_8bitchar will be non-zero if the class contains at least one <
4516 256 character; class_one_char will be 1 if the class contains just one
4517 character. */
4518
4519 class_has_8bitchar = 0;
4520 class_one_char = 0;
4521
4522 /* Initialize the 32-char bit map to all zeros. We build the map in a
4523 temporary bit of memory, in case the class contains fewer than two
4524 8-bit characters because in that case the compiled code doesn't use the bit
4525 map. */
4526
4527 memset(classbits, 0, 32 * sizeof(pcre_uint8));
4528
4529 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4530 xclass = FALSE;
4531 class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
4532 class_uchardata_base = class_uchardata; /* Save the start */
4533 #endif
4534
4535 /* Process characters until ] is reached. By writing this as a "do" it
4536 means that an initial ] is taken as a data character. At the start of the
4537 loop, c contains the first byte of the character. */
4538
4539 if (c != CHAR_NULL) do
4540 {
4541 const pcre_uchar *oldptr;
4542
4543 #ifdef SUPPORT_UTF
4544 if (utf && HAS_EXTRALEN(c))
4545 { /* Braces are required because the */
4546 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
4547 }
4548 #endif
4549
4550 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4551 /* In the pre-compile phase, accumulate the length of any extra
4552 data and reset the pointer. This is so that very large classes that
4553 contain a zillion > 255 characters no longer overwrite the work space
4554 (which is on the stack). We have to remember that there was XCLASS data,
4555 however. */
4556
4557 if (lengthptr != NULL && class_uchardata > class_uchardata_base)
4558 {
4559 xclass = TRUE;
4560 *lengthptr += class_uchardata - class_uchardata_base;
4561 class_uchardata = class_uchardata_base;
4562 }
4563 #endif
4564
4565 /* Inside \Q...\E everything is literal except \E */
4566
4567 if (inescq)
4568 {
4569 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
4570 {
4571 inescq = FALSE; /* Reset literal state */
4572 ptr++; /* Skip the 'E' */
4573 continue; /* Carry on with next */
4574 }
4575 goto CHECK_RANGE; /* Could be range if \E follows */
4576 }
4577
4578 /* Handle POSIX class names. Perl allows a negation extension of the
4579 form [:^name:]. A square bracket that doesn't match the syntax is
4580 treated as a literal. We also recognize the POSIX constructions
4581 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
4582 5.6 and 5.8 do. */
4583
4584 if (c == CHAR_LEFT_SQUARE_BRACKET &&
4585 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4586 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
4587 {
4588 BOOL local_negate = FALSE;
4589 int posix_class, taboffset, tabopt;
4590 register const pcre_uint8 *cbits = cd->cbits;
4591 pcre_uint8 pbits[32];
4592
4593 if (ptr[1] != CHAR_COLON)
4594 {
4595 *errorcodeptr = ERR31;
4596 goto FAILED;
4597 }
4598
4599 ptr += 2;
4600 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
4601 {
4602 local_negate = TRUE;
4603 should_flip_negation = TRUE; /* Note negative special */
4604 ptr++;
4605 }
4606
4607 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
4608 if (posix_class < 0)
4609 {
4610 *errorcodeptr = ERR30;
4611 goto FAILED;
4612 }
4613
4614 /* If matching is caseless, upper and lower are converted to
4615 alpha. This relies on the fact that the class table starts with
4616 alpha, lower, upper as the first 3 entries. */
4617
4618 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
4619 posix_class = 0;
4620
4621 /* When PCRE_UCP is set, some of the POSIX classes are converted to
4622 different escape sequences that use Unicode properties. */
4623
4624 #ifdef SUPPORT_UCP
4625 if ((options & PCRE_UCP) != 0)
4626 {
4627 int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
4628 if (posix_substitutes[pc] != NULL)
4629 {
4630 nestptr = tempptr + 1;
4631 ptr = posix_substitutes[pc] - 1;
4632 continue;
4633 }
4634 }
4635 #endif
4636 /* In the non-UCP case, we build the bit map for the POSIX class in a
4637 chunk of local store because we may be adding and subtracting from it,
4638 and we don't want to subtract bits that may be in the main map already.
4639 At the end we or the result into the bit map that is being built. */
4640
4641 posix_class *= 3;
4642
4643 /* Copy in the first table (always present) */
4644
4645 memcpy(pbits, cbits + posix_class_maps[posix_class],
4646 32 * sizeof(pcre_uint8));
4647
4648 /* If there is a second table, add or remove it as required. */
4649
4650 taboffset = posix_class_maps[posix_class + 1];
4651 tabopt = posix_class_maps[posix_class + 2];
4652
4653 if (taboffset >= 0)
4654 {
4655 if (tabopt >= 0)
4656 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
4657 else
4658 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
4659 }
4660
4661 /* Now see if we need to remove any special characters. An option
4662 value of 1 removes vertical space and 2 removes underscore. */
4663
4664 if (tabopt < 0) tabopt = -tabopt;
4665 if (tabopt == 1) pbits[1] &= ~0x3c;
4666 else if (tabopt == 2) pbits[11] &= 0x7f;
4667
4668 /* Add the POSIX table or its complement into the main table that is
4669 being built and we are done. */
4670
4671 if (local_negate)
4672 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
4673 else
4674 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
4675
4676 ptr = tempptr + 1;
4677 /* Every class contains at least one < 256 character. */
4678 class_has_8bitchar = 1;
4679 /* Every class contains at least two characters. */
4680 class_one_char = 2;
4681 continue; /* End of POSIX syntax handling */
4682 }
4683
4684 /* Backslash may introduce a single character, or it may introduce one
4685 of the specials, which just set a flag. The sequence \b is a special
4686 case. Inside a class (and only there) it is treated as backspace. We
4687 assume that other escapes have more than one character in them, so
4688 speculatively set both class_has_8bitchar and class_one_char bigger
4689 than one. Unrecognized escapes fall through and are either treated
4690 as literal characters (by default), or are faulted if
4691 PCRE_EXTRA is set. */
4692
4693 if (c == CHAR_BACKSLASH)
4694 {
4695 escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
4696 TRUE);
4697 if (*errorcodeptr != 0) goto FAILED;
4698 if (escape == 0) c = ec;
4699 else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
4700 else if (escape == ESC_N) /* \N is not supported in a class */
4701 {
4702 *errorcodeptr = ERR71;
4703 goto FAILED;
4704 }
4705 else if (escape == ESC_Q) /* Handle start of quoted string */
4706 {
4707 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4708 {
4709 ptr += 2; /* avoid empty string */
4710 }
4711 else inescq = TRUE;
4712 continue;
4713 }
4714 else if (escape == ESC_E) continue; /* Ignore orphan \E */
4715
4716 else
4717 {
4718 register const pcre_uint8 *cbits = cd->cbits;
4719 /* Every class contains at least two < 256 characters. */
4720 class_has_8bitchar++;
4721 /* Every class contains at least two characters. */
4722 class_one_char += 2;
4723
4724 switch (escape)
4725 {
4726 #ifdef SUPPORT_UCP
4727 case ESC_du: /* These are the values given for \d etc */
4728 case ESC_DU: /* when PCRE_UCP is set. We replace the */
4729 case ESC_wu: /* escape sequence with an appropriate \p */
4730 case ESC_WU: /* or \P to test Unicode properties instead */
4731 case ESC_su: /* of the default ASCII testing. */
4732 case ESC_SU:
4733 nestptr = ptr;
4734 ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */
4735 class_has_8bitchar--; /* Undo! */
4736 continue;
4737 #endif
4738 case ESC_d:
4739 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
4740 continue;
4741
4742 case ESC_D:
4743 should_flip_negation = TRUE;
4744 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
4745 continue;
4746
4747 case ESC_w:
4748 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
4749 continue;
4750
4751 case ESC_W:
4752 should_flip_negation = TRUE;
4753 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
4754 continue;
4755
4756 /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
4757 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
4758 previously set by something earlier in the character class.
4759 Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
4760 we could just adjust the appropriate bit. From PCRE 8.34 we no
4761 longer treat \s and \S specially. */
4762
4763 case ESC_s:
4764 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
4765 continue;
4766
4767 case ESC_S:
4768 should_flip_negation = TRUE;
4769 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
4770 continue;
4771
4772 /* The rest apply in both UCP and non-UCP cases. */
4773
4774 case ESC_h:
4775 (void)add_list_to_class(classbits, &class_uchardata, options, cd,
4776 PRIV(hspace_list), NOTACHAR);
4777 continue;
4778
4779 case ESC_H:
4780 (void)add_not_list_to_class(classbits, &class_uchardata, options,
4781 cd, PRIV(hspace_list));
4782 continue;
4783
4784 case ESC_v:
4785 (void)add_list_to_class(classbits, &class_uchardata, options, cd,
4786 PRIV(vspace_list), NOTACHAR);
4787 continue;
4788
4789 case ESC_V:
4790 (void)add_not_list_to_class(classbits, &class_uchardata, options,
4791 cd, PRIV(vspace_list));
4792 continue;
4793
4794 #ifdef SUPPORT_UCP
4795 case ESC_p:
4796 case ESC_P:
4797 {
4798 BOOL negated;
4799 unsigned int ptype = 0, pdata = 0;
4800 if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
4801 goto FAILED;
4802 *class_uchardata++ = ((escape == ESC_p) != negated)?
4803 XCL_PROP : XCL_NOTPROP;
4804 *class_uchardata++ = ptype;
4805 *class_uchardata++ = pdata;
4806 class_has_8bitchar--; /* Undo! */
4807 continue;
4808 }
4809 #endif
4810 /* Unrecognized escapes are faulted if PCRE is running in its
4811 strict mode. By default, for compatibility with Perl, they are
4812 treated as literals. */
4813
4814 default:
4815 if ((options & PCRE_EXTRA) != 0)
4816 {
4817 *errorcodeptr = ERR7;
4818 goto FAILED;
4819 }
4820 class_has_8bitchar--; /* Undo the speculative increase. */
4821 class_one_char -= 2; /* Undo the speculative increase. */
4822 c = *ptr; /* Get the final character and fall through */
4823 break;
4824 }
4825 }
4826
4827 /* Fall through if the escape just defined a single character (c >= 0).
4828 This may be greater than 256. */
4829
4830 escape = 0;
4831
4832 } /* End of backslash handling */
4833
4834 /* A character may be followed by '-' to form a range. However, Perl does
4835 not permit ']' to be the end of the range. A '-' character at the end is
4836 treated as a literal. Perl ignores orphaned \E sequences entirely. The
4837 code for handling \Q and \E is messy. */
4838
4839 CHECK_RANGE:
4840 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4841 {
4842 inescq = FALSE;
4843 ptr += 2;
4844 }
4845 oldptr = ptr;
4846
4847 /* Remember if \r or \n were explicitly used */
4848
4849 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4850
4851 /* Check for range */
4852
4853 if (!inescq && ptr[1] == CHAR_MINUS)
4854 {
4855 pcre_uint32 d;
4856 ptr += 2;
4857 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
4858
4859 /* If we hit \Q (not followed by \E) at this point, go into escaped
4860 mode. */
4861
4862 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
4863 {
4864 ptr += 2;
4865 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4866 { ptr += 2; continue; }
4867 inescq = TRUE;
4868 break;
4869 }
4870
4871 /* Minus (hyphen) at the end of a class is treated as a literal, so put
4872 back the pointer and jump to handle the character that preceded it. */
4873
4874 if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
4875 {
4876 ptr = oldptr;
4877 goto CLASS_SINGLE_CHARACTER;
4878 }
4879
4880 /* Otherwise, we have a potential range; pick up the next character */
4881
4882 #ifdef SUPPORT_UTF
4883 if (utf)
4884 { /* Braces are required because the */
4885 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
4886 }
4887 else
4888 #endif
4889 d = *ptr; /* Not UTF-8 mode */
4890
4891 /* The second part of a range can be a single-character escape, but
4892 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
4893 in such circumstances. */
4894
4895 if (!inescq && d == CHAR_BACKSLASH)
4896 {
4897 int descape;
4898 descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
4899 if (*errorcodeptr != 0) goto FAILED;
4900
4901 /* \b is backspace; any other special means the '-' was literal. */
4902
4903 if (descape != 0)
4904 {
4905 if (descape == ESC_b) d = CHAR_BS; else
4906 {
4907 ptr = oldptr;
4908 goto CLASS_SINGLE_CHARACTER; /* A few lines below */
4909 }
4910 }
4911 }
4912
4913 /* Check that the two values are in the correct order. Optimize
4914 one-character ranges. */
4915
4916 if (d < c)
4917 {
4918 *errorcodeptr = ERR8;
4919 goto FAILED;
4920 }
4921 if (d == c) goto CLASS_SINGLE_CHARACTER; /* A few lines below */
4922
4923 /* We have found a character range, so single character optimizations
4924 cannot be done anymore. Any value greater than 1 indicates that there
4925 is more than one character. */
4926
4927 class_one_char = 2;
4928
4929 /* Remember an explicit \r or \n, and add the range to the class. */
4930
4931 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4932
4933 class_has_8bitchar +=
4934 add_to_class(classbits, &class_uchardata, options, cd, c, d);
4935
4936 continue; /* Go get the next char in the class */
4937 }
4938
4939 /* Handle a single character - we can get here for a normal non-escape
4940 char, or after \ that introduces a single character or for an apparent
4941 range that isn't. Only the value 1 matters for class_one_char, so don't
4942 increase it if it is already 2 or more ... just in case there's a class
4943 with a zillion characters in it. */
4944
4945 CLASS_SINGLE_CHARACTER:
4946 if (class_one_char < 2) class_one_char++;
4947
4948 /* If class_one_char is 1, we have the first single character in the
4949 class, and there have been no prior ranges, or XCLASS items generated by
4950 escapes. If this is the final character in the class, we can optimize by
4951 turning the item into a 1-character OP_CHAR[I] if it's positive, or
4952 OP_NOT[I] if it's negative. In the positive case, it can cause firstchar
4953 to be set. Otherwise, there can be no first char if this item is first,
4954 whatever repeat count may follow. In the case of reqchar, save the
4955 previous value for reinstating. */
4956
4957 if (class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
4958 {
4959 ptr++;
4960 zeroreqchar = reqchar;
4961 zeroreqcharflags = reqcharflags;
4962
4963 if (negate_class)
4964 {
4965 #ifdef SUPPORT_UCP
4966 int d;
4967 #endif
4968 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4969 zerofirstchar = firstchar;
4970 zerofirstcharflags = firstcharflags;
4971
4972 /* For caseless UTF-8 mode when UCP support is available, check
4973 whether this character has more than one other case. If so, generate
4974 a special OP_NOTPROP item instead of OP_NOTI. */
4975
4976 #ifdef SUPPORT_UCP
4977 if (utf && (options & PCRE_CASELESS) != 0 &&
4978 (d = UCD_CASESET(c)) != 0)
4979 {
4980 *code++ = OP_NOTPROP;
4981 *code++ = PT_CLIST;
4982 *code++ = d;
4983 }
4984 else
4985 #endif
4986 /* Char has only one other case, or UCP not available */
4987
4988 {
4989 *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4990 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
4991 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4992 code += PRIV(ord2utf)(c, code);
4993 else
4994 #endif
4995 *code++ = c;
4996 }
4997
4998 /* We are finished with this character class */
4999
5000 goto END_CLASS;
5001 }
5002
5003 /* For a single, positive character, get the value into mcbuffer, and
5004 then we can handle this with the normal one-character code. */
5005
5006 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5007 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5008 mclength = PRIV(ord2utf)(c, mcbuffer);
5009 else
5010 #endif
5011 {
5012 mcbuffer[0] = c;
5013 mclength = 1;
5014 }
5015 goto ONE_CHAR;
5016 } /* End of 1-char optimization */
5017
5018 /* There is more than one character in the class, or an XCLASS item
5019 has been generated. Add this character to the class. */
5020
5021 class_has_8bitchar +=
5022 add_to_class(classbits, &class_uchardata, options, cd, c, c);
5023 }
5024
5025 /* Loop until ']' reached. This "while" is the end of the "do" far above.
5026 If we are at the end of an internal nested string, revert to the outer
5027 string. */
5028
5029 while (((c = *(++ptr)) != CHAR_NULL ||
5030 (nestptr != NULL &&
5031 (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
5032 (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
5033
5034 /* Check for missing terminating ']' */
5035
5036 if (c == CHAR_NULL)
5037 {
5038 *errorcodeptr = ERR6;
5039 goto FAILED;
5040 }
5041
5042 /* We will need an XCLASS if data has been placed in class_uchardata. In
5043 the second phase this is a sufficient test. However, in the pre-compile
5044 phase, class_uchardata gets emptied to prevent workspace overflow, so
5045 only if the very last character in the class needs XCLASS will it contain
5046 anything at this point. For this reason, xclass gets set TRUE above when
5047 class_uchardata is emptied, and that's why this code is the way it is here
5048 instead of just doing a test on class_uchardata below. */
5049
5050 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5051 if (class_uchardata > class_uchardata_base) xclass = TRUE;
5052 #endif
5053
5054 /* If this is the first thing in the branch, there can be no first char
5055 setting, whatever the repeat count. Any reqchar setting must remain
5056 unchanged after any kind of repeat. */
5057
5058 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5059 zerofirstchar = firstchar;
5060 zerofirstcharflags = firstcharflags;
5061 zeroreqchar = reqchar;
5062 zeroreqcharflags = reqcharflags;
5063
5064 /* If there are characters with values > 255, we have to compile an
5065 extended class, with its own opcode, unless there was a negated special
5066 such as \S in the class, and PCRE_UCP is not set, because in that case all
5067 characters > 255 are in the class, so any that were explicitly given as
5068 well can be ignored. If (when there are explicit characters > 255 that must
5069 be listed) there are no characters < 256, we can omit the bitmap in the
5070 actual compiled code. */
5071
5072 #ifdef SUPPORT_UTF
5073 if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
5074 #elif !defined COMPILE_PCRE8
5075 if (xclass && !should_flip_negation)
5076 #endif
5077 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5078 {
5079 *class_uchardata++ = XCL_END; /* Marks the end of extra data */
5080 *code++ = OP_XCLASS;
5081 code += LINK_SIZE;
5082 *code = negate_class? XCL_NOT:0;
5083
5084 /* If the map is required, move up the extra data to make room for it;
5085 otherwise just move the code pointer to the end of the extra data. */
5086
5087 if (class_has_8bitchar > 0)
5088 {
5089 *code++ |= XCL_MAP;
5090 memmove(code + (32 / sizeof(pcre_uchar)), code,
5091 IN_UCHARS(class_uchardata - code));
5092 memcpy(code, classbits, 32);
5093 code = class_uchardata + (32 / sizeof(pcre_uchar));
5094 }
5095 else code = class_uchardata;
5096
5097 /* Now fill in the complete length of the item */
5098
5099 PUT(previous, 1, (int)(code - previous));
5100 break; /* End of class handling */
5101 }
5102 #endif
5103
5104 /* If there are no characters > 255, or they are all to be included or
5105 excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5106 whole class was negated and whether there were negative specials such as \S
5107 (non-UCP) in the class. Then copy the 32-byte map into the code vector,
5108 negating it if necessary. */
5109
5110 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5111 if (lengthptr == NULL) /* Save time in the pre-compile phase */
5112 {
5113 if (negate_class)
5114 for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5115 memcpy(code, classbits, 32);
5116 }
5117 code += 32 / sizeof(pcre_uchar);
5118
5119 END_CLASS:
5120 break;
5121
5122
5123 /* ===================================================================*/
5124 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
5125 has been tested above. */
5126
5127 case CHAR_LEFT_CURLY_BRACKET:
5128 if (!is_quantifier) goto NORMAL_CHAR;
5129 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
5130 if (*errorcodeptr != 0) goto FAILED;
5131 goto REPEAT;
5132
5133 case CHAR_ASTERISK:
5134 repeat_min = 0;
5135 repeat_max = -1;
5136 goto REPEAT;
5137
5138 case CHAR_PLUS:
5139 repeat_min = 1;
5140 repeat_max = -1;
5141 goto REPEAT;
5142
5143 case CHAR_QUESTION_MARK:
5144 repeat_min = 0;
5145 repeat_max = 1;
5146
5147 REPEAT:
5148 if (previous == NULL)
5149 {
5150 *errorcodeptr = ERR9;
5151 goto FAILED;
5152 }
5153
5154 if (repeat_min == 0)
5155 {
5156 firstchar = zerofirstchar; /* Adjust for zero repeat */
5157 firstcharflags = zerofirstcharflags;
5158 reqchar = zeroreqchar; /* Ditto */
5159 reqcharflags = zeroreqcharflags;
5160 }
5161
5162 /* Remember whether this is a variable length repeat */
5163
5164 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
5165
5166 op_type = 0; /* Default single-char op codes */
5167 possessive_quantifier = FALSE; /* Default not possessive quantifier */
5168
5169 /* Save start of previous item, in case we have to move it up in order to
5170 insert something before it. */
5171
5172 tempcode = previous;
5173
5174 /* If the next character is '+', we have a possessive quantifier. This
5175 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
5176 If the next character is '?' this is a minimizing repeat, by default,
5177 but if PCRE_UNGREEDY is set, it works the other way round. We change the
5178 repeat type to the non-default. */
5179
5180 if (ptr[1] == CHAR_PLUS)
5181 {
5182 repeat_type = 0; /* Force greedy */
5183 possessive_quantifier = TRUE;
5184 ptr++;
5185 }
5186 else if (ptr[1] == CHAR_QUESTION_MARK)
5187 {
5188 repeat_type = greedy_non_default;
5189 ptr++;
5190 }
5191 else repeat_type = greedy_default;
5192
5193 /* If previous was a recursion call, wrap it in atomic brackets so that
5194 previous becomes the atomic group. All recursions were so wrapped in the
5195 past, but it no longer happens for non-repeated recursions. In fact, the
5196 repeated ones could be re-implemented independently so as not to need this,
5197 but for the moment we rely on the code for repeating groups. */
5198
5199 if (*previous == OP_RECURSE)
5200 {
5201 memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
5202 *previous = OP_ONCE;
5203 PUT(previous, 1, 2 + 2*LINK_SIZE);
5204 previous[2 + 2*LINK_SIZE] = OP_KET;
5205 PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
5206 code += 2 + 2 * LINK_SIZE;
5207 length_prevgroup = 3 + 3*LINK_SIZE;
5208
5209 /* When actually compiling, we need to check whether this was a forward
5210 reference, and if so, adjust the offset. */
5211
5212 if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
5213 {
5214 int offset = GET(cd->hwm, -LINK_SIZE);
5215 if (offset == previous + 1 - cd->start_code)
5216 PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
5217 }
5218 }
5219
5220 /* Now handle repetition for the different types of item. */
5221
5222 /* If previous was a character or negated character match, abolish the item
5223 and generate a repeat item instead. If a char item has a minimum of more
5224 than one, ensure that it is set in reqchar - it might not be if a sequence
5225 such as x{3} is the first thing in a branch because the x will have gone
5226 into firstchar instead. */
5227
5228 if (*previous == OP_CHAR || *previous == OP_CHARI
5229 || *previous == OP_NOT || *previous == OP_NOTI)
5230 {
5231 switch (*previous)
5232 {
5233 default: /* Make compiler happy. */
5234 case OP_CHAR: op_type = OP_STAR - OP_STAR; break;
5235 case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
5236 case OP_NOT: op_type = OP_NOTSTAR - OP_STAR; break;
5237 case OP_NOTI: op_type = OP_NOTSTARI - OP_STAR; break;
5238 }
5239
5240 /* Deal with UTF characters that take up more than one character. It's
5241 easier to write this out separately than try to macrify it. Use c to
5242 hold the length of the character in bytes, plus UTF_LENGTH to flag that
5243 it's a length rather than a small character. */
5244
5245 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5246 if (utf && NOT_FIRSTCHAR(code[-1]))
5247 {
5248 pcre_uchar *lastchar = code - 1;
5249 BACKCHAR(lastchar);
5250 c = (int)(code - lastchar); /* Length of UTF-8 character */
5251 memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
5252 c |= UTF_LENGTH; /* Flag c as a length */
5253 }
5254 else
5255 #endif /* SUPPORT_UTF */
5256
5257 /* Handle the case of a single character - either with no UTF support, or
5258 with UTF disabled, or for a single character UTF character. */
5259 {
5260 c = code[-1];
5261 if (*previous <= OP_CHARI && repeat_min > 1)
5262 {
5263 reqchar = c;
5264 reqcharflags = req_caseopt | cd->req_varyopt;
5265 }
5266 }
5267
5268 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
5269 }
5270
5271 /* If previous was a character type match (\d or similar), abolish it and
5272 create a suitable repeat item. The code is shared with single-character
5273 repeats by setting op_type to add a suitable offset into repeat_type. Note
5274 that the Unicode property types will be present only when SUPPORT_UCP is
5275 defined, but we don't wrap the little bits of code here because it just
5276 makes it horribly messy. */
5277
5278 else if (*previous < OP_EODN)
5279 {
5280 pcre_uchar *oldcode;
5281 int prop_type, prop_value;
5282 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
5283 c = *previous;
5284
5285 OUTPUT_SINGLE_REPEAT:
5286 if (*previous == OP_PROP || *previous == OP_NOTPROP)
5287 {
5288 prop_type = previous[1];
5289 prop_value = previous[2];
5290 }
5291 else prop_type = prop_value = -1;
5292
5293 oldcode = code;
5294 code = previous; /* Usually overwrite previous item */
5295
5296 /* If the maximum is zero then the minimum must also be zero; Perl allows
5297 this case, so we do too - by simply omitting the item altogether. */
5298
5299 if (repeat_max == 0) goto END_REPEAT;
5300
5301 /* Combine the op_type with the repeat_type */
5302
5303 repeat_type += op_type;
5304
5305 /* A minimum of zero is handled either as the special case * or ?, or as
5306 an UPTO, with the maximum given. */
5307
5308 if (repeat_min == 0)
5309 {
5310 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
5311 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
5312 else
5313 {
5314 *code++ = OP_UPTO + repeat_type;
5315 PUT2INC(code, 0, repeat_max);
5316 }
5317 }
5318
5319 /* A repeat minimum of 1 is optimized into some special cases. If the
5320 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
5321 left in place and, if the maximum is greater than 1, we use OP_UPTO with
5322 one less than the maximum. */
5323
5324 else if (repeat_min == 1)
5325 {
5326 if (repeat_max == -1)
5327 *code++ = OP_PLUS + repeat_type;
5328 else
5329 {
5330 code = oldcode; /* leave previous item in place */
5331 if (repeat_max == 1) goto END_REPEAT;
5332 *code++ = OP_UPTO + repeat_type;
5333 PUT2INC(code, 0, repeat_max - 1);
5334 }
5335 }
5336
5337 /* The case {n,n} is just an EXACT, while the general case {n,m} is
5338 handled as an EXACT followed by an UPTO. */
5339
5340 else
5341 {
5342 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
5343 PUT2INC(code, 0, repeat_min);
5344
5345 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
5346 we have to insert the character for the previous code. For a repeated
5347 Unicode property match, there are two extra bytes that define the
5348 required property. In UTF-8 mode, long characters have their length in
5349 c, with the UTF_LENGTH bit as a flag. */
5350
5351 if (repeat_max < 0)
5352 {
5353 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5354 if (utf && (c & UTF_LENGTH) != 0)
5355 {
5356 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5357 code += c & 7;
5358 }
5359 else
5360 #endif
5361 {
5362 *code++ = c;
5363 if (prop_type >= 0)
5364 {
5365 *code++ = prop_type;
5366 *code++ = prop_value;
5367 }
5368 }
5369 *code++ = OP_STAR + repeat_type;
5370 }
5371
5372 /* Else insert an UPTO if the max is greater than the min, again
5373 preceded by the character, for the previously inserted code. If the
5374 UPTO is just for 1 instance, we can use QUERY instead. */
5375
5376 else if (repeat_max != repeat_min)
5377 {
5378 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5379 if (utf && (c & UTF_LENGTH) != 0)
5380 {
5381 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5382 code += c & 7;
5383 }
5384 else
5385 #endif
5386 *code++ = c;
5387 if (prop_type >= 0)
5388 {
5389 *code++ = prop_type;
5390 *code++ = prop_value;
5391 }
5392 repeat_max -= repeat_min;
5393
5394 if (repeat_max == 1)
5395 {
5396 *code++ = OP_QUERY + repeat_type;
5397 }
5398 else
5399 {
5400 *code++ = OP_UPTO + repeat_type;
5401 PUT2INC(code, 0, repeat_max);
5402 }
5403 }
5404 }
5405
5406 /* The character or character type itself comes last in all cases. */
5407
5408 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5409 if (utf && (c & UTF_LENGTH) != 0)
5410 {
5411 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5412 code += c & 7;
5413 }
5414 else
5415 #endif
5416 *code++ = c;
5417
5418 /* For a repeated Unicode property match, there are two extra bytes that
5419 define the required property. */
5420
5421 #ifdef SUPPORT_UCP
5422 if (prop_type >= 0)
5423 {
5424 *code++ = prop_type;
5425 *code++ = prop_value;
5426 }
5427 #endif
5428 }
5429
5430 /* If previous was a character class or a back reference, we put the repeat
5431 stuff after it, but just skip the item if the repeat was {0,0}. */
5432
5433 else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
5434 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5435 *previous == OP_XCLASS ||
5436 #endif
5437 *previous == OP_REF || *previous == OP_REFI ||
5438 *previous == OP_DNREF || *previous == OP_DNREFI)
5439 {
5440 if (repeat_max == 0)
5441 {
5442 code = previous;
5443 goto END_REPEAT;
5444 }
5445
5446 if (repeat_min == 0 && repeat_max == -1)
5447 *code++ = OP_CRSTAR + repeat_type;
5448 else if (repeat_min == 1 && repeat_max == -1)
5449 *code++ = OP_CRPLUS + repeat_type;
5450 else if (repeat_min == 0 && repeat_max == 1)
5451 *code++ = OP_CRQUERY + repeat_type;
5452 else
5453 {
5454 *code++ = OP_CRRANGE + repeat_type;
5455 PUT2INC(code, 0, repeat_min);
5456 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
5457 PUT2INC(code, 0, repeat_max);
5458 }
5459 }
5460
5461 /* If previous was a bracket group, we may have to replicate it in certain
5462 cases. Note that at this point we can encounter only the "basic" bracket
5463 opcodes such as BRA and CBRA, as this is the place where they get converted
5464 into the more special varieties such as BRAPOS and SBRA. A test for >=
5465 OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5466 ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow
5467 repetition of assertions, but now it does, for Perl compatibility. */
5468
5469 else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5470 {
5471 register int i;
5472 int len = (int)(code - previous);
5473 pcre_uchar *bralink = NULL;
5474 pcre_uchar *brazeroptr = NULL;
5475
5476 /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
5477 we just ignore the repeat. */
5478
5479 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
5480 goto END_REPEAT;
5481
5482 /* There is no sense in actually repeating assertions. The only potential
5483 use of repetition is in cases when the assertion is optional. Therefore,
5484 if the minimum is greater than zero, just ignore the repeat. If the
5485 maximum is not zero or one, set it to 1. */
5486
5487 if (*previous < OP_ONCE) /* Assertion */
5488 {
5489 if (repeat_min > 0) goto END_REPEAT;
5490 if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
5491 }
5492
5493 /* The case of a zero minimum is special because of the need to stick
5494 OP_BRAZERO in front of it, and because the group appears once in the
5495 data, whereas in other cases it appears the minimum number of times. For
5496 this reason, it is simplest to treat this case separately, as otherwise
5497 the code gets far too messy. There are several special subcases when the
5498 minimum is zero. */
5499
5500 if (repeat_min == 0)
5501 {
5502 /* If the maximum is also zero, we used to just omit the group from the
5503 output altogether, like this:
5504
5505 ** if (repeat_max == 0)
5506 ** {
5507 ** code = previous;
5508 ** goto END_REPEAT;
5509 ** }
5510
5511 However, that fails when a group or a subgroup within it is referenced
5512 as a subroutine from elsewhere in the pattern, so now we stick in
5513 OP_SKIPZERO in front of it so that it is skipped on execution. As we
5514 don't have a list of which groups are referenced, we cannot do this
5515 selectively.
5516
5517 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
5518 and do no more at this point. However, we do need to adjust any
5519 OP_RECURSE calls inside the group that refer to the group itself or any
5520 internal or forward referenced group, because the offset is from the
5521 start of the whole regex. Temporarily terminate the pattern while doing
5522 this. */
5523
5524 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
5525 {
5526 *code = OP_END;
5527 adjust_recurse(previous, 1, utf, cd, save_hwm);
5528 memmove(previous + 1, previous, IN_UCHARS(len));
5529 code++;
5530 if (repeat_max == 0)
5531 {
5532 *previous++ = OP_SKIPZERO;
5533 goto END_REPEAT;
5534 }
5535 brazeroptr = previous; /* Save for possessive optimizing */
5536 *previous++ = OP_BRAZERO + repeat_type;
5537 }
5538
5539 /* If the maximum is greater than 1 and limited, we have to replicate
5540 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
5541 The first one has to be handled carefully because it's the original
5542 copy, which has to be moved up. The remainder can be handled by code
5543 that is common with the non-zero minimum case below. We have to
5544 adjust the value of repeat_max, since one less copy is required. Once
5545 again, we may have to adjust any OP_RECURSE calls inside the group. */
5546
5547 else
5548 {
5549 int offset;
5550 *code = OP_END;
5551 adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);
5552 memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
5553 code += 2 + LINK_SIZE;
5554 *previous++ = OP_BRAZERO + repeat_type;
5555 *previous++ = OP_BRA;
5556
5557 /* We chain together the bracket offset fields that have to be
5558 filled in later when the ends of the brackets are reached. */
5559
5560 offset = (bralink == NULL)? 0 : (int)(previous - bralink);
5561 bralink = previous;
5562 PUTINC(previous, 0, offset);
5563 }
5564
5565 repeat_max--;
5566 }
5567
5568 /* If the minimum is greater than zero, replicate the group as many
5569 times as necessary, and adjust the maximum to the number of subsequent
5570 copies that we need. If we set a first char from the group, and didn't
5571 set a required char, copy the latter from the former. If there are any
5572 forward reference subroutine calls in the group, there will be entries on
5573 the workspace list; replicate these with an appropriate increment. */
5574
5575 else
5576 {
5577 if (repeat_min > 1)
5578 {
5579 /* In the pre-compile phase, we don't actually do the replication. We
5580 just adjust the length as if we had. Do some paranoid checks for
5581 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
5582 integer type when available, otherwise double. */
5583
5584 if (lengthptr != NULL)
5585 {
5586 int delta = (repeat_min - 1)*length_prevgroup;
5587 if ((INT64_OR_DOUBLE)(repeat_min - 1)*
5588 (INT64_OR_DOUBLE)length_prevgroup >
5589 (INT64_OR_DOUBLE)INT_MAX ||
5590 OFLOW_MAX - *lengthptr < delta)
5591 {
5592 *errorcodeptr = ERR20;
5593 goto FAILED;
5594 }
5595 *lengthptr += delta;
5596 }
5597
5598 /* This is compiling for real. If there is a set first byte for
5599 the group, and we have not yet set a "required byte", set it. Make
5600 sure there is enough workspace for copying forward references before
5601 doing the copy. */
5602
5603 else
5604 {
5605 if (groupsetfirstchar && reqcharflags < 0)
5606 {
5607 reqchar = firstchar;
5608 reqcharflags = firstcharflags;
5609 }
5610
5611 for (i = 1; i < repeat_min; i++)
5612 {
5613 pcre_uchar *hc;
5614 pcre_uchar *this_hwm = cd->hwm;
5615 memcpy(code, previous, IN_UCHARS(len));
5616
5617 while (cd->hwm > cd->start_workspace + cd->workspace_size -
5618 WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5619 {
5620 int save_offset = save_hwm - cd->start_workspace;
5621 int this_offset = this_hwm - cd->start_workspace;
5622 *errorcodeptr = expand_workspace(cd);
5623 if (*errorcodeptr != 0) goto FAILED;
5624 save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5625 this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5626 }
5627
5628 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5629 {
5630 PUT(cd->hwm, 0, GET(hc, 0) + len);
5631 cd->hwm += LINK_SIZE;
5632 }
5633 save_hwm = this_hwm;
5634 code += len;
5635 }
5636 }
5637 }
5638
5639 if (repeat_max > 0) repeat_max -= repeat_min;
5640 }
5641
5642 /* This code is common to both the zero and non-zero minimum cases. If
5643 the maximum is limited, it replicates the group in a nested fashion,
5644 remembering the bracket starts on a stack. In the case of a zero minimum,
5645 the first one was set up above. In all cases the repeat_max now specifies
5646 the number of additional copies needed. Again, we must remember to
5647 replicate entries on the forward reference list. */
5648
5649 if (repeat_max >= 0)
5650 {
5651 /* In the pre-compile phase, we don't actually do the replication. We
5652 just adjust the length as if we had. For each repetition we must add 1
5653 to the length for BRAZERO and for all but the last repetition we must
5654 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
5655 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
5656 a 64-bit integer type when available, otherwise double. */
5657
5658 if (lengthptr != NULL && repeat_max > 0)
5659 {
5660 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
5661 2 - 2*LINK_SIZE; /* Last one doesn't nest */
5662 if ((INT64_OR_DOUBLE)repeat_max *
5663 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
5664 > (INT64_OR_DOUBLE)INT_MAX ||
5665 OFLOW_MAX - *lengthptr < delta)
5666 {
5667 *errorcodeptr = ERR20;
5668 goto FAILED;
5669 }
5670 *lengthptr += delta;
5671 }
5672
5673 /* This is compiling for real */
5674
5675 else for (i = repeat_max - 1; i >= 0; i--)
5676 {
5677 pcre_uchar *hc;
5678 pcre_uchar *this_hwm = cd->hwm;
5679
5680 *code++ = OP_BRAZERO + repeat_type;
5681
5682 /* All but the final copy start a new nesting, maintaining the
5683 chain of brackets outstanding. */
5684
5685 if (i != 0)
5686 {
5687 int offset;
5688 *code++ = OP_BRA;
5689 offset = (bralink == NULL)? 0 : (int)(code - bralink);
5690 bralink = code;
5691 PUTINC(code, 0, offset);
5692 }
5693
5694 memcpy(code, previous, IN_UCHARS(len));
5695
5696 /* Ensure there is enough workspace for forward references before
5697 copying them. */
5698
5699 while (cd->hwm > cd->start_workspace + cd->workspace_size -
5700 WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5701 {
5702 int save_offset = save_hwm - cd->start_workspace;
5703 int this_offset = this_hwm - cd->start_workspace;
5704 *errorcodeptr = expand_workspace(cd);
5705 if (*errorcodeptr != 0) goto FAILED;
5706 save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5707 this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5708 }
5709
5710 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5711 {
5712 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
5713 cd->hwm += LINK_SIZE;
5714 }
5715 save_hwm = this_hwm;
5716 code += len;
5717 }
5718
5719 /* Now chain through the pending brackets, and fill in their length
5720 fields (which are holding the chain links pro tem). */
5721
5722 while (bralink != NULL)
5723 {
5724 int oldlinkoffset;
5725 int offset = (int)(code - bralink + 1);
5726 pcre_uchar *bra = code - offset;
5727 oldlinkoffset = GET(bra, 1);
5728 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
5729 *code++ = OP_KET;
5730 PUTINC(code, 0, offset);
5731 PUT(bra, 1, offset);
5732 }
5733 }
5734
5735 /* If the maximum is unlimited, set a repeater in the final copy. For
5736 ONCE brackets, that's all we need to do. However, possessively repeated
5737 ONCE brackets can be converted into non-capturing brackets, as the
5738 behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
5739 deal with possessive ONCEs specially.
5740
5741 Otherwise, when we are doing the actual compile phase, check to see
5742 whether this group is one that could match an empty string. If so,
5743 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
5744 that runtime checking can be done. [This check is also applied to ONCE
5745 groups at runtime, but in a different way.]
5746
5747 Then, if the quantifier was possessive and the bracket is not a
5748 conditional, we convert the BRA code to the POS form, and the KET code to
5749 KETRPOS. (It turns out to be convenient at runtime to detect this kind of
5750 subpattern at both the start and at the end.) The use of special opcodes
5751 makes it possible to reduce greatly the stack usage in pcre_exec(). If
5752 the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
5753
5754 Then, if the minimum number of matches is 1 or 0, cancel the possessive
5755 flag so that the default action below, of wrapping everything inside
5756 atomic brackets, does not happen. When the minimum is greater than 1,
5757 there will be earlier copies of the group, and so we still have to wrap
5758 the whole thing. */
5759
5760 else
5761 {
5762 pcre_uchar *ketcode = code - 1 - LINK_SIZE;
5763 pcre_uchar *bracode = ketcode - GET(ketcode, 1);
5764
5765 /* Convert possessive ONCE brackets to non-capturing */
5766
5767 if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
5768 possessive_quantifier) *bracode = OP_BRA;
5769
5770 /* For non-possessive ONCE brackets, all we need to do is to
5771 set the KET. */
5772
5773 if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
5774 *ketcode = OP_KETRMAX + repeat_type;
5775
5776 /* Handle non-ONCE brackets and possessive ONCEs (which have been
5777 converted to non-capturing above). */
5778
5779 else
5780 {
5781 /* In the compile phase, check for empty string matching. */
5782
5783 if (lengthptr == NULL)
5784 {
5785 pcre_uchar *scode = bracode;
5786 do
5787 {
5788 if (could_be_empty_branch(scode, ketcode, utf, cd, NULL))
5789 {
5790 *bracode += OP_SBRA - OP_BRA;
5791 break;
5792 }
5793 scode += GET(scode, 1);
5794 }
5795 while (*scode == OP_ALT);
5796 }
5797
5798 /* Handle possessive quantifiers. */
5799
5800 if (possessive_quantifier)
5801 {
5802 /* For COND brackets, we wrap the whole thing in a possessively
5803 repeated non-capturing bracket, because we have not invented POS
5804 versions of the COND opcodes. Because we are moving code along, we
5805 must ensure that any pending recursive references are updated. */
5806
5807 if (*bracode == OP_COND || *bracode == OP_SCOND)
5808 {
5809 int nlen = (int)(code - bracode);
5810 *code = OP_END;
5811 adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm);
5812 memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
5813 code += 1 + LINK_SIZE;
5814 nlen += 1 + LINK_SIZE;
5815 *bracode = OP_BRAPOS;
5816 *code++ = OP_KETRPOS;
5817 PUTINC(code, 0, nlen);
5818 PUT(bracode, 1, nlen);
5819 }
5820
5821 /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
5822
5823 else
5824 {
5825 *bracode += 1; /* Switch to xxxPOS opcodes */
5826 *ketcode = OP_KETRPOS;
5827 }
5828
5829 /* If the minimum is zero, mark it as possessive, then unset the
5830 possessive flag when the minimum is 0 or 1. */
5831
5832 if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
5833 if (repeat_min < 2) possessive_quantifier = FALSE;
5834 }
5835
5836 /* Non-possessive quantifier */
5837
5838 else *ketcode = OP_KETRMAX + repeat_type;
5839 }
5840 }
5841 }
5842
5843 /* If previous is OP_FAIL, it was generated by an empty class [] in
5844 JavaScript mode. The other ways in which OP_FAIL can be generated, that is
5845 by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
5846 error above. We can just ignore the repeat in JS case. */
5847
5848 else if (*previous == OP_FAIL) goto END_REPEAT;
5849
5850 /* Else there's some kind of shambles */
5851
5852 else
5853 {
5854 *errorcodeptr = ERR11;
5855 goto FAILED;
5856 }
5857
5858 /* If the character following a repeat is '+', or if certain optimization
5859 tests above succeeded, possessive_quantifier is TRUE. For some opcodes,
5860 there are special alternative opcodes for this case. For anything else, we
5861 wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'
5862 notation is just syntactic sugar, taken from Sun's Java package, but the
5863 special opcodes can optimize it.
5864
5865 Some (but not all) possessively repeated subpatterns have already been
5866 completely handled in the code just above. For them, possessive_quantifier
5867 is always FALSE at this stage.
5868
5869 Note that the repeated item starts at tempcode, not at previous, which
5870 might be the first part of a string whose (former) last char we repeated.
5871
5872 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
5873 an 'upto' may follow. We skip over an 'exact' item, and then test the
5874 length of what remains before proceeding. */
5875
5876 if (possessive_quantifier)
5877 {
5878 int len;
5879
5880 if (*tempcode == OP_TYPEEXACT)
5881 tempcode += PRIV(OP_lengths)[*tempcode] +
5882 ((tempcode[1 + IMM2_SIZE] == OP_PROP
5883 || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
5884
5885 else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
5886 {
5887 tempcode += PRIV(OP_lengths)[*tempcode];
5888 #ifdef SUPPORT_UTF
5889 if (utf && HAS_EXTRALEN(tempcode[-1]))
5890 tempcode += GET_EXTRALEN(tempcode[-1]);
5891 #endif
5892 }
5893
5894 len = (int)(code - tempcode);
5895 if (len > 0) switch (*tempcode)
5896 {
5897 case OP_STAR: *tempcode = OP_POSSTAR; break;
5898 case OP_PLUS: *tempcode = OP_POSPLUS; break;
5899 case OP_QUERY: *tempcode = OP_POSQUERY; break;
5900 case OP_UPTO: *tempcode = OP_POSUPTO; break;
5901
5902 case OP_STARI: *tempcode = OP_POSSTARI; break;
5903 case OP_PLUSI: *tempcode = OP_POSPLUSI; break;
5904 case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
5905 case OP_UPTOI: *tempcode = OP_POSUPTOI; break;
5906
5907 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
5908 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
5909 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
5910 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
5911
5912 case OP_NOTSTARI: *tempcode = OP_NOTPOSSTARI; break;
5913 case OP_NOTPLUSI: *tempcode = OP_NOTPOSPLUSI; break;
5914 case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
5915 case OP_NOTUPTOI: *tempcode = OP_NOTPOSUPTOI; break;
5916
5917 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
5918 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
5919 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
5920 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
5921
5922 /* Because we are moving code along, we must ensure that any
5923 pending recursive references are updated. */
5924
5925 default:
5926 *code = OP_END;
5927 adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
5928 memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
5929 code += 1 + LINK_SIZE;
5930 len += 1 + LINK_SIZE;
5931 tempcode[0] = OP_ONCE;
5932 *code++ = OP_KET;
5933 PUTINC(code, 0, len);
5934 PUT(tempcode, 1, len);
5935 break;
5936 }
5937 }
5938
5939 /* In all cases we no longer have a previous item. We also set the
5940 "follows varying string" flag for subsequently encountered reqchars if
5941 it isn't already set and we have just passed a varying length item. */
5942
5943 END_REPEAT:
5944 previous = NULL;
5945 cd->req_varyopt |= reqvary;
5946 break;
5947
5948
5949 /* ===================================================================*/
5950 /* Start of nested parenthesized sub-expression, or comment or lookahead or
5951 lookbehind or option setting or condition or all the other extended
5952 parenthesis forms. */
5953
5954 case CHAR_LEFT_PARENTHESIS:
5955 newoptions = options;
5956 skipbytes = 0;
5957 bravalue = OP_CBRA;
5958 save_hwm = cd->hwm;
5959 reset_bracount = FALSE;
5960
5961 /* First deal with various "verbs" that can be introduced by '*'. */
5962
5963 ptr++;
5964 if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
5965 || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
5966 {
5967 int i, namelen;
5968 int arglen = 0;
5969 const char *vn = verbnames;
5970 const pcre_uchar *name = ptr + 1;
5971 const pcre_uchar *arg = NULL;
5972 previous = NULL;
5973 ptr++;
5974 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
5975 namelen = (int)(ptr - name);
5976
5977 /* It appears that Perl allows any characters whatsoever, other than
5978 a closing parenthesis, to appear in arguments, so we no longer insist on
5979 letters, digits, and underscores. */
5980
5981 if (*ptr == CHAR_COLON)
5982 {
5983 arg = ++ptr;
5984 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
5985 arglen = (int)(ptr - arg);
5986 if ((unsigned int)arglen > MAX_MARK)
5987 {
5988 *errorcodeptr = ERR75;
5989 goto FAILED;
5990 }
5991 }
5992
5993 if (*ptr != CHAR_RIGHT_PARENTHESIS)
5994 {
5995 *errorcodeptr = ERR60;
5996 goto FAILED;
5997 }
5998
5999 /* Scan the table of verb names */
6000
6001 for (i = 0; i < verbcount; i++)
6002 {
6003 if (namelen == verbs[i].len &&
6004 STRNCMP_UC_C8(name, vn, namelen) == 0)
6005 {
6006 int setverb;
6007
6008 /* Check for open captures before ACCEPT and convert it to
6009 ASSERT_ACCEPT if in an assertion. */
6010
6011 if (verbs[i].op == OP_ACCEPT)
6012 {
6013 open_capitem *oc;
6014 if (arglen != 0)
6015 {
6016 *errorcodeptr = ERR59;
6017 goto FAILED;
6018 }
6019 cd->had_accept = TRUE;
6020 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6021 {
6022 *code++ = OP_CLOSE;
6023 PUT2INC(code, 0, oc->number);
6024 }
6025 setverb = *code++ =
6026 (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6027
6028 /* Do not set firstchar after *ACCEPT */
6029 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
6030 }
6031
6032 /* Handle other cases with/without an argument */
6033
6034 else if (arglen == 0)
6035 {
6036 if (verbs[i].op < 0) /* Argument is mandatory */
6037 {
6038 *errorcodeptr = ERR66;
6039 goto FAILED;
6040 }
6041 setverb = *code++ = verbs[i].op;
6042 }
6043
6044 else
6045 {
6046 if (verbs[i].op_arg < 0) /* Argument is forbidden */
6047 {
6048 *errorcodeptr = ERR59;
6049 goto FAILED;
6050 }
6051 setverb = *code++ = verbs[i].op_arg;
6052 *code++ = arglen;
6053 memcpy(code, arg, IN_UCHARS(arglen));
6054 code += arglen;
6055 *code++ = 0;
6056 }
6057
6058 switch (setverb)
6059 {
6060 case OP_THEN:
6061 case OP_THEN_ARG:
6062 cd->external_flags |= PCRE_HASTHEN;
6063 break;
6064
6065 case OP_PRUNE:
6066 case OP_PRUNE_ARG:
6067 case OP_SKIP:
6068 case OP_SKIP_ARG:
6069 cd->had_pruneorskip = TRUE;
6070 break;
6071 }
6072
6073 break; /* Found verb, exit loop */
6074 }
6075
6076 vn += verbs[i].len + 1;
6077 }
6078
6079 if (i < verbcount) continue; /* Successfully handled a verb */
6080 *errorcodeptr = ERR60; /* Verb not recognized */
6081 goto FAILED;
6082 }
6083
6084 /* Deal with the extended parentheses; all are introduced by '?', and the
6085 appearance of any of them means that this is not a capturing group. */
6086
6087 else if (*ptr == CHAR_QUESTION_MARK)
6088 {
6089 int i, set, unset, namelen;
6090 int *optset;
6091 const pcre_uchar *name;
6092 pcre_uchar *slot;
6093
6094 switch (*(++ptr))
6095 {
6096 case CHAR_NUMBER_SIGN: /* Comment; skip to ket */
6097 ptr++;
6098 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6099 if (*ptr == CHAR_NULL)
6100 {
6101 *errorcodeptr = ERR18;
6102 goto FAILED;
6103 }
6104 continue;
6105
6106
6107 /* ------------------------------------------------------------ */
6108 case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
6109 reset_bracount = TRUE;
6110 /* Fall through */
6111
6112 /* ------------------------------------------------------------ */
6113 case CHAR_COLON: /* Non-capturing bracket */
6114 bravalue = OP_BRA;
6115 ptr++;
6116 break;
6117
6118
6119 /* ------------------------------------------------------------ */
6120 case CHAR_LEFT_PARENTHESIS:
6121 bravalue = OP_COND; /* Conditional group */
6122 tempptr = ptr;
6123
6124 /* A condition can be an assertion, a number (referring to a numbered
6125 group), a name (referring to a named group), or 'R', referring to
6126 recursion. R<digits> and R&name are also permitted for recursion tests.
6127
6128 There are several syntaxes for testing a named group: (?(name)) is used
6129 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
6130
6131 There are two unfortunate ambiguities, caused by history. (a) 'R' can
6132 be the recursive thing or the name 'R' (and similarly for 'R' followed
6133 by digits), and (b) a number could be a name that consists of digits.
6134 In both cases, we look for a name first; if not found, we try the other
6135 cases.
6136
6137 For compatibility with auto-callouts, we allow a callout to be
6138 specified before a condition that is an assertion. First, check for the
6139 syntax of a callout; if found, adjust the temporary pointer that is
6140 used to check for an assertion condition. That's all that is needed! */
6141
6142 if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
6143 {
6144 for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
6145 if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
6146 tempptr += i + 1;
6147 }
6148
6149 /* For conditions that are assertions, check the syntax, and then exit
6150 the switch. This will take control down to where bracketed groups,
6151 including assertions, are processed. */
6152
6153 if (tempptr[1] == CHAR_QUESTION_MARK &&
6154 (tempptr[2] == CHAR_EQUALS_SIGN ||
6155 tempptr[2] == CHAR_EXCLAMATION_MARK ||
6156 tempptr[2] == CHAR_LESS_THAN_SIGN))
6157 break;
6158
6159 /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6160 need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6161
6162 code[1+LINK_SIZE] = OP_CREF;
6163 skipbytes = 1+IMM2_SIZE;
6164 refsign = -1;
6165
6166 /* Check for a test for recursion in a named group. */
6167
6168 if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
6169 {
6170 terminator = -1;
6171 ptr += 2;
6172 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
6173 }
6174
6175 /* Check for a test for a named group's having been set, using the Perl
6176 syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6177 syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). As names may
6178 consist entirely of digits, there is scope for ambiguity. */
6179
6180 else if (ptr[1] == CHAR_LESS_THAN_SIGN)
6181 {
6182 terminator = CHAR_GREATER_THAN_SIGN;
6183 ptr++;
6184 }
6185 else if (ptr[1] == CHAR_APOSTROPHE)
6186 {
6187 terminator = CHAR_APOSTROPHE;
6188 ptr++;
6189 }
6190 else
6191 {
6192 terminator = CHAR_NULL;
6193 if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
6194 }
6195
6196 /* When a name is one of a number of duplicates, a different opcode is
6197 used and it needs more memory. Unfortunately we cannot tell whether a
6198 name is a duplicate in the first pass, so we have to allow for more
6199 memory except when we know it is a relative numerical reference. */
6200
6201 if (refsign < 0 && lengthptr != NULL) *lengthptr += IMM2_SIZE;
6202
6203 /* We now expect to read a name (possibly all digits); anything else
6204 is an error. In the case of all digits, also get it as a number. */
6205
6206 if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)
6207 {
6208 ptr += 1; /* To get the right offset */
6209 *errorcodeptr = ERR28;
6210 goto FAILED;
6211 }
6212
6213 recno = 0;
6214 name = ++ptr;
6215 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
6216 {
6217 if (recno >= 0)
6218 recno = (IS_DIGIT(*ptr))? recno * 10 + (int)(*ptr - CHAR_0) : -1;
6219 ptr++;
6220 }
6221 namelen = (int)(ptr - name);
6222
6223 /* Check the terminator */
6224
6225 if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6226 *ptr++ != CHAR_RIGHT_PARENTHESIS)
6227 {
6228 ptr--; /* Error offset */
6229 *errorcodeptr = ERR26;
6230 goto FAILED;
6231 }
6232
6233 /* Do no further checking in the pre-compile phase. */
6234
6235 if (lengthptr != NULL) break;
6236
6237 /* In the real compile we do the work of looking for the actual
6238 reference. If the string started with "+" or "-" we require the rest to
6239 be digits, in which case recno will be set. */
6240
6241 if (refsign > 0)
6242 {
6243 if (recno <= 0)
6244 {
6245 *errorcodeptr = ERR58;
6246 goto FAILED;
6247 }
6248 recno = (refsign == CHAR_MINUS)?
6249 cd->bracount - recno + 1 : recno +cd->bracount;
6250 if (recno <= 0 || recno > cd->final_bracount)
6251 {
6252 *errorcodeptr = ERR15;
6253 goto FAILED;
6254 }
6255 PUT2(code, 2+LINK_SIZE, recno);
6256 break;
6257 }
6258
6259 /* Otherwise (did not start with "+" or "-"), start by looking for the
6260 name. */
6261
6262 slot = cd->name_table;
6263 for (i = 0; i < cd->names_found; i++)
6264 {
6265 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
6266 slot += cd->name_entry_size;
6267 }
6268
6269 /* Found the named subpattern. If the name is duplicated, add one to
6270 the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6271 appropriate data values. Otherwise, just insert the unique subpattern
6272 number. */
6273
6274 if (i < cd->names_found)
6275 {
6276 int offset = i++;
6277 int count = 1;
6278 recno = GET2(slot, 0); /* Number from first found */
6279 for (; i < cd->names_found; i++)
6280 {
6281 slot += cd->name_entry_size;
6282 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0) break;
6283 count++;
6284 }
6285 if (count > 1)
6286 {
6287 PUT2(code, 2+LINK_SIZE, offset);
6288 PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6289 skipbytes += IMM2_SIZE;
6290 code[1+LINK_SIZE]++;
6291 }
6292 else /* Not a duplicated name */
6293 {
6294 PUT2(code, 2+LINK_SIZE, recno);
6295 }
6296 }
6297
6298 /* If terminator == CHAR_NULL it means that the name followed directly
6299 after the opening parenthesis [e.g. (?(abc)...] and in this case there
6300 are some further alternatives to try. For the cases where terminator !=
6301 0 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
6302 now checked all the possibilities, so give an error. */
6303
6304 else if (terminator != CHAR_NULL)
6305 {
6306 *errorcodeptr = ERR15;
6307 goto FAILED;
6308 }
6309
6310 /* Check for (?(R) for recursion. Allow digits after R to specify a
6311 specific group number. */
6312
6313 else if (*name == CHAR_R)
6314 {
6315 recno = 0;
6316 for (i = 1; i < namelen; i++)
6317 {
6318 if (!IS_DIGIT(name[i]))
6319 {
6320 *errorcodeptr = ERR15;
6321 goto FAILED;
6322 }
6323 recno = recno * 10 + name[i] - CHAR_0;
6324 }
6325 if (recno == 0) recno = RREF_ANY;
6326 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
6327 PUT2(code, 2+LINK_SIZE, recno);
6328 }
6329
6330 /* Similarly, check for the (?(DEFINE) "condition", which is always
6331 false. */
6332
6333 else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
6334 {
6335 code[1+LINK_SIZE] = OP_DEF;
6336 skipbytes = 1;
6337 }
6338
6339 /* Check for the "name" actually being a subpattern number. We are
6340 in the second pass here, so final_bracount is set. */
6341
6342 else if (recno > 0 && recno <= cd->final_bracount)
6343 {
6344 PUT2(code, 2+LINK_SIZE, recno);
6345 }
6346
6347 /* Either an unidentified subpattern, or a reference to (?(0) */
6348
6349 else
6350 {
6351 *errorcodeptr = (recno == 0)? ERR35: ERR15;
6352 goto FAILED;
6353 }
6354 break;
6355
6356
6357 /* ------------------------------------------------------------ */
6358 case CHAR_EQUALS_SIGN: /* Positive lookahead */
6359 bravalue = OP_ASSERT;
6360 cd->assert_depth += 1;
6361 ptr++;
6362 break;
6363
6364
6365 /* ------------------------------------------------------------ */
6366 case CHAR_EXCLAMATION_MARK: /* Negative lookahead */
6367 ptr++;
6368 if (*ptr == CHAR_RIGHT_PARENTHESIS) /* Optimize (?!) */
6369 {
6370 *code++ = OP_FAIL;
6371 previous = NULL;
6372 continue;
6373 }
6374 bravalue = OP_ASSERT_NOT;
6375 cd->assert_depth += 1;
6376 break;
6377
6378
6379 /* ------------------------------------------------------------ */
6380 case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */
6381 switch (ptr[1])
6382 {
6383 case CHAR_EQUALS_SIGN: /* Positive lookbehind */
6384 bravalue = OP_ASSERTBACK;
6385 cd->assert_depth += 1;
6386 ptr += 2;
6387 break;
6388
6389 case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */
6390 bravalue = OP_ASSERTBACK_NOT;
6391 cd->assert_depth += 1;
6392 ptr += 2;
6393 break;
6394
6395 default: /* Could be name define, else bad */
6396 if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
6397 goto DEFINE_NAME;
6398 ptr++; /* Correct offset for error */
6399 *errorcodeptr = ERR24;
6400 goto FAILED;
6401 }
6402 break;
6403
6404
6405 /* ------------------------------------------------------------ */
6406 case CHAR_GREATER_THAN_SIGN: /* One-time brackets */
6407 bravalue = OP_ONCE;
6408 ptr++;
6409 break;
6410
6411
6412 /* ------------------------------------------------------------ */
6413 case CHAR_C: /* Callout - may be followed by digits; */
6414 previous_callout = code; /* Save for later completion */
6415 after_manual_callout = 1; /* Skip one item before completing */
6416 *code++ = OP_CALLOUT;
6417 {
6418 int n = 0;
6419 ptr++;
6420 while(IS_DIGIT(*ptr))
6421 n = n * 10 + *ptr++ - CHAR_0;
6422 if (*ptr != CHAR_RIGHT_PARENTHESIS)
6423 {
6424 *errorcodeptr = ERR39;
6425 goto FAILED;
6426 }
6427 if (n > 255)
6428 {
6429 *errorcodeptr = ERR38;
6430 goto FAILED;
6431 }
6432 *code++ = n;
6433 PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
6434 PUT(code, LINK_SIZE, 0); /* Default length */
6435 code += 2 * LINK_SIZE;
6436 }
6437 previous = NULL;
6438 continue;
6439
6440
6441 /* ------------------------------------------------------------ */
6442 case CHAR_P: /* Python-style named subpattern handling */
6443 if (*(++ptr) == CHAR_EQUALS_SIGN ||
6444 *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */
6445 {
6446 is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
6447 terminator = CHAR_RIGHT_PARENTHESIS;
6448 goto NAMED_REF_OR_RECURSE;
6449 }
6450 else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */
6451 {
6452 *errorcodeptr = ERR41;
6453 goto FAILED;
6454 }
6455 /* Fall through to handle (?P< as (?< is handled */
6456
6457
6458 /* ------------------------------------------------------------ */
6459 DEFINE_NAME: /* Come here from (?< handling */
6460 case CHAR_APOSTROPHE:
6461 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
6462 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6463 name = ++ptr;
6464
6465 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6466 namelen = (int)(ptr - name);
6467
6468 /* In the pre-compile phase, do a syntax check, remember the longest
6469 name, and then remember the group in a vector, expanding it if
6470 necessary. Duplicates for the same number are skipped; other duplicates
6471 are checked for validity. In the actual compile, there is nothing to
6472 do. */
6473
6474 if (lengthptr != NULL)
6475 {
6476 named_group *ng;
6477 pcre_uint32 number = cd->bracount + 1;
6478
6479 if (*ptr != (pcre_uchar)terminator)
6480 {
6481 *errorcodeptr = ERR42;
6482 goto FAILED;
6483 }
6484
6485 if (cd->names_found >= MAX_NAME_COUNT)
6486 {
6487 *errorcodeptr = ERR49;
6488 goto FAILED;
6489 }
6490
6491 if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
6492 {
6493 cd->name_entry_size = namelen + IMM2_SIZE + 1;
6494 if (namelen > MAX_NAME_SIZE)
6495 {
6496 *errorcodeptr = ERR48;
6497 goto FAILED;
6498 }
6499 }
6500
6501 /* Scan the list to check for duplicates. For duplicate names, if the
6502 number is the same, break the loop, which causes the name to be
6503 discarded; otherwise, if DUPNAMES is not set, give an error.
6504 If it is set, allow the name with a different number, but continue
6505 scanning in case this is a duplicate with the same number. For
6506 non-duplicate names, give an error if the number is duplicated. */
6507
6508 ng = cd->named_groups;
6509 for (i = 0; i < cd->names_found; i++, ng++)
6510 {
6511 if (namelen == ng->length &&
6512 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
6513 {
6514 if (ng->number == number) break;
6515 if ((options & PCRE_DUPNAMES) == 0)
6516 {
6517 *errorcodeptr = ERR43;
6518 goto FAILED;
6519 }
6520 cd->dupnames = TRUE; /* Duplicate names exist */
6521 }
6522 else if (ng->number == number)
6523 {
6524 *errorcodeptr = ERR65;
6525 goto FAILED;
6526 }
6527 }
6528
6529 if (i >= cd->names_found) /* Not a duplicate with same number */
6530 {
6531 /* Increase the list size if necessary */
6532
6533 if (cd->names_found >= cd->named_group_list_size)
6534 {
6535 int newsize = cd->named_group_list_size * 2;
6536 named_group *newspace = (PUBL(malloc))
6537 (newsize * sizeof(named_group));
6538
6539 if (newspace == NULL)
6540 {
6541 *errorcodeptr = ERR21;
6542 goto FAILED;
6543 }
6544
6545 memcpy(newspace, cd->named_groups,
6546 cd->named_group_list_size * sizeof(named_group));
6547 if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
6548 (PUBL(free))((void *)cd->named_groups);
6549 cd->named_groups = newspace;
6550 cd->named_group_list_size = newsize;
6551 }
6552
6553 cd->named_groups[cd->names_found].name = name;
6554 cd->named_groups[cd->names_found].length = namelen;
6555 cd->named_groups[cd->names_found].number = number;
6556 cd->names_found++;
6557 }
6558 }
6559
6560 ptr++; /* Move past > or ' in both passes. */
6561 goto NUMBERED_GROUP;
6562
6563
6564 /* ------------------------------------------------------------ */
6565 case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */
6566 terminator = CHAR_RIGHT_PARENTHESIS;
6567 is_recurse = TRUE;
6568 /* Fall through */
6569
6570 /* We come here from the Python syntax above that handles both
6571 references (?P=name) and recursion (?P>name), as well as falling
6572 through from the Perl recursion syntax (?&name). We also come here from
6573 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
6574 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
6575
6576 NAMED_REF_OR_RECURSE:
6577 name = ++ptr;
6578 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6579 namelen = (int)(ptr - name);
6580
6581 /* In the pre-compile phase, do a syntax check. We used to just set
6582 a dummy reference number, because it was not used in the first pass.
6583 However, with the change of recursive back references to be atomic,
6584 we have to look for the number so that this state can be identified, as
6585 otherwise the incorrect length is computed. If it's not a backwards
6586 reference, the dummy number will do. */
6587
6588 if (lengthptr != NULL)
6589 {
6590 named_group *ng;
6591
6592 if (namelen == 0)
6593 {
6594 *errorcodeptr = ERR62;
6595 goto FAILED;
6596 }
6597 if (*ptr != (pcre_uchar)terminator)
6598 {
6599 *errorcodeptr = ERR42;
6600 goto FAILED;
6601 }
6602 if (namelen > MAX_NAME_SIZE)
6603 {
6604 *errorcodeptr = ERR48;
6605 goto FAILED;
6606 }
6607
6608 /* The name table does not exist in the first pass; instead we must
6609 scan the list of names encountered so far in order to get the
6610 number. If the name is not found, set the value to 0 for a forward
6611 reference. */
6612
6613 ng = cd->named_groups;
6614 for (i = 0; i < cd->names_found; i++, ng++)
6615 {
6616 if (namelen == ng->length &&
6617 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
6618 break;
6619 }
6620 recno = (i < cd->names_found)? ng->number : 0;
6621
6622 /* Count named back references. */
6623
6624 if (!is_recurse) cd->namedrefcount++;
6625 }
6626
6627 /* In the real compile, search the name table. We check the name
6628 first, and then check that we have reached the end of the name in the
6629 table. That way, if the name is longer than any in the table, the
6630 comparison will fail without reading beyond the table entry. */
6631
6632 else
6633 {
6634 slot = cd->name_table;
6635 for (i = 0; i < cd->names_found; i++)
6636 {
6637 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
6638 slot[IMM2_SIZE+namelen] == 0)
6639 break;
6640 slot += cd->name_entry_size;
6641 }
6642
6643 if (i < cd->names_found)
6644 {
6645 recno = GET2(slot, 0);
6646 }
6647 else
6648 {
6649 *errorcodeptr = ERR15;
6650 goto FAILED;
6651 }
6652 }
6653
6654