/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1375 - (show annotations)
Sat Oct 12 17:56:40 2013 UTC (6 years, 1 month ago) by zherczeg
File MIME type: text/plain
File size: 295522 byte(s)
Error occurred while calculating annotation data.
+1 is not needed for XCLASS as well.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2013 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
57 is also used by pcretest. PCRE_DEBUG is not defined when building a production
58 library. We do not need to select pcre16_printint.c specially, because the
59 COMPILE_PCREx macro will already be appropriately set. */
60
61 #ifdef PCRE_DEBUG
62 /* pcre_printint.c should not include any headers */
63 #define PCRE_INCLUDED
64 #include "pcre_printint.c"
65 #undef PCRE_INCLUDED
66 #endif
67
68
69 /* Macro for setting individual bits in class bitmaps: sets bit ((b)&7) of byte (a)[(b)/8]. The arguments and the whole expansion are parenthesized so that any expression may be passed; b is evaluated twice, so it must be free of side effects. */
70
71 #define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)&7)))
72
73 /* Maximum length value to check against when making sure that the integer that
74 holds the compiled pattern length does not overflow. We make it a bit less than
75 INT_MAX to allow for adding in group terminating bytes, so that we don't have
76 to check them every time. */
77
78 #define OFLOW_MAX (INT_MAX - 20)
79
80 /* Definitions to allow mutual recursion */
81
82 static int
83 add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
84 const pcre_uint32 *, unsigned int);
85
86 static BOOL
87 compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
88 pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
89 compile_data *, int *);
90
91
92
93 /*************************************************
94 * Code parameters and static tables *
95 *************************************************/
96
97 /* This value specifies the size of stack workspace that is used during the
98 first pre-compile phase that determines how much memory is required. The regex
99 is partly compiled into this space, but the compiled parts are discarded as
100 soon as they can be, so that hopefully there will never be an overrun. The code
101 does, however, check for an overrun. The largest amount I've seen used is 218,
102 so this number is very generous.
103
104 The same workspace is used during the second, actual compile phase for
105 remembering forward references to groups so that they can be filled in at the
106 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
107 is 4 there is plenty of room for most patterns. However, the memory can get
108 filled up by repetitions of forward references, for example patterns like
109 /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
110 that the workspace is expanded using malloc() in this situation. The value
111 below is therefore a minimum, and we put a maximum on it for safety. The
112 minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
113 kicks in at the same number of forward references in all cases. */
114
115 #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
116 #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
117
118 /* This value determines the size of the initial vector that is used for
119 remembering named groups during the pre-compile. It is allocated on the stack,
120 but if it is too small, it is expanded using malloc(), in a similar way to the
121 workspace. The value is the number of slots in the list. */
122
123 #define NAMED_GROUP_LIST_SIZE 20
124
125 /* The overrun tests check for a slightly smaller size so that they detect the
126 overrun before it actually does run off the end of the data block. */
127
128 #define WORK_SIZE_SAFETY_MARGIN (100)
129
130 /* Private flags added to firstchar and reqchar. */
131
132 #define REQ_CASELESS (1 << 0) /* Indicates caselessness */
133 #define REQ_VARY (1 << 1) /* Reqchar followed non-literal item */
134 /* Negative values for the firstchar and reqchar flags */
135 #define REQ_UNSET (-2)
136 #define REQ_NONE (-1)
137
138 /* Repeated character flags. */
139
140 #define UTF_LENGTH 0x10000000L /* The char contains its length. (Long constant; upper-case suffix so it is not misread as the digit 1.) */
141
142 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
143 are simple data values; negative values are for special things like \d and so
144 on. Zero means further processing is needed (for things like \x), or the escape
145 is invalid. */
146
147 #ifndef EBCDIC
148
149 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
150 in UTF-8 mode. */
151
152 static const short int escapes[] = { /* One entry per character from '0' to 'z'; each row comment names the two characters covered */
153 0, 0, /* 0 1 */
154 0, 0, /* 2 3 */
155 0, 0, /* 4 5 */
156 0, 0, /* 6 7 */
157 0, 0, /* 8 9 */
158 CHAR_COLON, CHAR_SEMICOLON, /* : ; */
159 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, /* < = */
160 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK, /* > ? */
161 CHAR_COMMERCIAL_AT, -ESC_A, /* @ A */
162 -ESC_B, -ESC_C, /* B C */
163 -ESC_D, -ESC_E, /* D E */
164 0, -ESC_G, /* F G */
165 -ESC_H, 0, /* H I */
166 0, -ESC_K, /* J K */
167 0, 0, /* L M */
168 -ESC_N, 0, /* N O */
169 -ESC_P, -ESC_Q, /* P Q */
170 -ESC_R, -ESC_S, /* R S */
171 0, 0, /* T U */
172 -ESC_V, -ESC_W, /* V W */
173 -ESC_X, 0, /* X Y */
174 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, /* Z [ */
175 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, /* backslash ] */
176 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, /* ^ _ */
177 CHAR_GRAVE_ACCENT, 7, /* ` a (7 is the ASCII BEL code) */
178 -ESC_b, 0, /* b c */
179 -ESC_d, ESC_e, /* d e */
180 ESC_f, 0, /* f g */
181 -ESC_h, 0, /* h i */
182 0, -ESC_k, /* j k */
183 0, 0, /* l m */
184 ESC_n, 0, /* n o */
185 -ESC_p, 0, /* p q */
186 ESC_r, -ESC_s, /* r s */
187 ESC_tee, 0, /* t u */
188 -ESC_v, -ESC_w, /* v w */
189 0, 0, /* x y */
190 -ESC_z /* z */
191 };
192
193 #else
194
195 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
196
197 static const short int escapes[] = {
198 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
199 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
200 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
201 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
202 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
203 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
204 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
205 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
206 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
207 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
208 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
209 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
210 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
211 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
212 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
213 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
214 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
215 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
216 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
217 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
218 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
219 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
220 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
221 };
222 #endif
223
224
225 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
226 searched linearly. Put all the names into a single string, in order to reduce
227 the number of relocations when a shared library is dynamically linked. The
228 string is built from string macros so that it works in UTF-8 mode on EBCDIC
229 platforms. */
230
231 typedef struct verbitem {
232 int len; /* Length of verb name */
233 int op; /* Op when no arg, or -1 if arg mandatory */
234 int op_arg; /* Op when arg present, or -1 if not allowed */
235 } verbitem;
236
237 static const char verbnames[] =
238 "\0" /* Empty name is a shorthand for MARK */
239 STRING_MARK0
240 STRING_ACCEPT0
241 STRING_COMMIT0
242 STRING_F0
243 STRING_FAIL0
244 STRING_PRUNE0
245 STRING_SKIP0
246 STRING_THEN;
247
248 static const verbitem verbs[] = { /* Fields are { len, op, op_arg }; entries follow the order of the names in verbnames */
249 { 0, -1, OP_MARK }, /* empty name (shorthand for MARK): argument mandatory */
250 { 4, -1, OP_MARK }, /* MARK: argument mandatory */
251 { 6, OP_ACCEPT, -1 }, /* ACCEPT: no argument allowed */
252 { 6, OP_COMMIT, -1 }, /* COMMIT: no argument allowed */
253 { 1, OP_FAIL, -1 }, /* F: no argument allowed */
254 { 4, OP_FAIL, -1 }, /* FAIL: no argument allowed */
255 { 5, OP_PRUNE, OP_PRUNE_ARG }, /* PRUNE: argument optional */
256 { 4, OP_SKIP, OP_SKIP_ARG }, /* SKIP: argument optional */
257 { 4, OP_THEN, OP_THEN_ARG } /* THEN: argument optional */
258 };
259
260 static const int verbcount = sizeof(verbs)/sizeof(verbitem); /* Number of entries in verbs[] */
261
262
263 /* Tables of names of POSIX character classes and their lengths. The names are
264 now all in a single string, to reduce the number of relocations when a shared
265 library is dynamically loaded. The list of lengths is terminated by a zero
266 length entry. The first three must be alpha, lower, upper, as this is assumed
267 for handling case independence. */
268
269 static const char posix_names[] =
270 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
271 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
272 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
273 STRING_word0 STRING_xdigit;
274
275 static const pcre_uint8 posix_name_lengths[] = { /* One length per name in posix_names, in the same order */
276 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; /* alpha through space are 5 long, word is 4, xdigit is 6; the final 0 terminates the list */
277
278 /* Table of class bit maps for each POSIX class. Each class is formed from a
279 base map, with an optional addition or removal of another map. Then, for some
280 classes, there is some additional tweaking: for [:blank:] the vertical space
281 characters are removed, and for [:alpha:] and [:alnum:] the underscore
282 character is removed. The triples in the table consist of the base map offset,
283 second map offset or -1 if no second map, and a non-negative value for map
284 addition or a negative value for map subtraction (if there are two maps). The
285 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
286 remove vertical space characters, 2 => remove underscore. */
287
288 static const int posix_class_maps[] = {
289 cbit_word, cbit_digit, -2, /* alpha */
290 cbit_lower, -1, 0, /* lower */
291 cbit_upper, -1, 0, /* upper */
292 cbit_word, -1, 2, /* alnum - word without underscore */
293 cbit_print, cbit_cntrl, 0, /* ascii */
294 cbit_space, -1, 1, /* blank - a GNU extension */
295 cbit_cntrl, -1, 0, /* cntrl */
296 cbit_digit, -1, 0, /* digit */
297 cbit_graph, -1, 0, /* graph */
298 cbit_print, -1, 0, /* print */
299 cbit_punct, -1, 0, /* punct */
300 cbit_space, -1, 0, /* space */
301 cbit_word, -1, 0, /* word - a Perl extension */
302 cbit_xdigit,-1, 0 /* xdigit */
303 };
304
305 /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
306 substitutes must be in the order of the names, defined above, and there are
307 both positive and negative cases. NULL means no substitute. */
308
309 #ifdef SUPPORT_UCP
310 static const pcre_uchar string_PNd[] = {
311 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
312 CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
313 static const pcre_uchar string_pNd[] = {
314 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
315 CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
316 static const pcre_uchar string_PXsp[] = {
317 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
318 CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
319 static const pcre_uchar string_pXsp[] = {
320 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
321 CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
322 static const pcre_uchar string_PXwd[] = {
323 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
324 CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
325 static const pcre_uchar string_pXwd[] = {
326 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
327 CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
328
329 static const pcre_uchar *substitutes[] = {
330 string_PNd, /* \D */
331 string_pNd, /* \d */
332 string_PXsp, /* \S */ /* NOTE: Xsp is Perl space */
333 string_pXsp, /* \s */
334 string_PXwd, /* \W */
335 string_pXwd /* \w */
336 };
337
338 static const pcre_uchar string_pL[] = {
339 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
340 CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
341 static const pcre_uchar string_pLl[] = {
342 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
343 CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
344 static const pcre_uchar string_pLu[] = {
345 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
346 CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
347 static const pcre_uchar string_pXan[] = {
348 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
349 CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
350 static const pcre_uchar string_h[] = {
351 CHAR_BACKSLASH, CHAR_h, '\0' };
352 static const pcre_uchar string_pXps[] = {
353 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
354 CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
355 static const pcre_uchar string_PL[] = {
356 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
357 CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
358 static const pcre_uchar string_PLl[] = {
359 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
360 CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
361 static const pcre_uchar string_PLu[] = {
362 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
363 CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
364 static const pcre_uchar string_PXan[] = {
365 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
366 CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
367 static const pcre_uchar string_H[] = {
368 CHAR_BACKSLASH, CHAR_H, '\0' };
369 static const pcre_uchar string_PXps[] = {
370 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
371 CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
372
373 static const pcre_uchar *posix_substitutes[] = {
374 string_pL, /* alpha */
375 string_pLl, /* lower */
376 string_pLu, /* upper */
377 string_pXan, /* alnum */
378 NULL, /* ascii */
379 string_h, /* blank */
380 NULL, /* cntrl */
381 string_pNd, /* digit */
382 NULL, /* graph */
383 NULL, /* print */
384 NULL, /* punct */
385 string_pXps, /* space */ /* NOTE: Xps is POSIX space */
386 string_pXwd, /* word */
387 NULL, /* xdigit */
388 /* Negated cases */
389 string_PL, /* ^alpha */
390 string_PLl, /* ^lower */
391 string_PLu, /* ^upper */
392 string_PXan, /* ^alnum */
393 NULL, /* ^ascii */
394 string_H, /* ^blank */
395 NULL, /* ^cntrl */
396 string_PNd, /* ^digit */
397 NULL, /* ^graph */
398 NULL, /* ^print */
399 NULL, /* ^punct */
400 string_PXps, /* ^space */ /* NOTE: Xps is POSIX space */
401 string_PXwd, /* ^word */
402 NULL /* ^xdigit */
403 };
404 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *)) /* Number of entries in posix_substitutes[] (positive and negated cases together) */
405 #endif
406
407 #define STRING(a) # a /* Turn the argument, exactly as written, into a string literal */
408 #define XSTRING(s) STRING(s) /* Macro-expand s first, then stringify it; used to embed values such as MAX_NAME_SIZE in error_texts */
409
410 /* The texts of compile-time error messages. These are "char *" because they
411 are passed to the outside world. Do not ever re-use any error number, because
412 they are documented. Always add a new error instead. Messages marked DEAD below
413 are no longer used. This used to be a table of strings, but in order to reduce
414 the number of relocations needed when a shared library is loaded dynamically,
415 it is now one long string. We cannot use a table of offsets, because the
416 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
417 simply count through to the one we want - this isn't a performance issue
418 because these strings are used only when there is a compilation error.
419
420 Each substring ends with \0 to insert a null character. This includes the final
421 substring, so that the whole string ends with \0\0, which can be detected when
422 counting through. */
423
424 static const char error_texts[] =
425 "no error\0"
426 "\\ at end of pattern\0"
427 "\\c at end of pattern\0"
428 "unrecognized character follows \\\0"
429 "numbers out of order in {} quantifier\0"
430 /* 5 */
431 "number too big in {} quantifier\0"
432 "missing terminating ] for character class\0"
433 "invalid escape sequence in character class\0"
434 "range out of order in character class\0"
435 "nothing to repeat\0"
436 /* 10 */
437 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
438 "internal error: unexpected repeat\0"
439 "unrecognized character after (? or (?-\0"
440 "POSIX named classes are supported only within a class\0"
441 "missing )\0"
442 /* 15 */
443 "reference to non-existent subpattern\0"
444 "erroffset passed as NULL\0"
445 "unknown option bit(s) set\0"
446 "missing ) after comment\0"
447 "parentheses nested too deeply\0" /** DEAD **/
448 /* 20 */
449 "regular expression is too large\0"
450 "failed to get memory\0"
451 "unmatched parentheses\0"
452 "internal error: code overflow\0"
453 "unrecognized character after (?<\0"
454 /* 25 */
455 "lookbehind assertion is not fixed length\0"
456 "malformed number or name after (?(\0"
457 "conditional group contains more than two branches\0"
458 "assertion expected after (?(\0"
459 "(?R or (?[+-]digits must be followed by )\0"
460 /* 30 */
461 "unknown POSIX class name\0"
462 "POSIX collating elements are not supported\0"
463 "this version of PCRE is compiled without UTF support\0"
464 "spare error\0" /** DEAD **/
465 "character value in \\x{} or \\o{} is too large\0"
466 /* 35 */
467 "invalid condition (?(0)\0"
468 "\\C not allowed in lookbehind assertion\0"
469 "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
470 "number after (?C is > 255\0"
471 "closing ) for (?C expected\0"
472 /* 40 */
473 "recursive call could loop indefinitely\0"
474 "unrecognized character after (?P\0"
475 "syntax error in subpattern name (missing terminator)\0"
476 "two named subpatterns have the same name\0"
477 "invalid UTF-8 string\0"
478 /* 45 */
479 "support for \\P, \\p, and \\X has not been compiled\0"
480 "malformed \\P or \\p sequence\0"
481 "unknown property name after \\P or \\p\0"
482 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
483 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
484 /* 50 */
485 "repeated subpattern is too long\0" /** DEAD **/
486 "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
487 "internal error: overran compiling workspace\0"
488 "internal error: previously-checked referenced subpattern not found\0"
489 "DEFINE group contains more than one branch\0"
490 /* 55 */
491 "repeating a DEFINE group is not allowed\0" /** DEAD **/
492 "inconsistent NEWLINE options\0"
493 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
494 "a numbered reference must not be zero\0"
495 "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
496 /* 60 */
497 "(*VERB) not recognized or malformed\0"
498 "number is too big\0"
499 "subpattern name expected\0"
500 "digit expected after (?+\0"
501 "] is an invalid data character in JavaScript compatibility mode\0"
502 /* 65 */
503 "different names for subpatterns of the same number are not allowed\0"
504 "(*MARK) must have an argument\0"
505 "this version of PCRE is not compiled with Unicode property support\0"
506 "\\c must be followed by an ASCII character\0"
507 "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
508 /* 70 */
509 "internal error: unknown opcode in find_fixedlength()\0"
510 "\\N is not supported in a class\0"
511 "too many forward references\0"
512 "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
513 "invalid UTF-16 string\0"
514 /* 75 */
515 "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
516 "character value in \\u.... sequence is too large\0"
517 "invalid UTF-32 string\0"
518 "setting UTF is disabled by the application\0"
519 "non-hex character in \\x{} (closing brace missing?)\0"
520 /* 80 */
521 "non-octal character in \\o{} (closing brace missing?)\0"
522 "missing opening brace after \\o\0"
523 ;
524
525 /* Table to identify digits and hex digits. This is used when compiling
526 patterns. Note that the tables in chartables are dependent on the locale, and
527 may mark arbitrary characters as digits - but the PCRE compiling code expects
528 to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
529 a private table here. It costs 256 bytes, but it is a lot faster than doing
530 character value tests (at least in some simple cases I timed), and in some
531 applications one wants PCRE to compile efficiently as well as match
532 efficiently.
533
534 For convenience, we use the same bit definitions as in chartables:
535
536 0x04 decimal digit
537 0x08 hexadecimal digit
538
539 Then we can use ctype_digit and ctype_xdigit in the code. */
540
541 /* Using a simple comparison for decimal numbers rather than a memory read
542 is much faster, and the resulting code is simpler (the compiler turns it
543 into a subtraction and unsigned comparison). */
544
545 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9) /* NB: x may be evaluated twice, so it must not have side effects */
546
547 #ifndef EBCDIC
548
549 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
550 UTF-8 mode. */
551
552 static const pcre_uint8 digitab[] = /* One entry per character code 0-255: 0x04 = decimal digit, 0x08 = hexadecimal digit, 0x0c = both */
553 {
554 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* codes 0- 7 */
555 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* codes 8- 15 */
556 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* codes 16- 23 */
557 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* codes 24- 31 */
558 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* space - ' */
559 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
560 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* '0' - '7' */
561 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* '8' - ? */
562 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
563 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
564 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
565 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
566 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
567 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
568 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
569 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
570 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
571 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
572 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
573 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
574 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
575 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
576 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
577 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
578 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
579 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
580 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
581 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
582 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
583 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
584 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
585 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
586
587 #else
588
589 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
590
591 static const pcre_uint8 digitab[] =
592 {
593 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
594 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
595 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
596 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
597 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
598 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
599 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
600 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
603 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
604 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
605 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
606 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
607 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
608 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
609 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
610 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
611 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
612 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
613 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
614 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
615 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
616 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
617 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
618 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
619 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
620 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
621 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
622 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
623 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
624 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
625
626 static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
627 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
628 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
629 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
630 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
631 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
632 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
633 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
634 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
635 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
636 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
637 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
638 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
639 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
640 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
641 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
642 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
643 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
644 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
645 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
646 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
647 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
648 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
649 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
650 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
651 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
652 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
653 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
654 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
655 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
656 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
657 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
658 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
659 #endif
660
661
662 /* This table is used to check whether auto-possessification is possible
663 between adjacent character-type opcodes. The left-hand (repeated) opcode is
664 used to select the row, and the right-hand opcode is used to select the column.
665 A value of 1 means that auto-possessification is OK. For example, the second
666 value in the first row means that \D+\d can be turned into \D++\d.
667
668 The Unicode property types (\P and \p) have to be present to fill out the table
669 because of what their opcode values are, but the table values should always be
670 zero because property types are handled separately in the code. The last four
671 columns apply to items that cannot be repeated, so there is no need to have
672 rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
673 *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
674
675 #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1) /* Row count of autoposstab: one row per left-hand (repeated) opcode */
676 #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1) /* Column count of autoposstab: one column per right-hand opcode */
677
678 static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
679 /* \D \d \S \s \W \w . .+ \C \P \p \R \H \h \V \v \X \Z \z $ $M */
680 { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \D */
681 { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \d */
682 { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \S */
683 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \s */
684 { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \W */
685 { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \w */
686 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* . */
687 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* .+ */
688 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \C */
689 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \P */
690 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \p */
691 { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \R */
692 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \H */
693 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \h */
694 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \V */
695 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 }, /* \v */
696 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 } /* \X */
697 };
698
699
700 /* This table is used to check whether auto-possessification is possible
701 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
702 left-hand (repeated) opcode is used to select the row, and the right-hand
703 opcode is used to select the column. The values are as follows:
704
705 0 Always return FALSE (never auto-possessify)
706 1 Character groups are distinct (possessify if both are OP_PROP)
707 2 Check character categories in the same group (general or particular)
708 3 TRUE if the two opcodes are not the same (PROP vs NOTPROP)
709
710 4 Check left general category vs right particular category
711 5 Check right general category vs left particular category
712
713 6 Left alphanum vs right general category
714 7 Left space vs right general category
715 8 Left word vs right general category
716
717 9 Right alphanum vs left general category
718 10 Right space vs left general category
719 11 Right word vs left general category
720
721 12 Left alphanum vs right particular category
722 13 Left space vs right particular category
723 14 Left word vs right particular category
724
725 15 Right alphanum vs left particular category
726 16 Right space vs left particular category
727 17 Right word vs left particular category
728 */
729
/* Square table of action codes, PT_TABSIZE rows by PT_TABSIZE columns. As
described in the comment that precedes this table, the row is chosen by the
property type of the left-hand (repeated) item and the column by the property
type of the right-hand item. The column order is given by the heading line and
is the same as the row order shown in the per-row comments. */
730 static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
731 /* ANY LAMP GC PC SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
732 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_ANY */
733 { 0, 3, 0, 0, 0, 3, 1, 1, 0, 0, 0 }, /* PT_LAMP */
734 { 0, 0, 2, 4, 0, 9, 10, 10, 11, 0, 0 }, /* PT_GC */
735 { 0, 0, 5, 2, 0, 15, 16, 16, 17, 0, 0 }, /* PT_PC */
736 { 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, /* PT_SC */
737 { 0, 3, 6, 12, 0, 3, 1, 1, 0, 0, 0 }, /* PT_ALNUM */
/* The PT_SPACE and PT_PXSPACE rows (and columns) hold identical values. */
738 { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_SPACE */
739 { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_PXSPACE */
740 { 0, 0, 8, 14, 0, 0, 1, 1, 3, 0, 0 }, /* PT_WORD */
741 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */
742 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 } /* PT_UCNC */
743 };
744
745 /* This table is used to check whether auto-possessification is possible
746 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
747 specifies a general category and the other specifies a particular category. The
748 row is selected by the general category and the column by the particular
749 category. The value is 1 if the particular category is not part of the general
750 category. */
751
/* 7 rows (one per general category, in the order C, L, M, N, P, S, Z shown in
the row comments) by 30 columns (one per particular category, in the order
shown in the heading line). In each row the only zero entries are the columns
of the particular categories that belong to that general category. */
752 static const pcre_uint8 catposstab[7][30] = {
753 /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
754 { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* C */
755 { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* L */
756 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* M */
757 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* N */
758 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 }, /* P */
759 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 }, /* S */
760 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 } /* Z */
761 };
762
763 /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
764 a general or particular category. The properties in each row are those
765 that apply to the character set in question. Duplication means that a little
766 unnecessary work is done when checking, but this keeps things much simpler
767 because they can all use the same code. For more details see the comment where
768 this table is used.
769
770 Note: SPACE and PXSPACE used to be different because Perl excluded VT from
771 "space", but from Perl 5.18 it's included, so both categories are treated the
772 same here. */
773
/* Three rows of four ucp_xx values each. Judging by the row comments, the
first three entries of a row are general categories and the fourth is a
particular category; redundant entries only pad a row to the common length.
NOTE(review): how a row is selected is decided where the table is used, which
is outside this part of the file - confirm there. */
774 static const pcre_uint8 posspropstab[3][4] = {
775 { ucp_L, ucp_N, ucp_N, ucp_Nl }, /* ALNUM, 3rd and 4th values redundant */
776 { ucp_Z, ucp_Z, ucp_C, ucp_Cc }, /* SPACE and PXSPACE, 2nd value redundant */
777 { ucp_L, ucp_N, ucp_P, ucp_Po } /* WORD */
778 };
779
780
781
782 /*************************************************
783 * Find an error text *
784 *************************************************/
785
786 /* The error texts are now all in one long string, to save on relocations. As
787 some of the text is of unknown length, we can't use a table of offsets.
788 Instead, just count through the strings. This is not a performance issue
789 because it happens only when there has been a compilation error.
790
791 Argument: the error number
792 Returns: pointer to the error string
793 */
794
795 static const char *
796 find_error_text(int n)
797 {
798 const char *s = error_texts;
799 for (; n > 0; n--)
800 {
801 while (*s++ != CHAR_NULL) {};
802 if (*s == CHAR_NULL) return "Error text not found (please report)";
803 }
804 return s;
805 }
806
807
808
809 /*************************************************
810 * Expand the workspace *
811 *************************************************/
812
813 /* This function is called during the second compiling phase, if the number of
814 forward references fills the existing workspace, which is originally a block on
815 the stack. A larger block is obtained from malloc() unless the ultimate limit
816 has been reached or the increase will be rather small.
817
818 Argument: pointer to the compile data block
819 Returns: 0 if all went well, else an error number
820 */
821
822 static int
823 expand_workspace(compile_data *cd)
824 {
825 pcre_uchar *newspace;
826 int newsize = cd->workspace_size * 2;
827
828 if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
829 if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
830 newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
831 return ERR72;
832
833 newspace = (PUBL(malloc))(IN_UCHARS(newsize));
834 if (newspace == NULL) return ERR21;
835 memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
836 cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
837 if (cd->workspace_size > COMPILE_WORK_SIZE)
838 (PUBL(free))((void *)cd->start_workspace);
839 cd->start_workspace = newspace;
840 cd->workspace_size = newsize;
841 return 0;
842 }
843
844
845
846 /*************************************************
847 * Check for counted repeat *
848 *************************************************/
849
850 /* This function is called when a '{' is encountered in a place where it might
851 start a quantifier. It looks ahead to see if it really is a quantifier or not.
852 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
853 where the ddds are digits.
854
855 Arguments:
856 p pointer to the first char after '{'
857
858 Returns: TRUE or FALSE
859 */
860
861 static BOOL
862 is_counted_repeat(const pcre_uchar *p)
863 {
864 if (!IS_DIGIT(*p)) return FALSE;
865 p++;
866 while (IS_DIGIT(*p)) p++;
867 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
868
869 if (*p++ != CHAR_COMMA) return FALSE;
870 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
871
872 if (!IS_DIGIT(*p)) return FALSE;
873 p++;
874 while (IS_DIGIT(*p)) p++;
875
876 return (*p == CHAR_RIGHT_CURLY_BRACKET);
877 }
878
879
880
881 /*************************************************
882 * Handle escapes *
883 *************************************************/
884
885 /* This function is called when a \ has been encountered. It either returns a
886 positive value for a special escape such as \d, or 0 for a data character
887 (including simple character escapes such as \n), which is placed in chptr. A
888 backreference to group n is returned as negative n. When UTF-8 is enabled, a
889 data character greater than 255 may be returned in chptr. On entry, ptr is
890 pointing at the \. On exit, it is on the final character of the escape sequence.
891
892 Arguments:
893 ptrptr points to the pattern position pointer
894 chptr points to a returned data character
895 errorcodeptr points to the errorcode variable
896 bracount number of previous extracting brackets
897 options the options bits
898 isclass TRUE if inside a character class
899
900 Returns: zero => a data character
901 positive => a special escape sequence
902 negative => a back reference
903 on error, errorcodeptr is set
904 */
905
906 static int
907 check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
908 int bracount, int options, BOOL isclass)
909 {
/* Outline: fetch the character that follows the backslash, try a direct lookup
in the escapes[] table, and only if that gives nothing enter the switch that
deals with the multi-character escapes. Errors are reported by storing a code in
*errorcodeptr; control still reaches the common exit at the bottom, which writes
ptr and c back through ptrptr and chptr and returns escape. */
910 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
911 BOOL utf = (options & PCRE_UTF8) != 0;
912 const pcre_uchar *ptr = *ptrptr + 1;
913 pcre_uint32 c;
914 int escape = 0;
915 int i;
916
917 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
918 ptr--; /* Set pointer back to the last byte */
919
920 /* If backslash is at the end of the pattern, it's an error. */
921
922 if (c == CHAR_NULL) *errorcodeptr = ERR1;
923
924 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
925 in a table. A non-zero result is something that can be returned immediately.
926 Otherwise further processing may be required. */
927
928 #ifndef EBCDIC /* ASCII/UTF-8 coding */
929 /* Not alphanumeric */
930 else if (c < CHAR_0 || c > CHAR_z) {}
931 else if ((i = escapes[c - CHAR_0]) != 0)
932 { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
/* A positive table entry is a data character that replaces c; a negative entry
is the negated escape code. Note that the final else branch further down is
entered only when this lookup stored 0 in i; the digit-counting loops for
octal and two-digit hex escapes rely on i being 0 at that point. */
933
934 #else /* EBCDIC coding */
935 /* Not alphanumeric */
936 else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
937 else if ((i = escapes[c - 0x48]) != 0) { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
938 #endif
939
940 /* Escapes that need further processing, or are illegal. */
941
942 else
943 {
944 const pcre_uchar *oldptr;
945 BOOL braced, negated, overflow;
946 int s;
947
948 switch (c)
949 {
950 /* A number of Perl escapes are not handled by PCRE. We give an explicit
951 error. */
952
953 case CHAR_l:
954 case CHAR_L:
955 *errorcodeptr = ERR37;
956 break;
957
958 case CHAR_u:
959 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
960 {
961 /* In JavaScript, \u must be followed by four hexadecimal numbers.
962 Otherwise it is a lowercase u letter. */
963 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
964 && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
965 && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
966 && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
967 {
968 c = 0;
969 for (i = 0; i < 4; ++i)
970 {
971 register pcre_uint32 cc = *(++ptr);
972 #ifndef EBCDIC /* ASCII/UTF-8 coding */
973 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
974 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
975 #else /* EBCDIC coding */
976 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
977 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
978 #endif
979 }
980
/* Reject a value that is too large for the code unit width being compiled
(or above 0x10ffff in UTF mode), and a surrogate code point in UTF mode. */
981 #if defined COMPILE_PCRE8
982 if (c > (utf ? 0x10ffffU : 0xffU))
983 #elif defined COMPILE_PCRE16
984 if (c > (utf ? 0x10ffffU : 0xffffU))
985 #elif defined COMPILE_PCRE32
986 if (utf && c > 0x10ffffU)
987 #endif
988 {
989 *errorcodeptr = ERR76;
990 }
991 else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
992 }
993 }
994 else
995 *errorcodeptr = ERR37;
996 break;
997
998 case CHAR_U:
999 /* In JavaScript, \U is an uppercase U letter. */
1000 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
1001 break;
1002
1003 /* In a character class, \g is just a literal "g". Outside a character
1004 class, \g must be followed by one of a number of specific things:
1005
1006 (1) A number, either plain or braced. If positive, it is an absolute
1007 backreference. If negative, it is a relative backreference. This is a Perl
1008 5.10 feature.
1009
1010 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1011 is part of Perl's movement towards a unified syntax for back references. As
1012 this is synonymous with \k{name}, we fudge it up by pretending it really
1013 was \k.
1014
1015 (3) For Oniguruma compatibility we also support \g followed by a name or a
1016 number either in angle brackets or in single quotes. However, these are
1017 (possibly recursive) subroutine calls, _not_ backreferences. Just return
1018 the ESC_g code (cf \k). */
1019
1020 case CHAR_g:
1021 if (isclass) break;
1022 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
1023 {
1024 escape = ESC_g;
1025 break;
1026 }
1027
1028 /* Handle the Perl-compatible cases */
1029
1030 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1031 {
1032 const pcre_uchar *p;
/* Scan the braced text: if anything other than a minus sign or a digit turns
up before the closing brace, it is treated as a name and handled as \k. */
1033 for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
1034 if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
1035 if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
1036 {
1037 escape = ESC_k;
1038 break;
1039 }
1040 braced = TRUE;
1041 ptr++;
1042 }
1043 else braced = FALSE;
1044
1045 if (ptr[1] == CHAR_MINUS)
1046 {
1047 negated = TRUE;
1048 ptr++;
1049 }
1050 else negated = FALSE;
1051
1052 /* The integer range is limited by the machine's int representation. */
1053 s = 0;
1054 overflow = FALSE;
1055 while (IS_DIGIT(ptr[1]))
1056 {
/* Conservative guard: once s exceeds INT_MAX/10 - 1 no further digit is
accepted, so the multiplication and addition below cannot overflow. */
1057 if (s > INT_MAX / 10 - 1) /* Integer overflow */
1058 {
1059 overflow = TRUE;
1060 break;
1061 }
1062 s = s * 10 + (int)(*(++ptr) - CHAR_0);
1063 }
1064 if (overflow) /* Integer overflow */
1065 {
1066 while (IS_DIGIT(ptr[1]))
1067 ptr++;
1068 *errorcodeptr = ERR61;
1069 break;
1070 }
1071
1072 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
1073 {
1074 *errorcodeptr = ERR57;
1075 break;
1076 }
1077
1078 if (s == 0)
1079 {
1080 *errorcodeptr = ERR58;
1081 break;
1082 }
1083
1084 if (negated)
1085 {
1086 if (s > bracount)
1087 {
1088 *errorcodeptr = ERR15;
1089 break;
1090 }
/* Turn the relative number into an absolute one: a relative value of 1 maps
to group number bracount, 2 to bracount - 1, and so on. */
1091 s = bracount - (s - 1);
1092 }
1093
1094 escape = -s;
1095 break;
1096
1097 /* The handling of escape sequences consisting of a string of digits
1098 starting with one that is not zero is not straightforward. Perl has changed
1099 over the years. Nowadays \g{} for backreferences and \o{} for octal are
1100 recommended to avoid the ambiguities in the old syntax.
1101
1102 Outside a character class, the digits are read as a decimal number. If the
1103 number is less than 8 (used to be 10), or if there are that many previous
1104 extracting left brackets, then it is a back reference. Otherwise, up to
1105 three octal digits are read to form an escaped byte. Thus \123 is likely to
1106 be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1107 the octal value is greater than 377, the least significant 8 bits are
1108 taken. \8 and \9 are treated as the literal characters 8 and 9.
1109
1110 Inside a character class, \ followed by a digit is always either a literal
1111 8 or 9 or an octal number. */
1112
1113 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1114 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1115
1116 if (!isclass)
1117 {
1118 oldptr = ptr;
1119 /* The integer range is limited by the machine's int representation. */
1120 s = (int)(c -CHAR_0);
1121 overflow = FALSE;
1122 while (IS_DIGIT(ptr[1]))
1123 {
1124 if (s > INT_MAX / 10 - 1) /* Integer overflow */
1125 {
1126 overflow = TRUE;
1127 break;
1128 }
1129 s = s * 10 + (int)(*(++ptr) - CHAR_0);
1130 }
1131 if (overflow) /* Integer overflow */
1132 {
1133 while (IS_DIGIT(ptr[1]))
1134 ptr++;
1135 *errorcodeptr = ERR61;
1136 break;
1137 }
1138 if (s < 8 || s <= bracount) /* Check for back reference */
1139 {
1140 escape = -s;
1141 break;
1142 }
1143 ptr = oldptr; /* Put the pointer back and fall through */
1144 }
1145
1146 /* Handle a digit following \ when the number is not a back reference. If
1147 the first digit is 8 or 9, Perl used to generate a binary zero byte and
1148 then treat the digit as a following literal. At least by Perl 5.18 this
1149 changed so as not to insert the binary zero. */
1150
1151 if ((c = *ptr) >= CHAR_8) break;
1152
1153 /* Fall through with a digit less than 8 */
1154
1155 /* \0 always starts an octal number, but we may drop through to here with a
1156 larger first octal digit. The original code used just to take the least
1157 significant 8 bits of octal numbers (I think this is what early Perls used
1158 to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1159 but no more than 3 octal digits. */
1160
1161 case CHAR_0:
1162 c -= CHAR_0;
/* i is still 0 here (it was set by the table lookup before the switch and is
not changed on any path that reaches this case), so the loop reads at most
two further octal digits. */
1163 while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1164 c = c * 8 + *(++ptr) - CHAR_0;
1165 #ifdef COMPILE_PCRE8
1166 if (!utf && c > 0xff) *errorcodeptr = ERR51;
1167 #endif
1168 break;
1169
1170 /* \o is a relatively new Perl feature, supporting a more general way of
1171 specifying character codes in octal. The only supported form is \o{ddd}. */
1172
1173 case CHAR_o:
1174 if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
1175 {
1176 ptr += 2;
1177 c = 0;
1178 overflow = FALSE;
1179 while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
1180 {
1181 register pcre_uint32 cc = *ptr++;
1182 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
/* In the 32-bit library, stop before the shift by 3 could push bits out of
the 32-bit value. */
1183 #ifdef COMPILE_PCRE32
1184 if (c >= 0x20000000l) { overflow = TRUE; break; }
1185 #endif
1186 c = (c << 3) + cc - CHAR_0 ;
1187 #if defined COMPILE_PCRE8
1188 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1189 #elif defined COMPILE_PCRE16
1190 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1191 #elif defined COMPILE_PCRE32
1192 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1193 #endif
1194 }
1195 if (overflow)
1196 {
1197 while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1198 *errorcodeptr = ERR34;
1199 }
1200 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1201 {
1202 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1203 }
1204 else *errorcodeptr = ERR80;
1205 }
1206 break;
1207
1208 /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
1209 numbers. Otherwise it is a lowercase x letter. */
1210
1211 case CHAR_x:
1212 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1213 {
1214 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1215 && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1216 {
1217 c = 0;
1218 for (i = 0; i < 2; ++i)
1219 {
1220 register pcre_uint32 cc = *(++ptr);
1221 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1222 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1223 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1224 #else /* EBCDIC coding */
1225 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1226 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1227 #endif
1228 }
1229 }
1230 } /* End JavaScript handling */
1231
1232 /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1233 greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1234 digits. If not, { used to be treated as a data character. However, Perl
1235 seems to read hex digits up to the first non-such, and ignore the rest, so
1236 that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1237 now gives an error. */
1238
1239 else
1240 {
1241 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1242 {
1243 ptr += 2;
1244 c = 0;
1245 overflow = FALSE;
1246 while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
1247 {
1248 register pcre_uint32 cc = *ptr++;
1249 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1250
/* In the 32-bit library, stop before the shift by 4 could push bits out of
the 32-bit value. */
1251 #ifdef COMPILE_PCRE32
1252 if (c >= 0x10000000l) { overflow = TRUE; break; }
1253 #endif
1254
1255 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1256 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1257 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1258 #else /* EBCDIC coding */
1259 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1260 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1261 #endif
1262
1263 #if defined COMPILE_PCRE8
1264 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1265 #elif defined COMPILE_PCRE16
1266 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1267 #elif defined COMPILE_PCRE32
1268 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1269 #endif
1270 }
1271
1272 if (overflow)
1273 {
1274 while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
1275 *errorcodeptr = ERR34;
1276 }
1277
1278 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1279 {
1280 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1281 }
1282
1283 /* If the sequence of hex digits does not end with '}', give an error.
1284 We used just to recognize this construct and fall through to the normal
1285 \x handling, but nowadays Perl gives an error, which seems much more
1286 sensible, so we do too. */
1287
1288 else *errorcodeptr = ERR79;
1289 } /* End of \x{} processing */
1290
1291 /* Read a single-byte hex-defined char (up to two hex digits after \x) */
1292
1293 else
1294 {
1295 c = 0;
/* As in the octal case, i is 0 on arrival here (left so by the table lookup
before the switch), which limits the loop to two hex digits. */
1296 while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1297 {
1298 pcre_uint32 cc; /* Some compilers don't like */
1299 cc = *(++ptr); /* ++ in initializers */
1300 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1301 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1302 c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1303 #else /* EBCDIC coding */
1304 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
1305 c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1306 #endif
1307 }
1308 } /* End of \xdd handling */
1309 } /* End of Perl-style \x handling */
1310 break;
1311
1312 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1313 An error is given if the byte following \c is not an ASCII character. This
1314 coding is ASCII-specific, but then the whole concept of \cx is
1315 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1316
1317 case CHAR_c:
1318 c = *(++ptr);
1319 if (c == CHAR_NULL)
1320 {
1321 *errorcodeptr = ERR2;
1322 break;
1323 }
1324 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1325 if (c > 127) /* Excludes all non-ASCII in either mode */
1326 {
1327 *errorcodeptr = ERR68;
1328 break;
1329 }
1330 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1331 c ^= 0x40;
1332 #else /* EBCDIC coding */
1333 if (c >= CHAR_a && c <= CHAR_z) c += 64;
1334 c ^= 0xC0;
1335 #endif
1336 break;
1337
1338 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1339 other alphanumeric following \ is an error if PCRE_EXTRA was set;
1340 otherwise, for Perl compatibility, it is a literal. This code looks a bit
1341 odd, but there used to be some cases other than the default, and there may
1342 be again in future, so I haven't "optimized" it. */
1343
1344 default:
1345 if ((options & PCRE_EXTRA) != 0) switch(c)
1346 {
1347 default:
1348 *errorcodeptr = ERR3;
1349 break;
1350 }
1351 break;
1352 }
1353 }
1354
1355 /* Perl supports \N{name} for character names, as well as plain \N for "not
1356 newline". PCRE does not support \N{name}. However, it does support
1357 quantification such as \N{2,3}. */
1358
1359 if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1360 !is_counted_repeat(ptr+2))
1361 *errorcodeptr = ERR37;
1362
1363 /* If PCRE_UCP is set, we change the values for \d etc. */
1364
/* Escape codes in the range ESC_D to ESC_w are shifted by the fixed distance
between ESC_D and ESC_DU. */
1365 if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
1366 escape += (ESC_DU - ESC_D);
1367
1368 /* Set the pointer to the final character before returning. */
1369
1370 *ptrptr = ptr;
1371 *chptr = c;
1372 return escape;
1373 }
1374
1375
1376
1377 #ifdef SUPPORT_UCP
1378 /*************************************************
1379 * Handle \P and \p *
1380 *************************************************/
1381
1382 /* This function is called after \P or \p has been encountered, provided that
1383 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1384 pointing at the P or p. On exit, it is pointing at the final character of the
1385 escape sequence.
1386
1387 Argument:
1388 ptrptr points to the pattern position pointer
1389 negptr points to a boolean that is set TRUE for negation else FALSE
1390 ptypeptr points to an unsigned int that is set to the type value
1391 pdataptr points to an unsigned int that is set to the detailed property value
1392 errorcodeptr points to the error code variable
1393
1394 Returns: TRUE if the type value was found, or FALSE for an invalid type
1395 */
1396
1397 static BOOL
1398 get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
1399 unsigned int *pdataptr, int *errorcodeptr)
1400 {
1401 pcre_uchar c;
1402 int i, bot, top;
1403 const pcre_uchar *ptr = *ptrptr;
1404 pcre_uchar name[32];
1405
1406 c = *(++ptr);
1407 if (c == CHAR_NULL) goto ERROR_RETURN;
1408
1409 *negptr = FALSE;
1410
1411 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1412 negation. */
1413
1414 if (c == CHAR_LEFT_CURLY_BRACKET)
1415 {
1416 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1417 {
1418 *negptr = TRUE;
1419 ptr++;
1420 }
1421 for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1422 {
1423 c = *(++ptr);
1424 if (c == CHAR_NULL) goto ERROR_RETURN;
1425 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1426 name[i] = c;
1427 }
1428 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1429 name[i] = 0;
1430 }
1431
1432 /* Otherwise there is just one following character */
1433
1434 else
1435 {
1436 name[0] = c;
1437 name[1] = 0;
1438 }
1439
1440 *ptrptr = ptr;
1441
1442 /* Search for a recognized property name using binary chop */
1443
1444 bot = 0;
1445 top = PRIV(utt_size);
1446
1447 while (bot < top)
1448 {
1449 int r;
1450 i = (bot + top) >> 1;
1451 r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1452 if (r == 0)
1453 {
1454 *ptypeptr = PRIV(utt)[i].type;
1455 *pdataptr = PRIV(utt)[i].value;
1456 return TRUE;
1457 }
1458 if (r > 0) bot = i + 1; else top = i;
1459 }
1460
1461 *errorcodeptr = ERR47;
1462 *ptrptr = ptr;
1463 return FALSE;
1464
1465 ERROR_RETURN:
1466 *errorcodeptr = ERR46;
1467 *ptrptr = ptr;
1468 return FALSE;
1469 }
1470 #endif
1471
1472
1473
1474 /*************************************************
1475 * Read repeat counts *
1476 *************************************************/
1477
1478 /* Read an item of the form {n,m} and return the values. This is called only
1479 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1480 so the syntax is guaranteed to be correct, but we need to check the values.
1481
1482 Arguments:
1483 p pointer to first char after '{'
1484 minp pointer to int for min
1485 maxp pointer to int for max
1486 returned as -1 if no max
1487 errorcodeptr points to error code variable
1488
1489 Returns: pointer to '}' on success;
1490 current ptr on error, with errorcodeptr set non-zero
1491 */
1492
1493 static const pcre_uchar *
1494 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1495 {
1496 int min = 0;
1497 int max = -1;
1498
1499 /* Read the minimum value and do a paranoid check: a negative value indicates
1500 an integer overflow. */
1501
1502 while (IS_DIGIT(*p)) min = min * 10 + (int)(*p++ - CHAR_0);
1503 if (min < 0 || min > 65535)
1504 {
1505 *errorcodeptr = ERR5;
1506 return p;
1507 }
1508
1509 /* Read the maximum value if there is one, and again do a paranoid on its size.
1510 Also, max must not be less than min. */
1511
1512 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1513 {
1514 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1515 {
1516 max = 0;
1517 while(IS_DIGIT(*p)) max = max * 10 + (int)(*p++ - CHAR_0);
1518 if (max < 0 || max > 65535)
1519 {
1520 *errorcodeptr = ERR5;
1521 return p;
1522 }
1523 if (max < min)
1524 {
1525 *errorcodeptr = ERR4;
1526 return p;
1527 }
1528 }
1529 }
1530
1531 /* Fill in the required variables, and pass back the pointer to the terminating
1532 '}'. */
1533
1534 *minp = min;
1535 *maxp = max;
1536 return p;
1537 }
1538
1539
1540
1541 /*************************************************
1542 * Find first significant op code *
1543 *************************************************/
1544
1545 /* This is called by several functions that scan a compiled expression looking
1546 for a fixed first character, or an anchoring op code etc. It skips over things
1547 that do not influence this. For some calls, it makes sense to skip negative
1548 forward and all backward assertions, and also the \b assertion; for others it
1549 does not.
1550
1551 Arguments:
1552 code pointer to the start of the group
1553 skipassert TRUE if certain assertions are to be skipped
1554
1555 Returns: pointer to the first significant opcode
1556 */
1557
1558 static const pcre_uchar*
1559 first_significant_code(const pcre_uchar *code, BOOL skipassert)
1560 {
1561 for (;;)
1562 {
1563 switch ((int)*code)
1564 {
1565 case OP_ASSERT_NOT:
1566 case OP_ASSERTBACK:
1567 case OP_ASSERTBACK_NOT:
1568 if (!skipassert) return code;
1569 do code += GET(code, 1); while (*code == OP_ALT);
1570 code += PRIV(OP_lengths)[*code];
1571 break;
1572
1573 case OP_WORD_BOUNDARY:
1574 case OP_NOT_WORD_BOUNDARY:
1575 if (!skipassert) return code;
1576 /* Fall through */
1577
1578 case OP_CALLOUT:
1579 case OP_CREF:
1580 case OP_DNCREF:
1581 case OP_RREF:
1582 case OP_DNRREF:
1583 case OP_DEF:
1584 code += PRIV(OP_lengths)[*code];
1585 break;
1586
1587 default:
1588 return code;
1589 }
1590 }
1591 /* Control never reaches here */
1592 }
1593
1594
1595
1596 /*************************************************
1597 * Find the fixed length of a branch *
1598 *************************************************/
1599
1600 /* Scan a branch and compute the fixed length of subject that will match it,
1601 if the length is fixed. This is needed for dealing with backward assertions.
1602 In UTF8 mode, the result is in characters rather than bytes. The branch is
1603 temporarily terminated with OP_END when this function is called.
1604
1605 This function is called when a backward assertion is encountered, so that if it
1606 fails, the error message can point to the correct place in the pattern.
1607 However, we cannot do this when the assertion contains subroutine calls,
1608 because they can be forward references. We solve this by remembering this case
1609 and doing the check at the end; a flag specifies which mode we are running in.
1610
1611 Arguments:
1612 code points to the start of the pattern (the bracket)
1613 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
1614 atend TRUE if called when the pattern is complete
1615 cd the "compile data" structure
1616
1617 Returns: the fixed length,
1618 or -1 if there is no fixed length,
1619 or -2 if \C was encountered (in UTF-8 mode only)
1620 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1621 or -4 if an unknown opcode was encountered (internal error)
1622 */
1623
1624 static int
1625 find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
1626 {
1627 int length = -1;
1628
1629 register int branchlength = 0;
1630 register pcre_uchar *cc = code + 1 + LINK_SIZE;
1631
1632 /* Scan along the opcodes for this branch. If we get to the end of the
1633 branch, check the length against that of the other branches. */
1634
1635 for (;;)
1636 {
1637 int d;
1638 pcre_uchar *ce, *cs;
1639 register pcre_uchar op = *cc;
1640
1641 switch (op)
1642 {
1643 /* We only need to continue for OP_CBRA (normal capturing bracket) and
1644 OP_BRA (normal non-capturing bracket) because the other variants of these
1645 opcodes are all concerned with unlimited repeated groups, which of course
1646 are not of fixed length. */
1647
1648 case OP_CBRA:
1649 case OP_BRA:
1650 case OP_ONCE:
1651 case OP_ONCE_NC:
1652 case OP_COND:
1653 d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
1654 if (d < 0) return d;
1655 branchlength += d;
1656 do cc += GET(cc, 1); while (*cc == OP_ALT);
1657 cc += 1 + LINK_SIZE;
1658 break;
1659
1660 /* Reached end of a branch; if it's a ket it is the end of a nested call.
1661 If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1662 an ALT. If it is END it's the end of the outer call. All can be handled by
1663 the same code. Note that we must not include the OP_KETRxxx opcodes here,
1664 because they all imply an unlimited repeat. */
1665
1666 case OP_ALT:
1667 case OP_KET:
1668 case OP_END:
1669 case OP_ACCEPT:
1670 case OP_ASSERT_ACCEPT:
1671 if (length < 0) length = branchlength;
1672 else if (length != branchlength) return -1;
1673 if (*cc != OP_ALT) return length;
1674 cc += 1 + LINK_SIZE;
1675 branchlength = 0;
1676 break;
1677
1678 /* A true recursion implies not fixed length, but a subroutine call may
1679 be OK. If the subroutine is a forward reference, we can't deal with
1680 it until the end of the pattern, so return -3. */
1681
1682 case OP_RECURSE:
1683 if (!atend) return -3;
1684 cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1685 do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1686 if (cc > cs && cc < ce) return -1; /* Recursion */
1687 d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
1688 if (d < 0) return d;
1689 branchlength += d;
1690 cc += 1 + LINK_SIZE;
1691 break;
1692
1693 /* Skip over assertive subpatterns */
1694
1695 case OP_ASSERT:
1696 case OP_ASSERT_NOT:
1697 case OP_ASSERTBACK:
1698 case OP_ASSERTBACK_NOT:
1699 do cc += GET(cc, 1); while (*cc == OP_ALT);
1700 cc += PRIV(OP_lengths)[*cc];
1701 break;
1702
1703 /* Skip over things that don't match chars */
1704
1705 case OP_MARK:
1706 case OP_PRUNE_ARG:
1707 case OP_SKIP_ARG:
1708 case OP_THEN_ARG:
1709 cc += cc[1] + PRIV(OP_lengths)[*cc];
1710 break;
1711
1712 case OP_CALLOUT:
1713 case OP_CIRC:
1714 case OP_CIRCM:
1715 case OP_CLOSE:
1716 case OP_COMMIT:
1717 case OP_CREF:
1718 case OP_DEF:
1719 case OP_DNCREF:
1720 case OP_DNRREF:
1721 case OP_DOLL:
1722 case OP_DOLLM:
1723 case OP_EOD:
1724 case OP_EODN:
1725 case OP_FAIL:
1726 case OP_NOT_WORD_BOUNDARY:
1727 case OP_PRUNE:
1728 case OP_REVERSE:
1729 case OP_RREF:
1730 case OP_SET_SOM:
1731 case OP_SKIP:
1732 case OP_SOD:
1733 case OP_SOM:
1734 case OP_THEN:
1735 case OP_WORD_BOUNDARY:
1736 cc += PRIV(OP_lengths)[*cc];
1737 break;
1738
1739 /* Handle literal characters */
1740
1741 case OP_CHAR:
1742 case OP_CHARI:
1743 case OP_NOT:
1744 case OP_NOTI:
1745 branchlength++;
1746 cc += 2;
1747 #ifdef SUPPORT_UTF
1748 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1749 #endif
1750 break;
1751
1752 /* Handle exact repetitions. The count is already in characters, but we
1753 need to skip over a multibyte character in UTF8 mode. */
1754
1755 case OP_EXACT:
1756 case OP_EXACTI:
1757 case OP_NOTEXACT:
1758 case OP_NOTEXACTI:
1759 branchlength += (int)GET2(cc,1);
1760 cc += 2 + IMM2_SIZE;
1761 #ifdef SUPPORT_UTF
1762 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1763 #endif
1764 break;
1765
1766 case OP_TYPEEXACT:
1767 branchlength += GET2(cc,1);
1768 if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1769 cc += 2;
1770 cc += 1 + IMM2_SIZE + 1;
1771 break;
1772
1773 /* Handle single-char matchers */
1774
1775 case OP_PROP:
1776 case OP_NOTPROP:
1777 cc += 2;
1778 /* Fall through */
1779
1780 case OP_HSPACE:
1781 case OP_VSPACE:
1782 case OP_NOT_HSPACE:
1783 case OP_NOT_VSPACE:
1784 case OP_NOT_DIGIT:
1785 case OP_DIGIT:
1786 case OP_NOT_WHITESPACE:
1787 case OP_WHITESPACE:
1788 case OP_NOT_WORDCHAR:
1789 case OP_WORDCHAR:
1790 case OP_ANY:
1791 case OP_ALLANY:
1792 branchlength++;
1793 cc++;
1794 break;
1795
1796 /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1797 otherwise \C is coded as OP_ALLANY. */
1798
1799 case OP_ANYBYTE:
1800 return -2;
1801
1802 /* Check a class for variable quantification */
1803
1804 case OP_CLASS:
1805 case OP_NCLASS:
1806 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1807 case OP_XCLASS:
1808 /* The original code caused an unsigned overflow in 64 bit systems,
1809 so now we use a conditional statement. */
1810 if (op == OP_XCLASS)
1811 cc += GET(cc, 1);
1812 else
1813 cc += PRIV(OP_lengths)[OP_CLASS];
1814 #else
1815 cc += PRIV(OP_lengths)[OP_CLASS];
1816 #endif
1817
1818 switch (*cc)
1819 {
1820 case OP_CRPLUS:
1821 case OP_CRMINPLUS:
1822 case OP_CRSTAR:
1823 case OP_CRMINSTAR:
1824 case OP_CRQUERY:
1825 case OP_CRMINQUERY:
1826 return -1;
1827
1828 case OP_CRRANGE:
1829 case OP_CRMINRANGE:
1830 if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1831 branchlength += (int)GET2(cc,1);
1832 cc += 1 + 2 * IMM2_SIZE;
1833 break;
1834
1835 default:
1836 branchlength++;
1837 }
1838 break;
1839
1840 /* Anything else is variable length */
1841
1842 case OP_ANYNL:
1843 case OP_BRAMINZERO:
1844 case OP_BRAPOS:
1845 case OP_BRAPOSZERO:
1846 case OP_BRAZERO:
1847 case OP_CBRAPOS:
1848 case OP_EXTUNI:
1849 case OP_KETRMAX:
1850 case OP_KETRMIN:
1851 case OP_KETRPOS:
1852 case OP_MINPLUS:
1853 case OP_MINPLUSI:
1854 case OP_MINQUERY:
1855 case OP_MINQUERYI:
1856 case OP_MINSTAR:
1857 case OP_MINSTARI:
1858 case OP_MINUPTO:
1859 case OP_MINUPTOI:
1860 case OP_NOTMINPLUS:
1861 case OP_NOTMINPLUSI:
1862 case OP_NOTMINQUERY:
1863 case OP_NOTMINQUERYI:
1864 case OP_NOTMINSTAR:
1865 case OP_NOTMINSTARI:
1866 case OP_NOTMINUPTO:
1867 case OP_NOTMINUPTOI:
1868 case OP_NOTPLUS:
1869 case OP_NOTPLUSI:
1870 case OP_NOTPOSPLUS:
1871 case OP_NOTPOSPLUSI:
1872 case OP_NOTPOSQUERY:
1873 case OP_NOTPOSQUERYI:
1874 case OP_NOTPOSSTAR:
1875 case OP_NOTPOSSTARI:
1876 case OP_NOTPOSUPTO:
1877 case OP_NOTPOSUPTOI:
1878 case OP_NOTQUERY:
1879 case OP_NOTQUERYI:
1880 case OP_NOTSTAR:
1881 case OP_NOTSTARI:
1882 case OP_NOTUPTO:
1883 case OP_NOTUPTOI:
1884 case OP_PLUS:
1885 case OP_PLUSI:
1886 case OP_POSPLUS:
1887 case OP_POSPLUSI:
1888 case OP_POSQUERY:
1889 case OP_POSQUERYI:
1890 case OP_POSSTAR:
1891 case OP_POSSTARI:
1892 case OP_POSUPTO:
1893 case OP_POSUPTOI:
1894 case OP_QUERY:
1895 case OP_QUERYI:
1896 case OP_REF:
1897 case OP_REFI:
1898 case OP_DNREF:
1899 case OP_DNREFI:
1900 case OP_SBRA:
1901 case OP_SBRAPOS:
1902 case OP_SCBRA:
1903 case OP_SCBRAPOS:
1904 case OP_SCOND:
1905 case OP_SKIPZERO:
1906 case OP_STAR:
1907 case OP_STARI:
1908 case OP_TYPEMINPLUS:
1909 case OP_TYPEMINQUERY:
1910 case OP_TYPEMINSTAR:
1911 case OP_TYPEMINUPTO:
1912 case OP_TYPEPLUS:
1913 case OP_TYPEPOSPLUS:
1914 case OP_TYPEPOSQUERY:
1915 case OP_TYPEPOSSTAR:
1916 case OP_TYPEPOSUPTO:
1917 case OP_TYPEQUERY:
1918 case OP_TYPESTAR:
1919 case OP_TYPEUPTO:
1920 case OP_UPTO:
1921 case OP_UPTOI:
1922 return -1;
1923
1924 /* Catch unrecognized opcodes so that when new ones are added they
1925 are not forgotten, as has happened in the past. */
1926
1927 default:
1928 return -4;
1929 }
1930 }
1931 /* Control never gets here */
1932 }
1933
1934
1935
1936 /*************************************************
1937 * Scan compiled regex for specific bracket *
1938 *************************************************/
1939
1940 /* This little function scans through a compiled pattern until it finds a
1941 capturing bracket with the given number, or, if the number is negative, an
1942 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1943 so that it can be called from pcre_study() when finding the minimum matching
1944 length.
1945
1946 Arguments:
1947 code points to start of expression
1948 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
1949 number the required bracket number or negative to find a lookbehind
1950
1951 Returns: pointer to the opcode for the bracket, or NULL if not found
1952 */
1953
1954 const pcre_uchar *
1955 PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
1956 {
1957 for (;;)
1958 {
1959 register pcre_uchar c = *code;
1960
1961 if (c == OP_END) return NULL;
1962
1963 /* XCLASS is used for classes that cannot be represented just by a bit
1964 map. This includes negated single high-valued characters. The length in
1965 the table is zero; the actual length is stored in the compiled code. */
1966
1967 if (c == OP_XCLASS) code += GET(code, 1);
1968
1969 /* Handle recursion */
1970
1971 else if (c == OP_REVERSE)
1972 {
1973 if (number < 0) return (pcre_uchar *)code;
1974 code += PRIV(OP_lengths)[c];
1975 }
1976
1977 /* Handle capturing bracket */
1978
1979 else if (c == OP_CBRA || c == OP_SCBRA ||
1980 c == OP_CBRAPOS || c == OP_SCBRAPOS)
1981 {
1982 int n = (int)GET2(code, 1+LINK_SIZE);
1983 if (n == number) return (pcre_uchar *)code;
1984 code += PRIV(OP_lengths)[c];
1985 }
1986
1987 /* Otherwise, we can get the item's length from the table, except that for
1988 repeated character types, we have to test for \p and \P, which have an extra
1989 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1990 must add in its length. */
1991
1992 else
1993 {
1994 switch(c)
1995 {
1996 case OP_TYPESTAR:
1997 case OP_TYPEMINSTAR:
1998 case OP_TYPEPLUS:
1999 case OP_TYPEMINPLUS:
2000 case OP_TYPEQUERY:
2001 case OP_TYPEMINQUERY:
2002 case OP_TYPEPOSSTAR:
2003 case OP_TYPEPOSPLUS:
2004 case OP_TYPEPOSQUERY:
2005 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2006 break;
2007
2008 case OP_TYPEUPTO:
2009 case OP_TYPEMINUPTO:
2010 case OP_TYPEEXACT:
2011 case OP_TYPEPOSUPTO:
2012 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2013 code += 2;
2014 break;
2015
2016 case OP_MARK:
2017 case OP_PRUNE_ARG:
2018 case OP_SKIP_ARG:
2019 case OP_THEN_ARG:
2020 code += code[1];
2021 break;
2022 }
2023
2024 /* Add in the fixed length from the table */
2025
2026 code += PRIV(OP_lengths)[c];
2027
2028 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2029 a multi-byte character. The length in the table is a minimum, so we have to
2030 arrange to skip the extra bytes. */
2031
2032 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2033 if (utf) switch(c)
2034 {
2035 case OP_CHAR:
2036 case OP_CHARI:
2037 case OP_EXACT:
2038 case OP_EXACTI:
2039 case OP_UPTO:
2040 case OP_UPTOI:
2041 case OP_MINUPTO:
2042 case OP_MINUPTOI:
2043 case OP_POSUPTO:
2044 case OP_POSUPTOI:
2045 case OP_STAR:
2046 case OP_STARI:
2047 case OP_MINSTAR:
2048 case OP_MINSTARI:
2049 case OP_POSSTAR:
2050 case OP_POSSTARI:
2051 case OP_PLUS:
2052 case OP_PLUSI:
2053 case OP_MINPLUS:
2054 case OP_MINPLUSI:
2055 case OP_POSPLUS:
2056 case OP_POSPLUSI:
2057 case OP_QUERY:
2058 case OP_QUERYI:
2059 case OP_MINQUERY:
2060 case OP_MINQUERYI:
2061 case OP_POSQUERY:
2062 case OP_POSQUERYI:
2063 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2064 break;
2065 }
2066 #else
2067 (void)(utf); /* Keep compiler happy by referencing function argument */
2068 #endif
2069 }
2070 }
2071 }
2072
2073
2074
2075 /*************************************************
2076 * Scan compiled regex for recursion reference *
2077 *************************************************/
2078
2079 /* This little function scans through a compiled pattern until it finds an
2080 instance of OP_RECURSE.
2081
2082 Arguments:
2083 code points to start of expression
2084 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2085
2086 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
2087 */
2088
2089 static const pcre_uchar *
2090 find_recurse(const pcre_uchar *code, BOOL utf)
2091 {
2092 for (;;)
2093 {
2094 register pcre_uchar c = *code;
2095 if (c == OP_END) return NULL;
2096 if (c == OP_RECURSE) return code;
2097
2098 /* XCLASS is used for classes that cannot be represented just by a bit
2099 map. This includes negated single high-valued characters. The length in
2100 the table is zero; the actual length is stored in the compiled code. */
2101
2102 if (c == OP_XCLASS) code += GET(code, 1);
2103
2104 /* Otherwise, we can get the item's length from the table, except that for
2105 repeated character types, we have to test for \p and \P, which have an extra
2106 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2107 must add in its length. */
2108
2109 else
2110 {
2111 switch(c)
2112 {
2113 case OP_TYPESTAR:
2114 case OP_TYPEMINSTAR:
2115 case OP_TYPEPLUS:
2116 case OP_TYPEMINPLUS:
2117 case OP_TYPEQUERY:
2118 case OP_TYPEMINQUERY:
2119 case OP_TYPEPOSSTAR:
2120 case OP_TYPEPOSPLUS:
2121 case OP_TYPEPOSQUERY:
2122 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2123 break;
2124
2125 case OP_TYPEPOSUPTO:
2126 case OP_TYPEUPTO:
2127 case OP_TYPEMINUPTO:
2128 case OP_TYPEEXACT:
2129 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2130 code += 2;
2131 break;
2132
2133 case OP_MARK:
2134 case OP_PRUNE_ARG:
2135 case OP_SKIP_ARG:
2136 case OP_THEN_ARG:
2137 code += code[1];
2138 break;
2139 }
2140
2141 /* Add in the fixed length from the table */
2142
2143 code += PRIV(OP_lengths)[c];
2144
2145 /* In UTF-8 mode, opcodes that are followed by a character may be followed
2146 by a multi-byte character. The length in the table is a minimum, so we have
2147 to arrange to skip the extra bytes. */
2148
2149 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2150 if (utf) switch(c)
2151 {
2152 case OP_CHAR:
2153 case OP_CHARI:
2154 case OP_NOT:
2155 case OP_NOTI:
2156 case OP_EXACT:
2157 case OP_EXACTI:
2158 case OP_NOTEXACT:
2159 case OP_NOTEXACTI:
2160 case OP_UPTO:
2161 case OP_UPTOI:
2162 case OP_NOTUPTO:
2163 case OP_NOTUPTOI:
2164 case OP_MINUPTO:
2165 case OP_MINUPTOI:
2166 case OP_NOTMINUPTO:
2167 case OP_NOTMINUPTOI:
2168 case OP_POSUPTO:
2169 case OP_POSUPTOI:
2170 case OP_NOTPOSUPTO:
2171 case OP_NOTPOSUPTOI:
2172 case OP_STAR:
2173 case OP_STARI:
2174 case OP_NOTSTAR:
2175 case OP_NOTSTARI:
2176 case OP_MINSTAR:
2177 case OP_MINSTARI:
2178 case OP_NOTMINSTAR:
2179 case OP_NOTMINSTARI:
2180 case OP_POSSTAR:
2181 case OP_POSSTARI:
2182 case OP_NOTPOSSTAR:
2183 case OP_NOTPOSSTARI:
2184 case OP_PLUS:
2185 case OP_PLUSI:
2186 case OP_NOTPLUS:
2187 case OP_NOTPLUSI:
2188 case OP_MINPLUS:
2189 case OP_MINPLUSI:
2190 case OP_NOTMINPLUS:
2191 case OP_NOTMINPLUSI:
2192 case OP_POSPLUS:
2193 case OP_POSPLUSI:
2194 case OP_NOTPOSPLUS:
2195 case OP_NOTPOSPLUSI:
2196 case OP_QUERY:
2197 case OP_QUERYI:
2198 case OP_NOTQUERY:
2199 case OP_NOTQUERYI:
2200 case OP_MINQUERY:
2201 case OP_MINQUERYI:
2202 case OP_NOTMINQUERY:
2203 case OP_NOTMINQUERYI:
2204 case OP_POSQUERY:
2205 case OP_POSQUERYI:
2206 case OP_NOTPOSQUERY:
2207 case OP_NOTPOSQUERYI:
2208 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2209 break;
2210 }
2211 #else
2212 (void)(utf); /* Keep compiler happy by referencing function argument */
2213 #endif
2214 }
2215 }
2216 }
2217
2218
2219
2220 /*************************************************
2221 * Scan compiled branch for non-emptiness *
2222 *************************************************/
2223
2224 /* This function scans through a branch of a compiled pattern to see whether it
2225 can match the empty string or not. It is called from could_be_empty()
2226 below and from compile_branch() when checking for an unlimited repeat of a
2227 group that can match nothing. Note that first_significant_code() skips over
2228 backward and negative forward assertions when its final argument is TRUE. If we
2229 hit an unclosed bracket, we return "empty" - this means we've struck an inner
2230 bracket whose current branch will already have been scanned.
2231
2232 Arguments:
2233 code points to start of search
2234 endcode points to where to stop
2235 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2236 cd contains pointers to tables etc.
2237 recurses chain of recurse_check to catch mutual recursion
2238
2239 Returns: TRUE if what is matched could be empty
2240 */
2241
2242 typedef struct recurse_check {
2243 struct recurse_check *prev;
2244 const pcre_uchar *group;
2245 } recurse_check;
2246
2247 static BOOL
2248 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2249 BOOL utf, compile_data *cd, recurse_check *recurses)
2250 {
2251 register pcre_uchar c;
2252 recurse_check this_recurse;
2253
2254 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2255 code < endcode;
2256 code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2257 {
2258 const pcre_uchar *ccode;
2259
2260 c = *code;
2261
2262 /* Skip over forward assertions; the other assertions are skipped by
2263 first_significant_code() with a TRUE final argument. */
2264
2265 if (c == OP_ASSERT)
2266 {
2267 do code += GET(code, 1); while (*code == OP_ALT);
2268 c = *code;
2269 continue;
2270 }
2271
2272 /* For a recursion/subroutine call, if its end has been reached, which
2273 implies a backward reference subroutine call, we can scan it. If it's a
2274 forward reference subroutine call, we can't. To detect forward reference
2275 we have to scan up the list that is kept in the workspace. This function is
2276 called only when doing the real compile, not during the pre-compile that
2277 measures the size of the compiled pattern. */
2278
2279 if (c == OP_RECURSE)
2280 {
2281 const pcre_uchar *scode = cd->start_code + GET(code, 1);
2282 BOOL empty_branch;
2283
2284 /* Test for forward reference or uncompleted reference. This is disabled
2285 when called to scan a completed pattern by setting cd->start_workspace to
2286 NULL. */
2287
2288 if (cd->start_workspace != NULL)
2289 {
2290 const pcre_uchar *tcode;
2291 for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2292 if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2293 if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
2294 }
2295
2296 /* If we are scanning a completed pattern, there are no forward references
2297 and all groups are complete. We need to detect whether this is a recursive
2298 call, as otherwise there will be an infinite loop. If it is a recursion,
2299 just skip over it. Simple recursions are easily detected. For mutual
2300 recursions we keep a chain on the stack. */
2301
2302 else
2303 {
2304 recurse_check *r = recurses;
2305 const pcre_uchar *endgroup = scode;
2306
2307 do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2308 if (code >= scode && code <= endgroup) continue; /* Simple recursion */
2309
2310 for (r = recurses; r != NULL; r = r->prev)
2311 if (r->group == scode) break;
2312 if (r != NULL) continue; /* Mutual recursion */
2313 }
2314
2315 /* Completed reference; scan the referenced group, remembering it on the
2316 stack chain to detect mutual recursions. */
2317
2318 empty_branch = FALSE;
2319 this_recurse.prev = recurses;
2320 this_recurse.group = scode;
2321
2322 do
2323 {
2324 if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
2325 {
2326 empty_branch = TRUE;
2327 break;
2328 }
2329 scode += GET(scode, 1);
2330 }
2331 while (*scode == OP_ALT);
2332
2333 if (!empty_branch) return FALSE; /* All branches are non-empty */
2334 continue;
2335 }
2336
2337 /* Groups with zero repeats can of course be empty; skip them. */
2338
2339 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2340 c == OP_BRAPOSZERO)
2341 {
2342 code += PRIV(OP_lengths)[c];
2343 do code += GET(code, 1); while (*code == OP_ALT);
2344 c = *code;
2345 continue;
2346 }
2347
2348 /* A nested group that is already marked as "could be empty" can just be
2349 skipped. */
2350
2351 if (c == OP_SBRA || c == OP_SBRAPOS ||
2352 c == OP_SCBRA || c == OP_SCBRAPOS)
2353 {
2354 do code += GET(code, 1); while (*code == OP_ALT);
2355 c = *code;
2356 continue;
2357 }
2358
2359 /* For other groups, scan the branches. */
2360
2361 if (c == OP_BRA || c == OP_BRAPOS ||
2362 c == OP_CBRA || c == OP_CBRAPOS ||
2363 c == OP_ONCE || c == OP_ONCE_NC ||
2364 c == OP_COND)
2365 {
2366 BOOL empty_branch;
2367 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
2368
2369 /* If a conditional group has only one branch, there is a second, implied,
2370 empty branch, so just skip over the conditional, because it could be empty.
2371 Otherwise, scan the individual branches of the group. */
2372
2373 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2374 code += GET(code, 1);
2375 else
2376 {
2377 empty_branch = FALSE;
2378 do
2379 {
2380 if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd, NULL))
2381 empty_branch = TRUE;
2382 code += GET(code, 1);
2383 }
2384 while (*code == OP_ALT);
2385 if (!empty_branch) return FALSE; /* All branches are non-empty */
2386 }
2387
2388 c = *code;
2389 continue;
2390 }
2391
2392 /* Handle the other opcodes */
2393
2394 switch (c)
2395 {
2396 /* Check for quantifiers after a class. XCLASS is used for classes that
2397 cannot be represented just by a bit map. This includes negated single
2398 high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2399 actual length is stored in the compiled code, so we must update "code"
2400 here. */
2401
2402 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2403 case OP_XCLASS:
2404 ccode = code += GET(code, 1);
2405 goto CHECK_CLASS_REPEAT;
2406 #endif
2407
2408 case OP_CLASS:
2409 case OP_NCLASS:
2410 ccode = code + PRIV(OP_lengths)[OP_CLASS];
2411
2412 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2413 CHECK_CLASS_REPEAT:
2414 #endif
2415
2416 switch (*ccode)
2417 {
2418 case OP_CRSTAR: /* These could be empty; continue */
2419 case OP_CRMINSTAR:
2420 case OP_CRQUERY:
2421 case OP_CRMINQUERY:
2422 break;
2423
2424 default: /* Non-repeat => class must match */
2425 case OP_CRPLUS: /* These repeats aren't empty */
2426 case OP_CRMINPLUS:
2427 return FALSE;
2428
2429 case OP_CRRANGE:
2430 case OP_CRMINRANGE:
2431 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2432 break;
2433 }
2434 break;
2435
2436 /* Opcodes that must match a character */
2437
2438 case OP_ANY:
2439 case OP_ALLANY:
2440 case OP_ANYBYTE:
2441
2442 case OP_PROP:
2443 case OP_NOTPROP:
2444 case OP_ANYNL:
2445
2446 case OP_NOT_HSPACE:
2447 case OP_HSPACE:
2448 case OP_NOT_VSPACE:
2449 case OP_VSPACE:
2450 case OP_EXTUNI:
2451
2452 case OP_NOT_DIGIT:
2453 case OP_DIGIT:
2454 case OP_NOT_WHITESPACE:
2455 case OP_WHITESPACE:
2456 case OP_NOT_WORDCHAR:
2457 case OP_WORDCHAR:
2458
2459 case OP_CHAR:
2460 case OP_CHARI:
2461 case OP_NOT:
2462 case OP_NOTI:
2463
2464 case OP_PLUS:
2465 case OP_PLUSI:
2466 case OP_MINPLUS:
2467 case OP_MINPLUSI:
2468
2469 case OP_NOTPLUS:
2470 case OP_NOTPLUSI:
2471 case OP_NOTMINPLUS:
2472 case OP_NOTMINPLUSI:
2473
2474 case OP_POSPLUS:
2475 case OP_POSPLUSI:
2476 case OP_NOTPOSPLUS:
2477 case OP_NOTPOSPLUSI:
2478
2479 case OP_EXACT:
2480 case OP_EXACTI:
2481 case OP_NOTEXACT:
2482 case OP_NOTEXACTI:
2483
2484 case OP_TYPEPLUS:
2485 case OP_TYPEMINPLUS:
2486 case OP_TYPEPOSPLUS:
2487 case OP_TYPEEXACT:
2488
2489 return FALSE;
2490
2491 /* These are going to continue, as they may be empty, but we have to
2492 fudge the length for the \p and \P cases. */
2493
2494 case OP_TYPESTAR:
2495 case OP_TYPEMINSTAR:
2496 case OP_TYPEPOSSTAR:
2497 case OP_TYPEQUERY:
2498 case OP_TYPEMINQUERY:
2499 case OP_TYPEPOSQUERY:
2500 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2501 break;
2502
2503 /* Same for these */
2504
2505 case OP_TYPEUPTO:
2506 case OP_TYPEMINUPTO:
2507 case OP_TYPEPOSUPTO:
2508 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2509 code += 2;
2510 break;
2511
2512 /* End of branch */
2513
2514 case OP_KET:
2515 case OP_KETRMAX:
2516 case OP_KETRMIN:
2517 case OP_KETRPOS:
2518 case OP_ALT:
2519 return TRUE;
2520
2521 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2522 MINUPTO, and POSUPTO and their caseless and negative versions may be
2523 followed by a multibyte character. */
2524
2525 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2526 case OP_STAR:
2527 case OP_STARI:
2528 case OP_NOTSTAR:
2529 case OP_NOTSTARI:
2530
2531 case OP_MINSTAR:
2532 case OP_MINSTARI:
2533 case OP_NOTMINSTAR:
2534 case OP_NOTMINSTARI:
2535
2536 case OP_POSSTAR:
2537 case OP_POSSTARI:
2538 case OP_NOTPOSSTAR:
2539 case OP_NOTPOSSTARI:
2540
2541 case OP_QUERY:
2542 case OP_QUERYI:
2543 case OP_NOTQUERY:
2544 case OP_NOTQUERYI:
2545
2546 case OP_MINQUERY:
2547 case OP_MINQUERYI:
2548 case OP_NOTMINQUERY:
2549 case OP_NOTMINQUERYI:
2550
2551 case OP_POSQUERY:
2552 case OP_POSQUERYI:
2553 case OP_NOTPOSQUERY:
2554 case OP_NOTPOSQUERYI:
2555
2556 if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2557 break;
2558
2559 case OP_UPTO:
2560 case OP_UPTOI:
2561 case OP_NOTUPTO:
2562 case OP_NOTUPTOI:
2563
2564 case OP_MINUPTO:
2565 case OP_MINUPTOI:
2566 case OP_NOTMINUPTO:
2567 case OP_NOTMINUPTOI:
2568
2569 case OP_POSUPTO:
2570 case OP_POSUPTOI:
2571 case OP_NOTPOSUPTO:
2572 case OP_NOTPOSUPTOI:
2573
2574 if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2575 break;
2576 #endif
2577
2578 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2579 string. */
2580
2581 case OP_MARK:
2582 case OP_PRUNE_ARG:
2583 case OP_SKIP_ARG:
2584 case OP_THEN_ARG:
2585 code += code[1];
2586 break;
2587
2588 /* None of the remaining opcodes are required to match a character. */
2589
2590 default:
2591 break;
2592 }
2593 }
2594
2595 return TRUE;
2596 }
2597
2598
2599
2600 /*************************************************
2601 * Scan compiled regex for non-emptiness *
2602 *************************************************/
2603
2604 /* This function is called to check for left recursive calls. We want to check
2605 the current branch of the current pattern to see if it could match the empty
2606 string. If it could, we must look outwards for branches at other levels,
2607 stopping when we pass beyond the bracket which is the subject of the recursion.
2608 This function is called only during the real compile, not during the
2609 pre-compile.
2610
2611 Arguments:
2612 code points to start of the recursion
2613 endcode points to where to stop (current RECURSE item)
2614 bcptr points to the chain of current (unclosed) branch starts
2615 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2616 cd pointers to tables etc
2617
2618 Returns: TRUE if what is matched could be empty
2619 */
2620
2621 static BOOL
2622 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2623 branch_chain *bcptr, BOOL utf, compile_data *cd)
2624 {
2625 while (bcptr != NULL && bcptr->current_branch >= code)
2626 {
2627 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL))
2628 return FALSE;
2629 bcptr = bcptr->outer;
2630 }
2631 return TRUE;
2632 }
2633
2634
2635
2636 /*************************************************
2637 * Base opcode of repeated opcodes *
2638 *************************************************/
2639
2640 /* Returns the base opcode for repeated single character type opcodes. If the
2641 opcode is not a repeated character type, it returns with the original value.
2642
2643 Arguments: c opcode
2644 Returns: base opcode for the type
2645 */
2646
2647 static pcre_uchar
2648 get_repeat_base(pcre_uchar c)
2649 {
2650 return (c > OP_TYPEPOSUPTO)? c :
2651 (c >= OP_TYPESTAR)? OP_TYPESTAR :
2652 (c >= OP_NOTSTARI)? OP_NOTSTARI :
2653 (c >= OP_NOTSTAR)? OP_NOTSTAR :
2654 (c >= OP_STARI)? OP_STARI :
2655 OP_STAR;
2656 }
2657
2658
2659
2660 #ifdef SUPPORT_UCP
2661 /*************************************************
2662 * Check a character and a property *
2663 *************************************************/
2664
2665 /* This function is called by check_auto_possessive() when a property item
2666 is adjacent to a fixed character.
2667
2668 Arguments:
2669 c the character
2670 ptype the property type
2671 pdata the data for the type
2672 negated TRUE if it's a negated property (\P or \p{^)
2673
2674 Returns: TRUE if auto-possessifying is OK
2675 */
2676
2677 static BOOL
2678 check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
2679 BOOL negated)
2680 {
2681 const pcre_uint32 *p;
2682 const ucd_record *prop = GET_UCD(c);
2683
2684 switch(ptype)
2685 {
2686 case PT_LAMP:
2687 return (prop->chartype == ucp_Lu ||
2688 prop->chartype == ucp_Ll ||
2689 prop->chartype == ucp_Lt) == negated;
2690
2691 case PT_GC:
2692 return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2693
2694 case PT_PC:
2695 return (pdata == prop->chartype) == negated;
2696
2697 case PT_SC:
2698 return (pdata == prop->script) == negated;
2699
2700 /* These are specials */
2701
2702 case PT_ALNUM:
2703 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2704 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2705
2706 /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
2707 means that Perl space and POSIX space are now identical. PCRE was changed
2708 at release 8.34. */
2709
2710 case PT_SPACE: /* Perl space */
2711 case PT_PXSPACE: /* POSIX space */
2712 return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2713 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2714 c == CHAR_FF || c == CHAR_CR)
2715 == negated;
2716
2717 case PT_WORD:
2718 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2719 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2720 c == CHAR_UNDERSCORE) == negated;
2721
2722 case PT_CLIST:
2723 p = PRIV(ucd_caseless_sets) + prop->caseset;
2724 for (;;)
2725 {
2726 if (c < *p) return !negated;
2727 if (c == *p++) return negated;
2728 }
2729 break; /* Control never reaches here */
2730 }
2731
2732 return FALSE;
2733 }
2734 #endif /* SUPPORT_UCP */
2735
2736
2737
2738 /*************************************************
2739 * Fill the character property list *
2740 *************************************************/
2741
2742 /* Checks whether the code points to an opcode that can take part in auto-
2743 possessification, and if so, fills a list with its properties.
2744
2745 Arguments:
2746 code points to start of expression
2747 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2748 fcc points to case-flipping table
2749 list points to output list
2750 list[0] will be filled with the opcode
2751 list[1] will be non-zero if this opcode
2752 can match an empty character string
2753 list[2..7] depends on the opcode
2754
2755 Returns: points to the start of the next opcode if *code is accepted
2756 NULL if *code is not accepted
2757 */
2758
2759 static const pcre_uchar *
2760 get_chr_property_list(const pcre_uchar *code, BOOL utf,
2761 const pcre_uint8 *fcc, pcre_uint32 *list)
2762 {
2763 pcre_uchar c = *code;
2764 const pcre_uchar *end;
2765 const pcre_uint32 *clist_src;
2766 pcre_uint32 *clist_dest;
2767 pcre_uint32 chr;
2768 pcre_uchar base;
2769
2770 list[0] = c;
2771 list[1] = FALSE;
2772 code++;
2773
2774 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
2775 {
2776 base = get_repeat_base(c);
2777 c -= (base - OP_STAR);
2778
2779 if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
2780 code += IMM2_SIZE;
2781
2782 list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
2783
2784 switch(base)
2785 {
2786 case OP_STAR:
2787 list[0] = OP_CHAR;
2788 break;
2789
2790 case OP_STARI:
2791 list[0] = OP_CHARI;
2792 break;
2793
2794 case OP_NOTSTAR:
2795 list[0] = OP_NOT;
2796 break;
2797
2798 case OP_NOTSTARI:
2799 list[0] = OP_NOTI;
2800 break;
2801
2802 case OP_TYPESTAR:
2803 list[0] = *code;
2804 code++;
2805 break;
2806 }
2807 c = list[0];
2808 }
2809
2810 switch(c)
2811 {
2812 case OP_NOT_DIGIT:
2813 case OP_DIGIT:
2814 case OP_NOT_WHITESPACE:
2815 case OP_WHITESPACE:
2816 case OP_NOT_WORDCHAR:
2817 case OP_WORDCHAR:
2818 case OP_ANY:
2819 case OP_ALLANY:
2820 case OP_ANYNL:
2821 case OP_NOT_HSPACE:
2822 case OP_HSPACE:
2823 case OP_NOT_VSPACE:
2824 case OP_VSPACE:
2825 case OP_EXTUNI:
2826 case OP_EODN:
2827 case OP_EOD:
2828 case OP_DOLL:
2829 case OP_DOLLM:
2830 return code;
2831
2832 case OP_CHAR:
2833 case OP_NOT:
2834 GETCHARINCTEST(chr, code);
2835 list[2] = chr;
2836 list[3] = NOTACHAR;
2837 return code;
2838
2839 case OP_CHARI:
2840 case OP_NOTI:
2841 list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
2842 GETCHARINCTEST(chr, code);
2843 list[2] = chr;
2844
2845 #ifdef SUPPORT_UCP
2846 if (chr < 128 || (chr < 256 && !utf))
2847 list[3] = fcc[chr];
2848 else
2849 list[3] = UCD_OTHERCASE(chr);
2850 #elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
2851 list[3] = (chr < 256) ? fcc[chr] : chr;
2852 #else
2853 list[3] = fcc[chr];
2854 #endif
2855
2856 /* The othercase might be the same value. */
2857
2858 if (chr == list[3])
2859 list[3] = NOTACHAR;
2860 else
2861 list[4] = NOTACHAR;
2862 return code;
2863
2864 #ifdef SUPPORT_UCP
2865 case OP_PROP:
2866 case OP_NOTPROP:
2867 if (code[0] != PT_CLIST)
2868 {
2869 list[2] = code[0];
2870 list[3] = code[1];
2871 return code + 2;
2872 }
2873
2874 /* Convert only if we have enough space. */
2875
2876 clist_src = PRIV(ucd_caseless_sets) + code[1];
2877 clist_dest = list + 2;
2878 code += 2;
2879
2880 do {
2881 if (clist_dest >= list + 8)
2882 {
2883 /* Early return if there is not enough space. This should never
2884 happen, since all clists are shorter than 5 character now. */
2885 list[2] = code[0];
2886 list[3] = code[1];
2887 return code;
2888 }
2889 *clist_dest++ = *clist_src;
2890 }
2891 while(*clist_src++ != NOTACHAR);
2892
2893 /* All characters are stored. The terminating NOTACHAR
2894 is copied from the clist itself. */
2895
2896 list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
2897 return code;
2898 #endif
2899
2900 case OP_NCLASS:
2901 case OP_CLASS:
2902 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2903 case OP_XCLASS:
2904
2905 if (c == OP_XCLASS)
2906 end = code + GET(code, 0);
2907 else
2908 #endif
2909 end = code + 32 / sizeof(pcre_uchar);
2910
2911 switch(*end)
2912 {
2913 case OP_CRSTAR:
2914 case OP_CRMINSTAR:
2915 case OP_CRQUERY:
2916 case OP_CRMINQUERY:
2917 list[1] = TRUE;
2918 end++;
2919 break;
2920
2921 case OP_CRRANGE:
2922 case OP_CRMINRANGE:
2923 list[1] = (GET2(end, 1) == 0);
2924 end += 1 + 2 * IMM2_SIZE;
2925 break;
2926 }
2927 list[2] = end - code;
2928 return end;
2929 }
2930 return NULL; /* Opcode not accepted */
2931 }
2932
2933
2934
2935 /*************************************************
2936 * Scan further character sets for match *
2937 *************************************************/
2938
2939 /* Checks whether the base and the current opcode have a common character, in
2940 which case the base cannot be possessified.
2941
2942 Arguments:
2943 code points to the byte code
2944 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2945 cd static compile data
2946 base_list the data list of the base opcode
2947
2948 Returns: TRUE if the auto-possessification is possible
2949 */
2950
2951 static BOOL
2952 compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
2953 const pcre_uint32* base_list)
2954 {
2955 pcre_uchar c;
2956 pcre_uint32 list[8];
2957 const pcre_uint32* chr_ptr;
2958 const pcre_uint32* ochr_ptr;
2959 const pcre_uint32* list_ptr;
2960 const pcre_uchar *next_code;
2961 pcre_uint32 chr;
2962
2963 /* Note: the base_list[1] contains whether the current opcode has greedy
2964 (represented by a non-zero value) quantifier. This is a different from
2965 other character type lists, which stores here that the character iterator
2966 matches to an empty string (also represented by a non-zero value). */
2967
2968 for(;;)
2969 {
2970 c = *code;
2971
2972 /* Skip over callouts */
2973
2974 if (c == OP_CALLOUT)
2975 {
2976 code += PRIV(OP_lengths)[c];
2977 continue;
2978 }
2979
2980 if (c == OP_ALT)
2981 {
2982 do code += GET(code, 1); while (*code == OP_ALT);
2983 c = *code;
2984 }
2985
2986 switch(c)
2987 {
2988 case OP_END:
2989 case OP_KETRPOS:
2990 /* TRUE only in greedy case. The non-greedy case could be replaced by
2991 an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
2992 uses more memory, which we cannot get at this stage.) */
2993
2994 return base_list[1] != 0;
2995
2996 case OP_KET:
2997 /* If the bracket is capturing, and referenced by an OP_RECURSE, or
2998 it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
2999 cannot be converted to a possessive form. */
3000
3001 if (base_list[1] == 0) return FALSE;
3002
3003 switch(*(code - GET(code, 1)))
3004 {
3005 case OP_ASSERT:
3006 case OP_ASSERT_NOT:
3007 case OP_ASSERTBACK:
3008 case OP_ASSERTBACK_NOT:
3009 case OP_ONCE:
3010 case OP_ONCE_NC:
3011 /* Atomic sub-patterns and assertions can always auto-possessify their
3012 last iterator. */
3013 return TRUE;
3014 }
3015
3016 code += PRIV(OP_lengths)[c];
3017 continue;
3018
3019 case OP_ONCE:
3020 case OP_ONCE_NC:
3021 case OP_BRA:
3022 case OP_CBRA:
3023 next_code = code;
3024 do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3025
3026 /* We do not support repeated brackets, because they can lead to
3027 infinite recursion. */
3028
3029 if (*next_code != OP_KET) return FALSE;
3030
3031 next_code = code + GET(code, 1);
3032 code += PRIV(OP_lengths)[c];
3033
3034 while (*next_code == OP_ALT)
3035 {
3036 if (!compare_opcodes(code, utf, cd, base_list)) return FALSE;
3037 code = next_code + 1 + LINK_SIZE;
3038 next_code += GET(next_code, 1);
3039 }
3040 continue;
3041
3042 case OP_BRAZERO:
3043 case OP_BRAMINZERO:
3044
3045 next_code = code + 1;
3046 if (*next_code != OP_BRA && *next_code != OP_CBRA)
3047 return FALSE;
3048
3049 do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3050
3051 /* We do not support repeated brackets, because they can lead to
3052 infinite recursion. */
3053 if (*next_code != OP_KET) return FALSE;
3054
3055 /* The bracket content will be checked by the
3056 OP_BRA/OP_CBRA case above. */
3057 next_code += 1 + LINK_SIZE;
3058 if (!compare_opcodes(next_code, utf, cd, base_list)) return FALSE;
3059
3060 code += PRIV(OP_lengths)[c];
3061 continue;
3062 }
3063
3064 /* Check for a supported opcode, and load its properties. */
3065
3066 code = get_chr_property_list(code, utf, cd->fcc, list);
3067 if (code == NULL) return FALSE; /* Unsupported */
3068
3069 /* If either opcode is a small character list, set pointers for comparing
3070 characters from that list with another list, or with a property. */
3071
3072 if (base_list[0] == OP_CHAR)
3073 {
3074 chr_ptr = base_list + 2;
3075 list_ptr = list;
3076 }
3077 else if (list[0] == OP_CHAR)
3078 {
3079 chr_ptr = list + 2;
3080 list_ptr = base_list;
3081 }
3082
3083 /* Some property combinations also acceptable. Unicode property opcodes are
3084 processed specially; the rest can be handled with a lookup table. */
3085
3086 else
3087 {
3088 pcre_uint32 leftop, rightop;
3089
3090 if (list[1] != 0) return FALSE; /* Must match at least one character */
3091 leftop = base_list[0];
3092 rightop = list[0];
3093
3094 #ifdef SUPPORT_UCP
3095 if (leftop == OP_PROP || leftop == OP_NOTPROP)
3096 {
3097 if (rightop == OP_EOD) return TRUE;
3098 if (rightop == OP_PROP || rightop == OP_NOTPROP)
3099 {
3100 int n;
3101 const pcre_uint8 *p;
3102 BOOL same = leftop == rightop;
3103 BOOL lisprop = leftop == OP_PROP;
3104 BOOL risprop = rightop == OP_PROP;
3105 BOOL bothprop = lisprop && risprop;
3106
3107 /* There's a table that specifies how each combination is to be
3108 processed:
3109 0 Always return FALSE (never auto-possessify)
3110 1 Character groups are distinct (possessify if both are OP_PROP)
3111 2 Check character categories in the same group (general or particular)
3112 3 Return TRUE if the two opcodes are not the same
3113 ... see comments below
3114 */
3115
3116 n = propposstab[base_list[2]][list[2]];
3117 switch(n)
3118 {
3119 case 0: return FALSE;
3120 case 1: return bothprop;
3121 case 2: return (base_list[3] == list[3]) != same;
3122 case 3: return !same;
3123
3124 case 4: /* Left general category, right particular category */
3125 return risprop && catposstab[base_list[3]][list[3]] == same;
3126
3127 case 5: /* Right general category, left particular category */
3128 return lisprop && catposstab[list[3]][base_list[3]] == same;
3129
3130 /* This code is logically tricky. Think hard before fiddling with it.
3131 The posspropstab table has four entries per row. Each row relates to
3132 one of PCRE's special properties such as ALNUM or SPACE or WORD.
3133 Only WORD actually needs all four entries, but using repeats for the
3134 others means they can all use the same code below.
3135
3136 The first two entries in each row are Unicode general categories, and
3137 apply always, because all the characters they include are part of the
3138 PCRE character set. The third and fourth entries are a general and a
3139 particular category, respectively, that include one or more relevant
3140 characters. One or the other is used, depending on whether the check
3141 is for a general or a particular category. However, in both cases the
3142 category contains more characters than the specials that are defined
3143 for the property being tested against. Therefore, it cannot be used
3144 in a NOTPROP case.
3145
3146 Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3147 Underscore is covered by ucp_P or ucp_Po. */
3148
3149 case 6: /* Left alphanum vs right general category */
3150 case 7: /* Left space vs right general category */
3151 case 8: /* Left word vs right general category */
3152 p = posspropstab[n-6];
3153 return risprop && lisprop ==
3154 (list[3] != p[0] &&
3155 list[3] != p[1] &&
3156 (list[3] != p[2] || !lisprop));
3157
3158 case 9: /* Right alphanum vs left general category */
3159 case 10: /* Right space vs left general category */
3160 case 11: /* Right word vs left general category */
3161 p = posspropstab[n-9];
3162 return lisprop && risprop ==
3163 (base_list[3] != p[0] &&
3164 base_list[3] != p[1] &&
3165 (base_list[3] != p[2] || !risprop));
3166
3167 case 12: /* Left alphanum vs right particular category */
3168 case 13: /* Left space vs right particular category */
3169 case 14: /* Left word vs right particular category */
3170 p = posspropstab[n-12];
3171 return risprop && lisprop ==
3172 (catposstab[p[0]][list[3]] &&
3173 catposstab[p[1]][list[3]] &&
3174 (list[3] != p[3] || !lisprop));
3175
3176 case 15: /* Right alphanum vs left particular category */
3177 case 16: /* Right space vs left particular category */
3178 case 17: /* Right word vs left particular category */
3179 p = posspropstab[n-15];
3180 return lisprop && risprop ==
3181 (catposstab[p[0]][base_list[3]] &&
3182 catposstab[p[1]][base_list[3]] &&
3183 (base_list[3] != p[3] || !risprop));
3184 }
3185 }
3186 return FALSE;
3187 }
3188
3189 else
3190 #endif /* SUPPORT_UCP */
3191
3192 return leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3193 rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3194 autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3195 }
3196
3197 /* Control reaches here only if one of the items is a small character list.
3198 All characters are checked against the other side. */
3199
3200 do
3201 {
3202 chr = *chr_ptr;
3203
3204 switch(list_ptr[0])
3205 {
3206 case OP_CHAR:
3207 ochr_ptr = list_ptr + 2;
3208 do
3209 {
3210 if (chr == *ochr_ptr) return FALSE;
3211 ochr_ptr++;
3212 }
3213 while(*ochr_ptr != NOTACHAR);
3214 break;
3215
3216 case OP_NOT:
3217 ochr_ptr = list_ptr + 2;
3218 do
3219 {
3220 if (chr == *ochr_ptr)
3221 break;
3222 ochr_ptr++;
3223 }
3224 while(*ochr_ptr != NOTACHAR);
3225 if (*ochr_ptr == NOTACHAR) return FALSE; /* Not found */
3226 break;
3227
3228 /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
3229 set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3230
3231 case OP_DIGIT:
3232 if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
3233 break;
3234
3235 case OP_NOT_DIGIT:
3236 if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
3237 break;
3238
3239 case OP_WHITESPACE:
3240 if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
3241 break;
3242
3243 case OP_NOT_WHITESPACE:
3244 if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
3245 break;
3246
3247 case OP_WORDCHAR:
3248 if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
3249 break;
3250
3251 case OP_NOT_WORDCHAR:
3252 if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
3253 break;
3254
3255 case OP_HSPACE:
3256 switch(chr)
3257 {
3258 HSPACE_CASES: return FALSE;
3259 default: break;
3260 }
3261 break;
3262
3263 case OP_NOT_HSPACE:
3264 switch(chr)
3265 {
3266 HSPACE_CASES: break;
3267 default: return FALSE;
3268 }
3269 break;
3270
3271 case OP_ANYNL:
3272 case OP_VSPACE:
3273 switch(chr)
3274 {
3275 VSPACE_CASES: return FALSE;
3276 default: break;
3277 }
3278 break;
3279
3280 case OP_NOT_VSPACE:
3281 switch(chr)
3282 {
3283 VSPACE_CASES: break;
3284 default: return FALSE;
3285 }
3286 break;
3287
3288 case OP_DOLL:
3289 case OP_EODN:
3290 switch (chr)
3291 {
3292 case CHAR_CR:
3293 case CHAR_LF:
3294 case CHAR_VT:
3295 case CHAR_FF:
3296 case CHAR_NEL:
3297 #ifndef EBCDIC
3298 case 0x2028:
3299 case 0x2029:
3300 #endif /* Not EBCDIC */
3301 return FALSE;
3302 }
3303 break;
3304
3305 case OP_EOD: /* Can always possessify before \z */
3306 break;
3307
3308 case OP_PROP:
3309 case OP_NOTPROP:
3310 if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3311 list_ptr[0] == OP_NOTPROP))
3312 return FALSE;
3313 break;
3314
3315 /* The class comparisons work only when the class is the second item
3316 of the pair, because there are at present no possessive forms of the
3317 class opcodes. Note also that the "code" variable that is used below
3318 points after the second item, and that the pointer for the first item
3319 is not available, so even if there were possessive forms of the class
3320 opcodes, the correct comparison could not be done. */
3321
3322 case OP_NCLASS:
3323 if (chr > 255) return FALSE;
3324 /* Fall through */
3325
3326 case OP_CLASS:
3327 if (list_ptr != list) return FALSE; /* Class is first opcode */
3328 if (chr > 255) break;
3329 if ((((pcre_uint8 *)(code - list_ptr[2]))[chr >> 3] & (1 << (chr & 7))) != 0)
3330 return FALSE;
3331 break;
3332
3333 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3334 case OP_XCLASS:
3335 if (list_ptr != list) return FALSE; /* Class is first opcode */
3336 if (PRIV(xclass)(chr, code - list_ptr[2] + LINK_SIZE, utf))
3337 return FALSE;
3338 break;
3339 #endif
3340
3341 default:
3342 return FALSE;
3343 }
3344
3345 chr_ptr++;
3346 }
3347 while(*chr_ptr != NOTACHAR);
3348
3349 /* At least one character must be matched from this opcode. */
3350
3351 if (list[1] == 0) return TRUE;
3352 }
3353
3354 return FALSE;
3355 }
3356
3357
3358
3359 /*************************************************
3360 * Scan compiled regex for auto-possession *
3361 *************************************************/
3362
3363 /* Replaces single character iterations with their possessive alternatives
3364 if appropriate. This function modifies the compiled opcode!
3365
3366 Arguments:
3367 code points to start of the byte code
3368 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3369 cd static compile data
3370
3371 Returns: nothing
3372 */
3373
3374 static void
3375 auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
3376 {
3377 register pcre_uchar c;
3378 const pcre_uchar *end;
3379 pcre_uint32 list[8];
3380
3381 for (;;)
3382 {
3383 c = *code;
3384
3385 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
3386 {
3387 c -= get_repeat_base(c) - OP_STAR;
3388 end = (c <= OP_MINUPTO) ?
3389 get_chr_property_list(code, utf, cd->fcc, list) : NULL;
3390 list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3391
3392 if (end != NULL && compare_opcodes(end, utf, cd, list))
3393 {
3394 switch(c)
3395 {
3396 case OP_STAR:
3397 *code += OP_POSSTAR - OP_STAR;
3398 break;
3399
3400 case OP_MINSTAR:
3401 *code += OP_POSSTAR - OP_MINSTAR;
3402 break;
3403
3404 case OP_PLUS:
3405 *code += OP_POSPLUS - OP_PLUS;
3406 break;
3407
3408 case OP_MINPLUS:
3409 *code += OP_POSPLUS - OP_MINPLUS;
3410 break;
3411
3412 case OP_QUERY:
3413 *code += OP_POSQUERY - OP_QUERY;
3414 break;
3415
3416 case OP_MINQUERY:
3417 *code += OP_POSQUERY - OP_MINQUERY;
3418 break;
3419
3420 case OP_UPTO:
3421 *code += OP_POSUPTO - OP_UPTO;
3422 break;
3423
3424 case OP_MINUPTO:
3425 *code += OP_MINUPTO - OP_UPTO;
3426 break;
3427 }
3428 }
3429 c = *code;
3430 }
3431
3432 switch(c)
3433 {
3434 case OP_END:
3435 return;
3436
3437 case OP_TYPESTAR:
3438 case OP_TYPEMINSTAR:
3439 case OP_TYPEPLUS:
3440 case OP_TYPEMINPLUS:
3441 case OP_TYPEQUERY:
3442 case OP_TYPEMINQUERY:
3443 case OP_TYPEPOSSTAR:
3444 case OP_TYPEPOSPLUS:
3445 case OP_TYPEPOSQUERY:
3446 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
3447 break;
3448
3449 case OP_TYPEUPTO:
3450 case OP_TYPEMINUPTO:
3451 case OP_TYPEEXACT:
3452 case OP_TYPEPOSUPTO:
3453 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
3454 code += 2;
3455 break;
3456
3457 case OP_XCLASS:
3458 code += GET(code, 1);
3459 break;
3460
3461 case OP_MARK:
3462 case OP_PRUNE_ARG:
3463 case OP_SKIP_ARG:
3464 case OP_THEN_ARG:
3465 code += code[1];
3466 break;
3467 }
3468
3469 /* Add in the fixed length from the table */
3470
3471 code += PRIV(OP_lengths)[c];
3472
3473 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
3474 a multi-byte character. The length in the table is a minimum, so we have to
3475 arrange to skip the extra bytes. */
3476
3477 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
3478 if (utf) switch(c)
3479 {
3480 case OP_CHAR:
3481 case OP_CHARI:
3482 case OP_NOT:
3483 case OP_NOTI:
3484 case OP_STAR:
3485 case OP_MINSTAR:
3486 case OP_PLUS:
3487 case OP_MINPLUS:
3488 case OP_QUERY:
3489 case OP_MINQUERY:
3490 case OP_UPTO:
3491 case OP_MINUPTO:
3492 case OP_EXACT:
3493 case OP_POSSTAR:
3494 case OP_POSPLUS:
3495 case OP_POSQUERY:
3496 case OP_POSUPTO:
3497 case OP_STARI:
3498 case OP_MINSTARI:
3499 case OP_PLUSI:
3500 case OP_MINPLUSI:
3501 case OP_QUERYI:
3502 case OP_MINQUERYI:
3503 case OP_UPTOI:
3504 case OP_MINUPTOI:
3505 case OP_EXACTI:
3506 case OP_POSSTARI:
3507 case OP_POSPLUSI:
3508 case OP_POSQUERYI:
3509 case OP_POSUPTOI:
3510 case OP_NOTSTAR:
3511 case OP_NOTMINSTAR:
3512 case OP_NOTPLUS:
3513 case OP_NOTMINPLUS:
3514 case OP_NOTQUERY:
3515 case OP_NOTMINQUERY:
3516 case OP_NOTUPTO:
3517 case OP_NOTMINUPTO:
3518 case OP_NOTEXACT:
3519 case OP_NOTPOSSTAR:
3520 case OP_NOTPOSPLUS:
3521 case OP_NOTPOSQUERY:
3522 case OP_NOTPOSUPTO:
3523 case OP_NOTSTARI:
3524 case OP_NOTMINSTARI:
3525 case OP_NOTPLUSI:
3526 case OP_NOTMINPLUSI:
3527 case OP_NOTQUERYI:
3528 case OP_NOTMINQUERYI:
3529 case OP_NOTUPTOI:
3530 case OP_NOTMINUPTOI:
3531 case OP_NOTEXACTI:
3532 case OP_NOTPOSSTARI:
3533 case OP_NOTPOSPLUSI:
3534 case OP_NOTPOSQUERYI:
3535 case OP_NOTPOSUPTOI:
3536 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
3537 break;
3538 }
3539 #else
3540 (void)(utf); /* Keep compiler happy by referencing function argument */
3541 #endif
3542 }
3543 }
3544
3545
3546
3547 /*************************************************
3548 * Check for POSIX class syntax *
3549 *************************************************/
3550
3551 /* This function is called when the sequence "[:" or "[." or "[=" is
3552 encountered in a character class. It checks whether this is followed by a
3553 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
3554 reach an unescaped ']' without the special preceding character, return FALSE.
3555
3556 Originally, this function only recognized a sequence of letters between the
3557 terminators, but it seems that Perl recognizes any sequence of characters,
3558 though of course unknown POSIX names are subsequently rejected. Perl gives an
3559 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
3560 didn't consider this to be a POSIX class. Likewise for [:1234:].
3561
3562 The problem in trying to be exactly like Perl is in the handling of escapes. We
3563 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
3564 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
3565 below handles the special case of \], but does not try to do any other escape
3566 processing. This makes it different from Perl for cases such as [:l\ower:]
3567 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
3568 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
3569 I think.
3570
3571 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
3572 It seems that the appearance of a nested POSIX class supersedes an apparent
3573 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
3574 a digit.
3575
3576 In Perl, unescaped square brackets may also appear as part of class names. For
3577 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
3578 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
3579 seem right at all. PCRE does not allow closing square brackets in POSIX class
3580 names.
3581
3582 Arguments:
3583 ptr pointer to the initial [
3584 endptr where to return the end pointer
3585
3586 Returns: TRUE or FALSE
3587 */
3588
3589 static BOOL
3590 check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
3591 {
3592 pcre_uchar terminator; /* Don't combine these lines; the Solaris cc */
3593 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
3594 for (++ptr; *ptr != CHAR_NULL; ptr++)
3595 {
3596 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3597 ptr++;
3598 else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
3599 else
3600 {
3601 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3602 {
3603 *endptr = ptr;
3604 return TRUE;
3605 }
3606 if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
3607 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3608 ptr[1] == CHAR_EQUALS_SIGN) &&
3609 check_posix_syntax(ptr, endptr))
3610 return FALSE;
3611 }
3612 }
3613 return FALSE;
3614 }
3615
3616
3617
3618
3619 /*************************************************
3620 * Check POSIX class name *
3621 *************************************************/
3622
3623 /* This function is called to check the name given in a POSIX-style class entry
3624 such as [:alnum:].
3625
3626 Arguments:
3627 ptr points to the first letter
3628 len the length of the name
3629
3630 Returns: a value representing the name, or -1 if unknown
3631 */
3632
3633 static int
3634 check_posix_name(const pcre_uchar *ptr, int len)
3635 {
3636 const char *pn = posix_names;
3637 register int yield = 0;
3638 while (posix_name_lengths[yield] != 0)
3639 {
3640 if (len == posix_name_lengths[yield] &&
3641 STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
3642 pn += posix_name_lengths[yield] + 1;
3643 yield++;
3644 }
3645 return -1;
3646 }
3647
3648
3649 /*************************************************
3650 * Adjust OP_RECURSE items in repeated group *
3651 *************************************************/
3652
3653 /* OP_RECURSE items contain an offset from the start of the regex to the group
3654 that is referenced. This means that groups can be replicated for fixed
3655 repetition simply by copying (because the recursion is allowed to refer to
3656 earlier groups that are outside the current group). However, when a group is
3657 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
3658 inserted before it, after it has been compiled. This means that any OP_RECURSE
3659 items within it that refer to the group itself or any contained groups have to
3660 have their offsets adjusted. That one of the jobs of this function. Before it
3661 is called, the partially compiled regex must be temporarily terminated with
3662 OP_END.
3663
3664 This function has been extended with the possibility of forward references for
3665 recursions and subroutine calls. It must also check the list of such references
3666 for the group we are dealing with. If it finds that one of the recursions in
3667 the current group is on this list, it adjusts the offset in the list, not the
3668 value in the reference (which is a group number).
3669
3670 Arguments:
3671 group points to the start of the group
3672 adjust the amount by which the group is to be moved
3673 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3674 cd contains pointers to tables etc.
3675 save_hwm the hwm forward reference pointer at the start of the group
3676
3677 Returns: nothing
3678 */
3679
3680 static void
3681 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
3682 pcre_uchar *save_hwm)
3683 {
3684 pcre_uchar *ptr = group;
3685
3686 while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
3687 {
3688 int offset;
3689 pcre_uchar *hc;
3690
3691 /* See if this recursion is on the forward reference list. If so, adjust the
3692 reference. */
3693
3694 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
3695 {
3696 offset = (int)GET(hc, 0);
3697 if (cd->start_code + offset == ptr + 1)
3698 {
3699 PUT(hc, 0, offset + adjust);
3700 break;
3701 }
3702 }
3703
3704 /* Otherwise, adjust the recursion offset if it's after the start of this
3705 group. */
3706
3707 if (hc >= cd->hwm)
3708 {
3709 offset = (int)GET(ptr, 1);
3710 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
3711 }
3712
3713 ptr += 1 + LINK_SIZE;
3714 }
3715 }
3716
3717
3718
3719 /*************************************************
3720 * Insert an automatic callout point *
3721 *************************************************/
3722
3723 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
3724 callout points before each pattern item.
3725
3726 Arguments:
3727 code current code pointer
3728 ptr current pattern pointer
3729 cd pointers to tables etc
3730
3731 Returns: new code pointer
3732 */
3733
3734 static pcre_uchar *
3735 auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
3736 {
3737 *code++ = OP_CALLOUT;
3738 *code++ = 255;
3739 PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
3740 PUT(code, LINK_SIZE, 0); /* Default length */
3741 return code + 2 * LINK_SIZE;
3742 }
3743
3744
3745
3746 /*************************************************
3747 * Complete a callout item *
3748 *************************************************/
3749
3750 /* A callout item contains the length of the next item in the pattern, which
3751 we can't fill in till after we have reached the relevant point. This is used
3752 for both automatic and manual callouts.
3753
3754 Arguments:
3755 previous_callout points to previous callout item
3756 ptr current pattern pointer
3757 cd pointers to tables etc
3758
3759 Returns: nothing
3760 */
3761
3762 static void
3763 complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
3764 {
3765 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
3766 PUT(previous_callout, 2 + LINK_SIZE, length);
3767 }
3768
3769
3770
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It searches up the characters, looking for ranges of
characters in the "other" case. Each call returns the next one, updating the
start address. A character with multiple other cases is returned on its own
with a special return value.

A character with multiple other cases also ends any othercase range that is
being built, wherever it appears in the input range. It is then picked up on
its own by the following call. Without this, such a character in the middle of
a range would be absorbed into the range and its additional cases would be
lost.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        -1 when no more
               0 when a range is returned
              >0 the CASESET offset for char with multiple other cases
                in this case, ocptr contains the original
*/

static int
get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
  pcre_uint32 *odptr)
{
pcre_uint32 c, othercase, next;
unsigned int co;

/* Find the first character that has an other case. If it has multiple other
cases, return its case offset value. */

for (c = *cptr; c <= d; c++)
  {
  if ((co = UCD_CASESET(c)) != 0)
    {
    *ocptr = c++;   /* Character that has the set */
    *cptr = c;      /* Rest of input range */
    return (int)co;
    }
  if ((othercase = UCD_OTHERCASE(c)) != c) break;
  }

if (c > d) return -1;  /* Reached end of range */

/* Found a character that has a single other case. Search for the end of the
range, which is either the end of the input range, or a character whose other
case does not continue the sequence, or a character that has more than one
other case. */

*ocptr = othercase;
next = othercase + 1;

for (++c; c <= d; c++)
  {
  if (UCD_CASESET(c) != 0 || UCD_OTHERCASE(c) != next) break;
  next++;
  }

*odptr = next - 1;     /* End of othercase range */
*cptr = c;             /* Rest of input range */
return 0;
}
#endif  /* SUPPORT_UCP */
3831
3832
3833
3834 /*************************************************
3835 * Add a character or range to a class *
3836 *************************************************/
3837
3838 /* This function packages up the logic of adding a character or range of
3839 characters to a class. The character values in the arguments will be within the
3840 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
3841 mutually recursive with the function immediately below.
3842
3843 Arguments:
3844 classbits the bit map for characters < 256
3845 uchardptr points to the pointer for extra data
3846 options the options word
3847 cd contains pointers to tables etc.
3848 start start of range character
3849 end end of range character
3850
3851 Returns: the number of < 256 characters added
3852 the pointer to extra data is updated
3853 */
3854
3855 static int
3856 add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
3857 compile_data *cd, pcre_uint32 start, pcre_uint32 end)
3858 {
3859 pcre_uint32 c;
3860 int n8 = 0;
3861
3862 /* If caseless matching is required, scan the range and process alternate
3863 cases. In Unicode, there are 8-bit characters that have alternate cases that
3864 are greater than 255 and vice-versa. Sometimes we can just extend the original
3865 range. */
3866
3867 if ((options & PCRE_CASELESS) != 0)
3868 {
3869 #ifdef SUPPORT_UCP
3870 if ((options & PCRE_UTF8) != 0)
3871 {
3872 int rc;
3873 pcre_uint32 oc, od;
3874
3875 options &= ~PCRE_CASELESS; /* Remove for recursive calls */
3876 c = start;
3877
3878 while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
3879 {
3880 /* Handle a single character that has more than one other case. */
3881
3882 if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
3883 PRIV(ucd_caseless_sets) + rc, oc);
3884
3885 /* Do nothing if the other case range is within the original range. */
3886
3887 else if (oc >= start && od <= end) continue;
3888
3889 /* Extend the original range if there is overlap, noting that if oc < c, we
3890 can't have od > end because a subrange is always shorter than the basic
3891 range. Otherwise, use a recursive call to add the additional range. */
3892
3893 else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
3894 else if (od > end && oc <= end + 1) end = od; /* Extend upwards */
3895 else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
3896 }
3897 }
3898 else
3899 #endif /* SUPPORT_UCP */
3900
3901 /* Not UTF-mode, or no UCP */
3902
3903 for (c = start; c <= end && c < 256; c++)
3904 {
3905 SETBIT(classbits, cd->fcc[c]);
3906 n8++;
3907 }
3908 }
3909
3910 /* Now handle the original range. Adjust the final value according to the bit
3911 length - this means that the same lists of (e.g.) horizontal spaces can be used
3912 in all cases. */
3913
3914 #if defined COMPILE_PCRE8
3915 #ifdef SUPPORT_UTF
3916 if ((options & PCRE_UTF8) == 0)
3917 #endif
3918 if (end > 0xff) end = 0xff;
3919
3920 #elif defined COMPILE_PCRE16
3921 #ifdef SUPPORT_UTF
3922 if ((options & PCRE_UTF16) == 0)
3923 #endif
3924 if (end > 0xffff) end = 0xffff;
3925
3926 #endif /* COMPILE_PCRE[8|16] */
3927
3928 /* If all characters are less than 256, use the bit map. Otherwise use extra
3929 data. */
3930
3931 if (end < 0x100)
3932 {
3933 for (c = start; c <= end; c++)
3934 {
3935 n8++;
3936 SETBIT(classbits, c);
3937 }
3938 }
3939
3940 else
3941 {
3942 pcre_uchar *uchardata = *uchardptr;
3943
3944 #ifdef SUPPORT_UTF
3945 if ((options & PCRE_UTF8) != 0) /* All UTFs use the same flag bit */
3946 {
3947 if (start < end)
3948 {
3949 *uchardata++ = XCL_RANGE;
3950 uchardata += PRIV(ord2utf)(start, uchardata);
3951 uchardata += PRIV(ord2utf)(end, uchardata);
3952 }
3953 else if (start == end)
3954 {
3955 *uchardata++ = XCL_SINGLE;
3956 uchardata += PRIV(ord2utf)(start, uchardata);
3957 }
3958 }
3959 else
3960 #endif /* SUPPORT_UTF */
3961
3962 /* Without UTF support, character values are constrained by the bit length,
3963 and can only be > 256 for 16-bit and 32-bit libraries. */
3964
3965 #ifdef COMPILE_PCRE8
3966 {}
3967 #else
3968 if (start < end)
3969 {
3970 *uchardata++ = XCL_RANGE;
3971 *uchardata++ = start;
3972 *uchardata++ = end;
3973 }
3974 else if (start == end)
3975 {
3976 *uchardata++ = XCL_SINGLE;
3977 *uchardata++ = start;
3978 }
3979 #endif
3980
3981 *uchardptr = uchardata; /* Updata extra data pointer */
3982 }
3983
3984 return n8; /* Number of 8-bit characters */
3985 }
3986
3987
3988
3989
3990 /*************************************************
3991 * Add a list of characters to a class *
3992 *************************************************/
3993
3994 /* This function is used for adding a list of case-equivalent characters to a
3995 class, and also for adding a list of horizontal or vertical whitespace. If the
3996 list is in order (which it should be), ranges of characters are detected and
3997 handled appropriately. This function is mutually recursive with the function
3998 above.
3999
4000 Arguments:
4001 classbits the bit map for characters < 256
4002 uchardptr points to the pointer for extra data
4003 options the options word
4004 cd contains pointers to tables etc.
4005 p points to row of 32-bit values, terminated by NOTACHAR
4006 except character to omit; this is used when adding lists of
4007 case-equivalent characters to avoid including the one we
4008 already know about
4009
4010 Returns: the number of < 256 characters added
4011 the pointer to extra data is updated
4012 */
4013
4014 static int
4015 add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4016 compile_data *cd, const pcre_uint32 *p, unsigned int except)
4017 {
4018 int n8 = 0;
4019 while (p[0] < NOTACHAR)
4020 {
4021 int n = 0;
4022 if (p[0] != except)
4023 {
4024 while(p[n+1] == p[0] + n + 1) n++;
4025 n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
4026 }
4027 p += n + 1;
4028 }
4029 return n8;
4030 }
4031
4032
4033
4034 /*************************************************
4035 * Add characters not in a list to a class *
4036 *************************************************/
4037
4038 /* This function is used for adding the complement of a list of horizontal or
4039 vertical whitespace to a class. The list must be in order.
4040
4041 Arguments:
4042 classbits the bit map for characters < 256
4043 uchardptr points to the pointer for extra data
4044 options the options word
4045 cd contains pointers to tables etc.
4046 p points to row of 32-bit values, terminated by NOTACHAR
4047
4048 Returns: the number of < 256 characters added
4049 the pointer to extra data is updated
4050 */
4051
4052 static int
4053 add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
4054 int options, compile_data *cd, const pcre_uint32 *p)
4055 {
4056 BOOL utf = (options & PCRE_UTF8) != 0;
4057 int n8 = 0;
4058 if (p[0] > 0)
4059 n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
4060 while (p[0] < NOTACHAR)
4061 {
4062 while (p[1] == p[0] + 1) p++;
4063 n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
4064 (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
4065 p++;
4066 }
4067 return n8;
4068 }
4069
4070
4071
4072 /*************************************************
4073 * Compile one branch *
4074 *************************************************/
4075
4076 /* Scan the pattern, compiling it into a vector. If the options are
4077 changed during the branch, the pointer is used to change the external options
4078 bits. This function is used during the pre-compile phase when we are trying
4079 to find out the amount of memory needed, as well as during the real compile
4080 phase. The value of lengthptr distinguishes the two phases.
4081
4082 Arguments:
4083 optionsptr pointer to the option bits
4084 codeptr points to the pointer to the current code point
4085 ptrptr points to the current pattern pointer
4086 errorcodeptr points to error code variable
4087 firstcharptr place to put the first required character
4088 firstcharflagsptr place to put the first character flags, or a negative number
4089 reqcharptr place to put the last required character
4090 reqcharflagsptr place to put the last required character flags, or a negative number
4091 bcptr points to current branch chain
4092 cond_depth conditional nesting depth
4093 cd contains pointers to tables etc.
4094 lengthptr NULL during the real compile phase
4095 points to length accumulator during pre-compile phase
4096
4097 Returns: TRUE on success
4098 FALSE, with *errorcodeptr set non-zero on error
4099 */
4100
4101 static BOOL
4102 compile_branch(int *optionsptr, pcre_uchar **codeptr,
4103 const pcre_uchar **ptrptr, int *errorcodeptr,
4104 pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
4105 pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
4106 branch_chain *bcptr, int cond_depth,
4107 compile_data *cd, int *lengthptr)
4108 {
4109 int repeat_type, op_type;
4110 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
4111 int bravalue = 0;
4112 int greedy_default, greedy_non_default;
4113 pcre_uint32 firstchar, reqchar;
4114 pcre_int32 firstcharflags, reqcharflags;
4115 pcre_uint32 zeroreqchar, zerofirstchar;
4116 pcre_int32 zeroreqcharflags, zerofirstcharflags;
4117 pcre_int32 req_caseopt, reqvary, tempreqvary;
4118 int options = *optionsptr; /* May change dynamically */
4119 int after_manual_callout = 0;
4120 int length_prevgroup = 0;
4121 register pcre_uint32 c;
4122 int escape;
4123 register pcre_uchar *code = *codeptr;
4124 pcre_uchar *last_code = code;
4125 pcre_uchar *orig_code = code;
4126 pcre_uchar *tempcode;
4127 BOOL inescq = FALSE;
4128 BOOL groupsetfirstchar = FALSE;
4129 const pcre_uchar *ptr = *ptrptr;
4130 const pcre_uchar *tempptr;
4131 const pcre_uchar *nestptr = NULL;
4132 pcre_uchar *previous = NULL;
4133 pcre_uchar *previous_callout = NULL;
4134 pcre_uchar *save_hwm = NULL;
4135 pcre_uint8 classbits[32];
4136
4137 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
4138 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
4139 dynamically as we process the pattern. */
4140
4141 #ifdef SUPPORT_UTF
4142 /* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
4143 BOOL utf = (options & PCRE_UTF8) != 0;
4144 #ifndef COMPILE_PCRE32
4145 pcre_uchar utf_chars[6];
4146 #endif
4147 #else
4148 BOOL utf = FALSE;
4149 #endif
4150
4151 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
4152 class_uchardata always so that it can be passed to add_to_class() always,
4153 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
4154 alternative calls for the different cases. */
4155
4156 pcre_uchar *class_uchardata;
4157 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4158 BOOL xclass;
4159 pcre_uchar *class_uchardata_base;
4160 #endif
4161
4162 #ifdef PCRE_DEBUG
4163 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
4164 #endif
4165
4166 /* Set up the default and non-default settings for greediness */
4167
4168 greedy_default = ((options & PCRE_UNGREEDY) != 0);
4169 greedy_non_default = greedy_default ^ 1;
4170
4171 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
4172 matching encountered yet". It gets changed to REQ_NONE if we hit something that
4173 matches a non-fixed char first char; reqchar just remains unset if we never
4174 find one.
4175
4176 When we hit a repeat whose minimum is zero, we may have to adjust these values
4177 to take the zero repeat into account. This is implemented by setting them to
4178 zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
4179 item types that can be repeated set these backoff variables appropriately. */
4180
4181 firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
4182 firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
4183
4184 /* The variable req_caseopt contains either the REQ_CASELESS value
4185 or zero, according to the current setting of the caseless flag. The
4186 REQ_CASELESS leaves the lower 28 bit empty. It is added into the
4187 firstchar or reqchar variables to record the case status of the
4188 value. This is used only for ASCII characters. */
4189
4190 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
4191
4192 /* Switch on next character until the end of the branch */
4193
4194 for (;; ptr++)
4195 {
4196 BOOL negate_class;
4197 BOOL should_flip_negation;
4198 BOOL possessive_quantifier;
4199 BOOL is_quantifier;
4200 BOOL is_recurse;
4201 BOOL reset_bracount;
4202 int class_has_8bitchar;
4203 int class_one_char;
4204 int newoptions;
4205 int recno;
4206 int refsign;
4207 int skipbytes;
4208 pcre_uint32 subreqchar, subfirstchar;
4209 pcre_int32 subreqcharflags, subfirstcharflags;
4210 int terminator;
4211 unsigned int mclength;
4212 unsigned int tempbracount;
4213 pcre_uint32 ec;
4214 pcre_uchar mcbuffer[8];
4215
4216 /* Get next character in the pattern */
4217
4218 c = *ptr;
4219
4220 /* If we are at the end of a nested substitution, revert to the outer level
4221 string. Nesting only happens one level deep. */
4222
4223 if (c == CHAR_NULL && nestptr != NULL)
4224 {
4225 ptr = nestptr;
4226 nestptr = NULL;
4227 c = *ptr;
4228 }
4229
4230 /* If we are in the pre-compile phase, accumulate the length used for the
4231 previous cycle of this loop. */
4232
4233 if (lengthptr != NULL)
4234 {
4235 #ifdef PCRE_DEBUG
4236 if (code > cd->hwm) cd->hwm = code; /* High water info */
4237 #endif
4238 if (code > cd->start_workspace + cd->workspace_size -
4239 WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
4240 {
4241 *errorcodeptr = ERR52;
4242 goto FAILED;
4243 }
4244
4245 /* There is at least one situation where code goes backwards: this is the
4246 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
4247 the class is simply eliminated. However, it is created first, so we have to
4248 allow memory for it. Therefore, don't ever reduce the length at this point.
4249 */
4250
4251 if (code < last_code) code = last_code;
4252
4253 /* Paranoid check for integer overflow */
4254
4255 if (OFLOW_MAX - *lengthptr < code - last_code)
4256 {
4257 *errorcodeptr = ERR20;
4258 goto FAILED;
4259 }
4260
4261 *lengthptr += (int)(code - last_code);
4262 DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
4263 (int)(code - last_code), c, c));
4264
4265 /* If "previous" is set and it is not at the start of the work space, move
4266 it back to there, in order to avoid filling up the work space. Otherwise,
4267 if "previous" is NULL, reset the current code pointer to the start. */
4268
4269 if (previous != NULL)
4270 {
4271 if (previous > orig_code)
4272 {
4273 memmove(orig_code, previous, IN_UCHARS(code - previous));
4274 code -= previous - orig_code;
4275 previous = orig_code;
4276 }
4277 }
4278 else code = orig_code;
4279
4280 /* Remember where this code item starts so we can pick up the length
4281 next time round. */
4282
4283 last_code = code;
4284 }
4285
4286 /* In the real compile phase, just check the workspace used by the forward
4287 reference list. */
4288
4289 else if (cd->hwm > cd->start_workspace + cd->workspace_size -
4290 WORK_SIZE_SAFETY_MARGIN)
4291 {
4292 *errorcodeptr = ERR52;
4293 goto FAILED;
4294 }
4295
4296 /* If in \Q...\E, check for the end; if not, we have a literal */
4297
4298 if (inescq && c != CHAR_NULL)
4299 {
4300 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4301 {
4302 inescq = FALSE;
4303 ptr++;
4304 continue;
4305 }
4306 else
4307 {
4308 if (previous_callout != NULL)
4309 {
4310 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
4311 complete_callout(previous_callout, ptr, cd);
4312 previous_callout = NULL;
4313 }
4314 if ((options & PCRE_AUTO_CALLOUT) != 0)
4315 {
4316 previous_callout = code;
4317 code = auto_callout(code, ptr, cd);
4318 }
4319 goto NORMAL_CHAR;
4320 }
4321 }
4322
4323 is_quantifier =
4324 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4325 (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4326
4327 /* Fill in length of a previous callout, except when the next thing is a
4328 quantifier or when processing a property substitution string in UCP mode. */
4329
4330 if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4331 after_manual_callout-- <= 0)
4332 {
4333 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
4334 complete_callout(previous_callout, ptr, cd);
4335 previous_callout = NULL;
4336 }
4337
4338 /* In extended mode, skip white space and comments. */
4339
4340 if ((options & PCRE_EXTENDED) != 0)
4341 {
4342 if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;
4343 if (c == CHAR_NUMBER_SIGN)
4344 {
4345 ptr++;
4346 while (*ptr != CHAR_NULL)
4347 {
4348 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
4349 ptr++;
4350 #ifdef SUPPORT_UTF
4351 if (utf) FORWARDCHAR(ptr);
4352 #endif
4353 }
4354 if (*ptr != CHAR_NULL) continue;
4355
4356 /* Else fall through to handle end of string */
4357 c = 0;
4358 }
4359 }
4360
4361 /* No auto callout for quantifiers, or while processing property strings that
4362 are substituted for \w etc in UCP mode. */
4363
4364 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
4365 {
4366 previous_callout = code;
4367 code = auto_callout(code, ptr, cd);
4368 }
4369
4370 switch(c)
4371 {
4372 /* ===================================================================*/
4373 case 0: /* The branch terminates at string end */
4374 case CHAR_VERTICAL_LINE: /* or | or ) */
4375 case CHAR_RIGHT_PARENTHESIS:
4376 *firstcharptr = firstchar;
4377 *firstcharflagsptr = firstcharflags;
4378 *reqcharptr = reqchar;
4379 *reqcharflagsptr = reqcharflags;
4380 *codeptr = code;
4381 *ptrptr = ptr;
4382 if (lengthptr != NULL)
4383 {
4384 if (OFLOW_MAX - *lengthptr < code - last_code)
4385 {
4386 *errorcodeptr = ERR20;
4387 goto FAILED;
4388 }
4389 *lengthptr += (int)(code - last_code); /* To include callout length */
4390 DPRINTF((">> end branch\n"));
4391 }
4392 return TRUE;
4393
4394
4395 /* ===================================================================*/
4396 /* Handle single-character metacharacters. In multiline mode, ^ disables
4397 the setting of any following char as a first character. */
4398
4399 case CHAR_CIRCUMFLEX_ACCENT:
4400 previous = NULL;
4401 if ((options & PCRE_MULTILINE) != 0)
4402 {
4403 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4404 *code++ = OP_CIRCM;
4405 }
4406 else *code++ = OP_CIRC;
4407 break;
4408
4409 case CHAR_DOLLAR_SIGN:
4410 previous = NULL;
4411 *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
4412 break;
4413
4414 /* There can never be a first char if '.' is first, whatever happens about
4415 repeats. The value of reqchar doesn't change either. */
4416
4417 case CHAR_DOT:
4418 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4419 zerofirstchar = firstchar;
4420 zerofirstcharflags = firstcharflags;
4421 zeroreqchar = reqchar;
4422 zeroreqcharflags = reqcharflags;
4423 previous = code;
4424 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
4425 break;
4426
4427
4428 /* ===================================================================*/
4429 /* Character classes. If the included characters are all < 256, we build a
4430 32-byte bitmap of the permitted characters, except in the special case
4431 where there is only one such character. For negated classes, we build the
4432 map as usual, then invert it at the end. However, we use a different opcode
4433 so that data characters > 255 can be handled correctly.
4434
4435 If the class contains characters outside the 0-255 range, a different
4436 opcode is compiled. It may optionally have a bit map for characters < 256,
4437 but those above are explicitly listed afterwards. A flag byte tells
4438 whether the bitmap is present, and whether this is a negated class or not.
4439
4440 In JavaScript compatibility mode, an isolated ']' causes an error. In
4441 default (Perl) mode, it is treated as a data character. */
4442
4443 case CHAR_RIGHT_SQUARE_BRACKET:
4444 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4445 {
4446 *errorcodeptr = ERR64;
4447 goto FAILED;
4448 }
4449 goto NORMAL_CHAR;
4450
4451 case CHAR_LEFT_SQUARE_BRACKET:
4452 previous = code;
4453
4454 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
4455 they are encountered at the top level, so we'll do that too. */
4456
4457 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4458 ptr[1] == CHAR_EQUALS_SIGN) &&
4459 check_posix_syntax(ptr, &tempptr))
4460 {
4461 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
4462 goto FAILED;
4463 }
4464
4465 /* If the first character is '^', set the negation flag and skip it. Also,
4466 if the first few characters (either before or after ^) are \Q\E or \E we
4467 skip them too. This makes for compatibility with Perl. */
4468
4469 negate_class = FALSE;
4470 for (;;)
4471 {
4472 c = *(++ptr);
4473 if (c == CHAR_BACKSLASH)
4474 {
4475 if (ptr[1] == CHAR_E)
4476 ptr++;
4477 else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4478 ptr += 3;
4479 else
4480 break;
4481 }
4482 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4483 negate_class = TRUE;
4484 else break;
4485 }
4486
4487 /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
4488 an initial ']' is taken as a data character -- the code below handles
4489 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
4490 [^] must match any character, so generate OP_ALLANY. */
4491
4492 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4493 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4494 {
4495 *code++ = negate_class? OP_ALLANY : OP_FAIL;
4496 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4497 zerofirstchar = firstchar;
4498 zerofirstcharflags = firstcharflags;
4499 break;
4500 }
4501
4502 /* If a class contains a negative special such as \S, we need to flip the
4503 negation flag at the end, so that support for characters > 255 works
4504 correctly (they are all included in the class). */
4505
4506 should_flip_negation = FALSE;
4507
4508 /* For optimization purposes, we track some properties of the class:
4509 class_has_8bitchar will be non-zero if the class contains at least one <
4510 256 character; class_one_char will be 1 if the class contains just one
4511 character. */
4512
4513 class_has_8bitchar = 0;
4514 class_one_char = 0;
4515
4516 /* Initialize the 32-char bit map to all zeros. We build the map in a
4517 temporary bit of memory, in case the class contains fewer than two
4518 8-bit characters because in that case the compiled code doesn't use the bit
4519 map. */
4520
4521 memset(classbits, 0, 32 * sizeof(pcre_uint8));
4522
4523 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4524 xclass = FALSE;
4525 class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
4526 class_uchardata_base = class_uchardata; /* Save the start */
4527 #endif
4528
4529 /* Process characters until ] is reached. By writing this as a "do" it
4530 means that an initial ] is taken as a data character. At the start of the
4531 loop, c contains the first byte of the character. */
4532
4533 if (c != CHAR_NULL) do
4534 {
4535 const pcre_uchar *oldptr;
4536
4537 #ifdef SUPPORT_UTF
4538 if (utf && HAS_EXTRALEN(c))
4539 { /* Braces are required because the */
4540 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
4541 }
4542 #endif
4543
4544 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4545 /* In the pre-compile phase, accumulate the length of any extra
4546 data and reset the pointer. This is so that very large classes that
4547 contain a zillion > 255 characters no longer overwrite the work space
4548 (which is on the stack). We have to remember that there was XCLASS data,
4549 however. */
4550
4551 if (lengthptr != NULL && class_uchardata > class_uchardata_base)
4552 {
4553 xclass = TRUE;
4554 *lengthptr += class_uchardata - class_uchardata_base;
4555 class_uchardata = class_uchardata_base;
4556 }
4557 #endif
4558
4559 /* Inside \Q...\E everything is literal except \E */
4560
4561 if (inescq)
4562 {
4563 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
4564 {
4565 inescq = FALSE; /* Reset literal state */
4566 ptr++; /* Skip the 'E' */
4567 continue; /* Carry on with next */
4568 }
4569 goto CHECK_RANGE; /* Could be range if \E follows */
4570 }
4571
4572 /* Handle POSIX class names. Perl allows a negation extension of the
4573 form [:^name:]. A square bracket that doesn't match the syntax is
4574 treated as a literal. We also recognize the POSIX constructions
4575 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
4576 5.6 and 5.8 do. */
4577
4578 if (c == CHAR_LEFT_SQUARE_BRACKET &&
4579 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4580 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
4581 {
4582 BOOL local_negate = FALSE;
4583 int posix_class, taboffset, tabopt;
4584 register const pcre_uint8 *cbits = cd->cbits;
4585 pcre_uint8 pbits[32];
4586
4587 if (ptr[1] != CHAR_COLON)
4588 {
4589 *errorcodeptr = ERR31;
4590 goto FAILED;
4591 }
4592
4593 ptr += 2;
4594 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
4595 {
4596 local_negate = TRUE;
4597 should_flip_negation = TRUE; /* Note negative special */
4598 ptr++;
4599 }
4600
4601 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
4602 if (posix_class < 0)
4603 {
4604 *errorcodeptr = ERR30;
4605 goto FAILED;
4606 }
4607
4608 /* If matching is caseless, upper and lower are converted to
4609 alpha. This relies on the fact that the class table starts with
4610 alpha, lower, upper as the first 3 entries. */
4611
4612 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
4613 posix_class = 0;
4614
4615 /* When PCRE_UCP is set, some of the POSIX classes are converted to
4616 different escape sequences that use Unicode properties. */
4617
4618 #ifdef SUPPORT_UCP
4619 if ((options & PCRE_UCP) != 0)
4620 {
4621 int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
4622 if (posix_substitutes[pc] != NULL)
4623 {
4624 nestptr = tempptr + 1;
4625 ptr = posix_substitutes[pc] - 1;
4626 continue;
4627 }
4628 }
4629 #endif
4630 /* In the non-UCP case, we build the bit map for the POSIX class in a
4631 chunk of local store because we may be adding and subtracting from it,
4632 and we don't want to subtract bits that may be in the main map already.
4633 At the end we or the result into the bit map that is being built. */
4634
4635 posix_class *= 3;
4636
4637 /* Copy in the first table (always present) */
4638
4639 memcpy(pbits, cbits + posix_class_maps[posix_class],
4640 32 * sizeof(pcre_uint8));
4641
4642 /* If there is a second table, add or remove it as required. */
4643
4644 taboffset = posix_class_maps[posix_class + 1];
4645 tabopt = posix_class_maps[posix_class + 2];
4646
4647 if (taboffset >= 0)
4648 {
4649 if (tabopt >= 0)
4650 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
4651 else
4652 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
4653 }
4654
4655 /* Now see if we need to remove any special characters. An option
4656 value of 1 removes vertical space and 2 removes underscore. */
4657
4658 if (tabopt < 0) tabopt = -tabopt;
4659 if (tabopt == 1) pbits[1] &= ~0x3c;
4660 else if (tabopt == 2) pbits[11] &= 0x7f;
4661
4662 /* Add the POSIX table or its complement into the main table that is
4663 being built and we are done. */
4664
4665 if (local_negate)
4666 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
4667 else
4668 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
4669
4670 ptr = tempptr + 1;
4671 /* Every class contains at least one < 256 character. */
4672 class_has_8bitchar = 1;
4673 /* Every class contains at least two characters. */
4674 class_one_char = 2;
4675 continue; /* End of POSIX syntax handling */
4676 }
4677
4678 /* Backslash may introduce a single character, or it may introduce one
4679 of the specials, which just set a flag. The sequence \b is a special
4680 case. Inside a class (and only there) it is treated as backspace. We
4681 assume that other escapes have more than one character in them, so
4682 speculatively set both class_has_8bitchar and class_one_char bigger
4683 than one. Unrecognized escapes fall through and are either treated
4684 as literal characters (by default), or are faulted if
4685 PCRE_EXTRA is set. */
4686
4687 if (c == CHAR_BACKSLASH)
4688 {
4689 escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
4690 TRUE);
4691 if (*errorcodeptr != 0) goto FAILED;
4692 if (escape == 0) c = ec;
4693 else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
4694 else if (escape == ESC_N) /* \N is not supported in a class */
4695 {
4696 *errorcodeptr = ERR71;
4697 goto FAILED;
4698 }
4699 else if (escape == ESC_Q) /* Handle start of quoted string */
4700 {
4701 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4702 {
4703 ptr += 2; /* avoid empty string */
4704 }
4705 else inescq = TRUE;
4706 continue;
4707 }
4708 else if (escape == ESC_E) continue; /* Ignore orphan \E */
4709
4710 else
4711 {
4712 register const pcre_uint8 *cbits = cd->cbits;
4713 /* Every class contains at least two < 256 characters. */
4714 class_has_8bitchar++;
4715 /* Every class contains at least two characters. */
4716 class_one_char += 2;
4717
4718 switch (escape)
4719 {
4720 #ifdef SUPPORT_UCP
4721 case ESC_du: /* These are the values given for \d etc */
4722 case ESC_DU: /* when PCRE_UCP is set. We replace the */
4723 case ESC_wu: /* escape sequence with an appropriate \p */
4724 case ESC_WU: /* or \P to test Unicode properties instead */
4725 case ESC_su: /* of the default ASCII testing. */
4726 case ESC_SU:
4727 nestptr = ptr;
4728 ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */
4729 class_has_8bitchar--; /* Undo! */
4730 continue;
4731 #endif
4732 case ESC_d:
4733 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
4734 continue;
4735
4736 case ESC_D:
4737 should_flip_negation = TRUE;
4738 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
4739 continue;
4740
4741 case ESC_w:
4742 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
4743 continue;
4744
4745 case ESC_W:
4746 should_flip_negation = TRUE;
4747 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
4748 continue;
4749
4750 /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
4751 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
4752 previously set by something earlier in the character class.
4753 Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
4754 we could just adjust the appropriate bit. From PCRE 8.34 we no
4755 longer treat \s and \S specially. */
4756
4757 case ESC_s:
4758 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
4759 continue;
4760
4761 case ESC_S:
4762 should_flip_negation = TRUE;
4763 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
4764 continue;
4765
4766 /* The rest apply in both UCP and non-UCP cases. */
4767
4768 case ESC_h:
4769 (void)add_list_to_class(classbits, &class_uchardata, options, cd,
4770 PRIV(hspace_list), NOTACHAR);
4771 continue;
4772
4773 case ESC_H:
4774 (void)add_not_list_to_class(classbits, &class_uchardata, options,
4775 cd, PRIV(hspace_list));
4776 continue;
4777
4778 case ESC_v:
4779 (void)add_list_to_class(classbits, &class_uchardata, options, cd,
4780 PRIV(vspace_list), NOTACHAR);
4781 continue;
4782
4783 case ESC_V:
4784 (void)add_not_list_to_class(classbits, &class_uchardata, options,
4785 cd, PRIV(vspace_list));
4786 continue;
4787
4788 #ifdef SUPPORT_UCP
4789 case ESC_p:
4790 case ESC_P:
4791 {
4792 BOOL negated;
4793 unsigned int ptype = 0, pdata = 0;
4794 if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
4795 goto FAILED;
4796 *class_uchardata++ = ((escape == ESC_p) != negated)?
4797 XCL_PROP : XCL_NOTPROP;
4798 *class_uchardata++ = ptype;
4799 *class_uchardata++ = pdata;
4800 class_has_8bitchar--; /* Undo! */
4801 continue;
4802 }
4803 #endif
4804 /* Unrecognized escapes are faulted if PCRE is running in its
4805 strict mode. By default, for compatibility with Perl, they are
4806 treated as literals. */
4807
4808 default:
4809 if ((options & PCRE_EXTRA) != 0)
4810 {
4811 *errorcodeptr = ERR7;
4812 goto FAILED;
4813 }
4814 class_has_8bitchar--; /* Undo the speculative increase. */
4815 class_one_char -= 2; /* Undo the speculative increase. */
4816 c = *ptr; /* Get the final character and fall through */
4817 break;
4818 }
4819 }
4820
4821 /* Fall through if the escape just defined a single character (c >= 0).
4822 This may be greater than 256. */
4823
4824 escape = 0;
4825
4826 } /* End of backslash handling */
4827
4828 /* A character may be followed by '-' to form a range. However, Perl does
4829 not permit ']' to be the end of the range. A '-' character at the end is
4830 treated as a literal. Perl ignores orphaned \E sequences entirely. The
4831 code for handling \Q and \E is messy. */
4832
4833 CHECK_RANGE:
4834 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4835 {
4836 inescq = FALSE;
4837 ptr += 2;
4838 }
4839 oldptr = ptr;
4840
4841 /* Remember if \r or \n were explicitly used */
4842
4843 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4844
4845 /* Check for range */
4846
4847 if (!inescq && ptr[1] == CHAR_MINUS)
4848 {
4849 pcre_uint32 d;
4850 ptr += 2;
4851 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
4852
4853 /* If we hit \Q (not followed by \E) at this point, go into escaped
4854 mode. */
4855
4856 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
4857 {
4858 ptr += 2;
4859 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4860 { ptr += 2; continue; }
4861 inescq = TRUE;
4862 break;
4863 }
4864
4865 /* Minus (hyphen) at the end of a class is treated as a literal, so put
4866 back the pointer and jump to handle the character that preceded it. */
4867
4868 if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
4869 {
4870 ptr = oldptr;
4871 goto CLASS_SINGLE_CHARACTER;
4872 }
4873
4874 /* Otherwise, we have a potential range; pick up the next character */
4875
4876 #ifdef SUPPORT_UTF
4877 if (utf)
4878 { /* Braces are required because the */
4879 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
4880 }
4881 else
4882 #endif
4883 d = *ptr; /* Not UTF-8 mode */
4884
4885 /* The second part of a range can be a single-character escape, but
4886 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
4887 in such circumstances. */
4888
4889 if (!inescq && d == CHAR_BACKSLASH)
4890 {
4891 int descape;
4892 descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
4893 if (*errorcodeptr != 0) goto FAILED;
4894
4895 /* \b is backspace; any other special means the '-' was literal. */
4896
4897 if (descape != 0)
4898 {
4899 if (descape == ESC_b) d = CHAR_BS; else
4900 {
4901 ptr = oldptr;
4902 goto CLASS_SINGLE_CHARACTER; /* A few lines below */
4903 }
4904 }
4905 }
4906
4907 /* Check that the two values are in the correct order. Optimize
4908 one-character ranges. */
4909
4910 if (d < c)
4911 {
4912 *errorcodeptr = ERR8;
4913 goto FAILED;
4914 }
4915 if (d == c) goto CLASS_SINGLE_CHARACTER; /* A few lines below */
4916
4917 /* We have found a character range, so single character optimizations
4918 cannot be done anymore. Any value greater than 1 indicates that there
4919 is more than one character. */
4920
4921 class_one_char = 2;
4922
4923 /* Remember an explicit \r or \n, and add the range to the class. */
4924
4925 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4926
4927 class_has_8bitchar +=
4928 add_to_class(classbits, &class_uchardata, options, cd, c, d);
4929
4930 continue; /* Go get the next char in the class */
4931 }
4932
4933 /* Handle a single character - we can get here for a normal non-escape
4934 char, or after \ that introduces a single character or for an apparent
4935 range that isn't. Only the value 1 matters for class_one_char, so don't
4936 increase it if it is already 2 or more ... just in case there's a class
4937 with a zillion characters in it. */
4938
4939 CLASS_SINGLE_CHARACTER:
4940 if (class_one_char < 2) class_one_char++;
4941
4942 /* If class_one_char is 1, we have the first single character in the
4943 class, and there have been no prior ranges, or XCLASS items generated by
4944 escapes. If this is the final character in the class, we can optimize by
4945 turning the item into a 1-character OP_CHAR[I] if it's positive, or
4946 OP_NOT[I] if it's negative. In the positive case, it can cause firstchar
4947 to be set. Otherwise, there can be no first char if this item is first,
4948 whatever repeat count may follow. In the case of reqchar, save the
4949 previous value for reinstating. */
4950
4951 if (class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
4952 {
4953 ptr++;
4954 zeroreqchar = reqchar;
4955 zeroreqcharflags = reqcharflags;
4956
4957 if (negate_class)
4958 {
4959 #ifdef SUPPORT_UCP
4960 int d;
4961 #endif
4962 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4963 zerofirstchar = firstchar;
4964 zerofirstcharflags = firstcharflags;
4965
4966 /* For caseless UTF-8 mode when UCP support is available, check
4967 whether this character has more than one other case. If so, generate
4968 a special OP_NOTPROP item instead of OP_NOTI. */
4969
4970 #ifdef SUPPORT_UCP
4971 if (utf && (options & PCRE_CASELESS) != 0 &&
4972 (d = UCD_CASESET(c)) != 0)
4973 {
4974 *code++ = OP_NOTPROP;
4975 *code++ = PT_CLIST;
4976 *code++ = d;
4977 }
4978 else
4979 #endif
4980 /* Char has only one other case, or UCP not available */
4981
4982 {
4983 *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4984 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
4985 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4986 code += PRIV(ord2utf)(c, code);
4987 else
4988 #endif
4989 *code++ = c;
4990 }
4991
4992 /* We are finished with this character class */
4993
4994 goto END_CLASS;
4995 }
4996
4997 /* For a single, positive character, get the value into mcbuffer, and
4998 then we can handle this with the normal one-character code. */
4999
5000 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5001 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5002 mclength = PRIV(ord2utf)(c, mcbuffer);
5003 else
5004 #endif
5005 {
5006 mcbuffer[0] = c;
5007 mclength = 1;
5008 }
5009 goto ONE_CHAR;
5010 } /* End of 1-char optimization */
5011
5012 /* There is more than one character in the class, or an XCLASS item
5013 has been generated. Add this character to the class. */
5014
5015 class_has_8bitchar +=
5016 add_to_class(classbits, &class_uchardata, options, cd, c, c);
5017 }
5018
5019 /* Loop until ']' reached. This "while" is the end of the "do" far above.
5020 If we are at the end of an internal nested string, revert to the outer
5021 string. */
5022
5023 while (((c = *(++ptr)) != CHAR_NULL ||
5024 (nestptr != NULL &&
5025 (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
5026 (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
5027
5028 /* Check for missing terminating ']' */
5029
5030 if (c == CHAR_NULL)
5031 {
5032 *errorcodeptr = ERR6;
5033 goto FAILED;
5034 }
5035
5036 /* We will need an XCLASS if data has been placed in class_uchardata. In
5037 the second phase this is a sufficient test. However, in the pre-compile
5038 phase, class_uchardata gets emptied to prevent workspace overflow, so
5039 only if the very last character in the class needs XCLASS will it contain
5040 anything at this point. For this reason, xclass gets set TRUE above when
5041 class_uchardata is emptied, and that's why this code is the way it is here
5042 instead of just doing a test on class_uchardata below. */
5043
5044 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5045 if (class_uchardata > class_uchardata_base) xclass = TRUE;
5046 #endif
5047
5048 /* If this is the first thing in the branch, there can be no first char
5049 setting, whatever the repeat count. Any reqchar setting must remain
5050 unchanged after any kind of repeat. */
5051
5052 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5053 zerofirstchar = firstchar;
5054 zerofirstcharflags = firstcharflags;
5055 zeroreqchar = reqchar;
5056 zeroreqcharflags = reqcharflags;
5057
5058 /* If there are characters with values > 255, we have to compile an
5059 extended class, with its own opcode, unless there was a negated special
5060 such as \S in the class, and PCRE_UCP is not set, because in that case all
5061 characters > 255 are in the class, so any that were explicitly given as
5062 well can be ignored. If (when there are explicit characters > 255 that must
5063 be listed) there are no characters < 256, we can omit the bitmap in the
5064 actual compiled code. */
5065
5066 #ifdef SUPPORT_UTF
5067 if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
5068 #elif !defined COMPILE_PCRE8
5069 if (xclass && !should_flip_negation)
5070 #endif
5071 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5072 {
5073 *class_uchardata++ = XCL_END; /* Marks the end of extra data */
5074 *code++ = OP_XCLASS;
5075 code += LINK_SIZE;
5076 *code = negate_class? XCL_NOT:0;
5077
5078 /* If the map is required, move up the extra data to make room for it;
5079 otherwise just move the code pointer to the end of the extra data. */
5080
5081 if (class_has_8bitchar > 0)
5082 {
5083 *code++ |= XCL_MAP;
5084 memmove(code + (32 / sizeof(pcre_uchar)), code,
5085 IN_UCHARS(class_uchardata - code));
5086 memcpy(code, classbits, 32);
5087 code = class_uchardata + (32 / sizeof(pcre_uchar));
5088 }
5089 else code = class_uchardata;
5090
5091 /* Now fill in the complete length of the item */
5092
5093 PUT(previous, 1, (int)(code - previous));
5094 break; /* End of class handling */
5095 }
5096 #endif
5097
5098 /* If there are no characters > 255, or they are all to be included or
5099 excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5100 whole class was negated and whether there were negative specials such as \S
5101 (non-UCP) in the class. Then copy the 32-byte map into the code vector,
5102 negating it if necessary. */
5103
5104 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5105 if (lengthptr == NULL) /* Save time in the pre-compile phase */
5106 {
5107 if (negate_class)
5108 for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5109 memcpy(code, classbits, 32);
5110 }
5111 code += 32 / sizeof(pcre_uchar);
5112
5113 END_CLASS:
5114 break;
5115
5116
5117 /* ===================================================================*/
5118 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
5119 has been tested above. */
5120
5121 case CHAR_LEFT_CURLY_BRACKET:
5122 if (!is_quantifier) goto NORMAL_CHAR;
5123 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
5124 if (*errorcodeptr != 0) goto FAILED;
5125 goto REPEAT;
5126
5127 case CHAR_ASTERISK:
5128 repeat_min = 0;
5129 repeat_max = -1;
5130 goto REPEAT;
5131
5132 case CHAR_PLUS:
5133 repeat_min = 1;
5134 repeat_max = -1;
5135 goto REPEAT;
5136
5137 case CHAR_QUESTION_MARK:
5138 repeat_min = 0;
5139 repeat_max = 1;
5140
5141 REPEAT:
5142 if (previous == NULL)
5143 {
5144 *errorcodeptr = ERR9;
5145 goto FAILED;
5146 }
5147
5148 if (repeat_min == 0)
5149 {
5150 firstchar = zerofirstchar; /* Adjust for zero repeat */
5151 firstcharflags = zerofirstcharflags;
5152 reqchar = zeroreqchar; /* Ditto */
5153 reqcharflags = zeroreqcharflags;
5154 }
5155
5156 /* Remember whether this is a variable length repeat */
5157
5158 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
5159
5160 op_type = 0; /* Default single-char op codes */
5161 possessive_quantifier = FALSE; /* Default not possessive quantifier */
5162
5163 /* Save start of previous item, in case we have to move it up in order to
5164 insert something before it. */
5165
5166 tempcode = previous;
5167
5168 /* If the next character is '+', we have a possessive quantifier. This
5169 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
5170 If the next character is '?' this is a minimizing repeat, by default,
5171 but if PCRE_UNGREEDY is set, it works the other way round. We change the
5172 repeat type to the non-default. */
5173
5174 if (ptr[1] == CHAR_PLUS)
5175 {
5176 repeat_type = 0; /* Force greedy */
5177 possessive_quantifier = TRUE;
5178 ptr++;
5179 }
5180 else if (ptr[1] == CHAR_QUESTION_MARK)
5181 {
5182 repeat_type = greedy_non_default;
5183 ptr++;
5184 }
5185 else repeat_type = greedy_default;
5186
5187 /* If previous was a recursion call, wrap it in atomic brackets so that
5188 previous becomes the atomic group. All recursions were so wrapped in the
5189 past, but it no longer happens for non-repeated recursions. In fact, the
5190 repeated ones could be re-implemented independently so as not to need this,
5191 but for the moment we rely on the code for repeating groups. */
5192
5193 if (*previous == OP_RECURSE)
5194 {
5195 memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
5196 *previous = OP_ONCE;
5197 PUT(previous, 1, 2 + 2*LINK_SIZE);
5198 previous[2 + 2*LINK_SIZE] = OP_KET;
5199 PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
5200 code += 2 + 2 * LINK_SIZE;
5201 length_prevgroup = 3 + 3*LINK_SIZE;
5202
5203 /* When actually compiling, we need to check whether this was a forward
5204 reference, and if so, adjust the offset. */
5205
5206 if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
5207 {
5208 int offset = GET(cd->hwm, -LINK_SIZE);
5209 if (offset == previous + 1 - cd->start_code)
5210 PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
5211 }
5212 }
5213
5214 /* Now handle repetition for the different types of item. */
5215
5216 /* If previous was a character or negated character match, abolish the item
5217 and generate a repeat item instead. If a char item has a minimum of more
5218 than one, ensure that it is set in reqchar - it might not be if a sequence
5219 such as x{3} is the first thing in a branch because the x will have gone
5220 into firstchar instead. */
5221
5222 if (*previous == OP_CHAR || *previous == OP_CHARI
5223 || *previous == OP_NOT || *previous == OP_NOTI)
5224 {
5225 switch (*previous)
5226 {
5227 default: /* Make compiler happy. */
5228 case OP_CHAR: op_type = OP_STAR - OP_STAR; break;
5229 case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
5230 case OP_NOT: op_type = OP_NOTSTAR - OP_STAR; break;
5231 case OP_NOTI: op_type = OP_NOTSTARI - OP_STAR; break;
5232 }
5233
5234 /* Deal with UTF characters that take up more than one character. It's
5235 easier to write this out separately than try to macrify it. Use c to
5236 hold the length of the character in bytes, plus UTF_LENGTH to flag that
5237 it's a length rather than a small character. */
5238
5239 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5240 if (utf && NOT_FIRSTCHAR(code[-1]))
5241 {
5242 pcre_uchar *lastchar = code - 1;
5243 BACKCHAR(lastchar);
5244 c = (int)(code - lastchar); /* Length of UTF-8 character */
5245 memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
5246 c |= UTF_LENGTH; /* Flag c as a length */
5247 }
5248 else
5249 #endif /* SUPPORT_UTF */
5250
5251 /* Handle the case of a single character - either with no UTF support, or
5252 with UTF disabled, or for a single character UTF character. */
5253 {
5254 c = code[-1];
5255 if (*previous <= OP_CHARI && repeat_min > 1)
5256 {
5257 reqchar = c;
5258 reqcharflags = req_caseopt | cd->req_varyopt;
5259 }
5260 }
5261
5262 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
5263 }
5264
5265 /* If previous was a character type match (\d or similar), abolish it and
5266 create a suitable repeat item. The code is shared with single-character
5267 repeats by setting op_type to add a suitable offset into repeat_type. Note
5268 that the Unicode property types will be present only when SUPPORT_UCP is
5269 defined, but we don't wrap the little bits of code here because it just
5270 makes it horribly messy. */
5271
5272 else if (*previous < OP_EODN)
5273 {
5274 pcre_uchar *oldcode;
5275 int prop_type, prop_value;
5276 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
5277 c = *previous;
5278
5279 OUTPUT_SINGLE_REPEAT:
5280 if (*previous == OP_PROP || *previous == OP_NOTPROP)
5281 {
5282 prop_type = previous[1];
5283 prop_value = previous[2];
5284 }
5285 else prop_type = prop_value = -1;
5286
5287 oldcode = code;
5288 code = previous; /* Usually overwrite previous item */
5289
5290 /* If the maximum is zero then the minimum must also be zero; Perl allows
5291 this case, so we do too - by simply omitting the item altogether. */
5292
5293 if (repeat_max == 0) goto END_REPEAT;
5294
5295 /* Combine the op_type with the repeat_type */
5296
5297 repeat_type += op_type;
5298
5299 /* A minimum of zero is handled either as the special case * or ?, or as
5300 an UPTO, with the maximum given. */
5301
5302 if (repeat_min == 0)
5303 {
5304 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
5305 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
5306 else
5307 {
5308 *code++ = OP_UPTO + repeat_type;
5309 PUT2INC(code, 0, repeat_max);
5310 }
5311 }
5312
5313 /* A repeat minimum of 1 is optimized into some special cases. If the
5314 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
5315 left in place and, if the maximum is greater than 1, we use OP_UPTO with
5316 one less than the maximum. */
5317
5318 else if (repeat_min == 1)
5319 {
5320 if (repeat_max == -1)
5321 *code++ = OP_PLUS + repeat_type;
5322 else
5323 {
5324 code = oldcode; /* leave previous item in place */
5325 if (repeat_max == 1) goto END_REPEAT;
5326 *code++ = OP_UPTO + repeat_type;
5327 PUT2INC(code, 0, repeat_max - 1);
5328 }
5329 }
5330
5331 /* The case {n,n} is just an EXACT, while the general case {n,m} is
5332 handled as an EXACT followed by an UPTO. */
5333
5334 else
5335 {
5336 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
5337 PUT2INC(code, 0, repeat_min);
5338
5339 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
5340 we have to insert the character for the previous code. For a repeated
5341 Unicode property match, there are two extra bytes that define the
5342 required property. In UTF-8 mode, long characters have their length in
5343 c, with the UTF_LENGTH bit as a flag. */
5344
5345 if (repeat_max < 0)
5346 {
5347 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5348 if (utf && (c & UTF_LENGTH) != 0)
5349 {
5350 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5351 code += c & 7;
5352 }
5353 else
5354 #endif
5355 {
5356 *code++ = c;
5357 if (prop_type >= 0)
5358 {
5359 *code++ = prop_type;
5360 *code++ = prop_value;
5361 }
5362 }
5363 *code++ = OP_STAR + repeat_type;
5364 }
5365
5366 /* Else insert an UPTO if the max is greater than the min, again
5367 preceded by the character, for the previously inserted code. If the
5368 UPTO is just for 1 instance, we can use QUERY instead. */
5369
5370 else if (repeat_max != repeat_min)
5371 {
5372 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5373 if (utf && (c & UTF_LENGTH) != 0)
5374 {
5375 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5376 code += c & 7;
5377 }
5378 else
5379 #endif
5380 *code++ = c;
5381 if (prop_type >= 0)
5382 {
5383 *code++ = prop_type;
5384 *code++ = prop_value;
5385 }
5386 repeat_max -= repeat_min;
5387
5388 if (repeat_max == 1)
5389 {
5390 *code++ = OP_QUERY + repeat_type;
5391 }
5392 else
5393 {
5394 *code++ = OP_UPTO + repeat_type;
5395 PUT2INC(code, 0, repeat_max);
5396 }
5397 }
5398 }
5399
5400 /* The character or character type itself comes last in all cases. */
5401
5402 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5403 if (utf && (c & UTF_LENGTH) != 0)
5404 {
5405 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5406 code += c & 7;
5407 }
5408 else
5409 #endif
5410 *code++ = c;
5411
5412 /* For a repeated Unicode property match, there are two extra bytes that
5413 define the required property. */
5414
5415 #ifdef SUPPORT_UCP
5416 if (prop_type >= 0)
5417 {
5418 *code++ = prop_type;
5419 *code++ = prop_value;
5420 }
5421 #endif
5422 }
5423
5424 /* If previous was a character class or a back reference, we put the repeat
5425 stuff after it, but just skip the item if the repeat was {0,0}. */
5426
5427 else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
5428 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5429 *previous == OP_XCLASS ||
5430 #endif
5431 *previous == OP_REF || *previous == OP_REFI ||
5432 *previous == OP_DNREF || *previous == OP_DNREFI)
5433 {
5434 if (repeat_max == 0)
5435 {
5436 code = previous;
5437 goto END_REPEAT;
5438 }
5439
5440 if (repeat_min == 0 && repeat_max == -1)
5441 *code++ = OP_CRSTAR + repeat_type;
5442 else if (repeat_min == 1 && repeat_max == -1)
5443 *code++ = OP_CRPLUS + repeat_type;
5444 else if (repeat_min == 0 && repeat_max == 1)
5445 *code++ = OP_CRQUERY + repeat_type;
5446 else
5447 {
5448 *code++ = OP_CRRANGE + repeat_type;
5449 PUT2INC(code, 0, repeat_min);
5450 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
5451 PUT2INC(code, 0, repeat_max);
5452 }
5453 }
5454
5455 /* If previous was a bracket group, we may have to replicate it in certain
5456 cases. Note that at this point we can encounter only the "basic" bracket
5457 opcodes such as BRA and CBRA, as this is the place where they get converted
5458 into the more special varieties such as BRAPOS and SBRA. A test for >=
5459 OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5460 ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow
5461 repetition of assertions, but now it does, for Perl compatibility. */
5462
5463 else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5464 {
5465 register int i;
5466 int len = (int)(code - previous);
5467 pcre_uchar *bralink = NULL;
5468 pcre_uchar *brazeroptr = NULL;
5469
5470 /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
5471 we just ignore the repeat. */
5472
5473 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
5474 goto END_REPEAT;
5475
5476 /* There is no sense in actually repeating assertions. The only potential
5477 use of repetition is in cases when the assertion is optional. Therefore,
5478 if the minimum is greater than zero, just ignore the repeat. If the
5479 maximum is not zero or one, set it to 1. */
5480
5481 if (*previous < OP_ONCE) /* Assertion */
5482 {
5483 if (repeat_min > 0) goto END_REPEAT;
5484 if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
5485 }
5486
5487 /* The case of a zero minimum is special because of the need to stick
5488 OP_BRAZERO in front of it, and because the group appears once in the
5489 data, whereas in other cases it appears the minimum number of times. For
5490 this reason, it is simplest to treat this case separately, as otherwise
5491 the code gets far too messy. There are several special subcases when the
5492 minimum is zero. */
5493
5494 if (repeat_min == 0)
5495 {
5496 /* If the maximum is also zero, we used to just omit the group from the
5497 output altogether, like this:
5498
5499 ** if (repeat_max == 0)
5500 ** {
5501 ** code = previous;
5502 ** goto END_REPEAT;
5503 ** }
5504
5505 However, that fails when a group or a subgroup within it is referenced
5506 as a subroutine from elsewhere in the pattern, so now we stick in
5507 OP_SKIPZERO in front of it so that it is skipped on execution. As we
5508 don't have a list of which groups are referenced, we cannot do this
5509 selectively.
5510
5511 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
5512 and do no more at this point. However, we do need to adjust any
5513 OP_RECURSE calls inside the group that refer to the group itself or any
5514 internal or forward referenced group, because the offset is from the
5515 start of the whole regex. Temporarily terminate the pattern while doing
5516 this. */
5517
5518 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
5519 {
5520 *code = OP_END;
5521 adjust_recurse(previous, 1, utf, cd, save_hwm);
5522 memmove(previous + 1, previous, IN_UCHARS(len));
5523 code++;
5524 if (repeat_max == 0)
5525 {
5526 *previous++ = OP_SKIPZERO;
5527 goto END_REPEAT;
5528 }
5529 brazeroptr = previous; /* Save for possessive optimizing */
5530 *previous++ = OP_BRAZERO + repeat_type;
5531 }
5532
5533 /* If the maximum is greater than 1 and limited, we have to replicate
5534 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
5535 The first one has to be handled carefully because it's the original
5536 copy, which has to be moved up. The remainder can be handled by code
5537 that is common with the non-zero minimum case below. We have to
5538 adjust the value of repeat_max, since one less copy is required. Once
5539 again, we may have to adjust any OP_RECURSE calls inside the group. */
5540
5541 else
5542 {
5543 int offset;
5544 *code = OP_END;
5545 adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);
5546 memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
5547 code += 2 + LINK_SIZE;
5548 *previous++ = OP_BRAZERO + repeat_type;
5549 *previous++ = OP_BRA;
5550
5551 /* We chain together the bracket offset fields that have to be
5552 filled in later when the ends of the brackets are reached. */
5553
5554 offset = (bralink == NULL)? 0 : (int)(previous - bralink);
5555 bralink = previous;
5556 PUTINC(previous, 0, offset);
5557 }
5558
5559 repeat_max--;
5560 }
5561
5562 /* If the minimum is greater than zero, replicate the group as many
5563 times as necessary, and adjust the maximum to the number of subsequent
5564 copies that we need. If we set a first char from the group, and didn't
5565 set a required char, copy the latter from the former. If there are any
5566 forward reference subroutine calls in the group, there will be entries on
5567 the workspace list; replicate these with an appropriate increment. */
5568
5569 else
5570 {
5571 if (repeat_min > 1)
5572 {
5573 /* In the pre-compile phase, we don't actually do the replication. We
5574 just adjust the length as if we had. Do some paranoid checks for
5575 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
5576 integer type when available, otherwise double. */
5577
5578 if (lengthptr != NULL)
5579 {
5580 int delta = (repeat_min - 1)*length_prevgroup;
5581 if ((INT64_OR_DOUBLE)(repeat_min - 1)*
5582 (INT64_OR_DOUBLE)length_prevgroup >
5583 (INT64_OR_DOUBLE)INT_MAX ||
5584 OFLOW_MAX - *lengthptr < delta)
5585 {
5586 *errorcodeptr = ERR20;
5587 goto FAILED;
5588 }
5589 *lengthptr += delta;
5590 }
5591
5592 /* This is compiling for real. If there is a set first byte for
5593 the group, and we have not yet set a "required byte", set it. Make
5594 sure there is enough workspace for copying forward references before
5595 doing the copy. */
5596
5597 else
5598 {
5599 if (groupsetfirstchar && reqcharflags < 0)
5600 {
5601 reqchar = firstchar;
5602 reqcharflags = firstcharflags;
5603 }
5604
5605 for (i = 1; i < repeat_min; i++)
5606 {
5607 pcre_uchar *hc;
5608 pcre_uchar *this_hwm = cd->hwm;
5609 memcpy(code, previous, IN_UCHARS(len));
5610
5611 while (cd->hwm > cd->start_workspace + cd->workspace_size -
5612 WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5613 {
5614 int save_offset = save_hwm - cd->start_workspace;
5615 int this_offset = this_hwm - cd->start_workspace;
5616 *errorcodeptr = expand_workspace(cd);
5617 if (*errorcodeptr != 0) goto FAILED;
5618 save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5619 this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5620 }
5621
5622 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5623 {
5624 PUT(cd->hwm, 0, GET(hc, 0) + len);
5625 cd->hwm += LINK_SIZE;
5626 }
5627 save_hwm = this_hwm;
5628 code += len;
5629 }
5630 }
5631 }
5632
5633 if (repeat_max > 0) repeat_max -= repeat_min;
5634 }
5635
5636 /* This code is common to both the zero and non-zero minimum cases. If
5637 the maximum is limited, it replicates the group in a nested fashion,
5638 remembering the bracket starts on a stack. In the case of a zero minimum,
5639 the first one was set up above. In all cases the repeat_max now specifies
5640 the number of additional copies needed. Again, we must remember to
5641 replicate entries on the forward reference list. */
5642
5643 if (repeat_max >= 0)
5644 {
5645 /* In the pre-compile phase, we don't actually do the replication. We
5646 just adjust the length as if we had. For each repetition we must add 1
5647 to the length for BRAZERO and for all but the last repetition we must
5648 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
5649 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
5650 a 64-bit integer type when available, otherwise double. */
5651
5652 if (lengthptr != NULL && repeat_max > 0)
5653 {
5654 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
5655 2 - 2*LINK_SIZE; /* Last one doesn't nest */
5656 if ((INT64_OR_DOUBLE)repeat_max *
5657 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
5658 > (INT64_OR_DOUBLE)INT_MAX ||
5659 OFLOW_MAX - *lengthptr < delta)
5660 {
5661 *errorcodeptr = ERR20;
5662 goto FAILED;
5663 }
5664 *lengthptr += delta;
5665 }
5666
5667 /* This is compiling for real */
5668
5669 else for (i = repeat_max - 1; i >= 0; i--)
5670 {
5671 pcre_uchar *hc;
5672 pcre_uchar *this_hwm = cd->hwm;
5673
5674 *code++ = OP_BRAZERO + repeat_type;
5675
5676 /* All but the final copy start a new nesting, maintaining the
5677 chain of brackets outstanding. */
5678
5679 if (i != 0)
5680 {
5681 int offset;
5682 *code++ = OP_BRA;
5683 offset = (bralink == NULL)? 0 : (int)(code - bralink);
5684 bralink = code;
5685 PUTINC(code, 0, offset);
5686 }
5687
5688 memcpy(code, previous, IN_UCHARS(len));
5689
5690 /* Ensure there is enough workspace for forward references before
5691 copying them. */
5692
5693 while (cd->hwm > cd->start_workspace + cd->workspace_size -
5694 WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5695 {
5696 int save_offset = save_hwm - cd->start_workspace;
5697 int this_offset = this_hwm - cd->start_workspace;
5698 *errorcodeptr = expand_workspace(cd);
5699 if (*errorcodeptr != 0) goto FAILED;
5700 save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5701 this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5702 }
5703
5704 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5705 {
5706 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
5707 cd->hwm += LINK_SIZE;
5708 }
5709 save_hwm = this_hwm;
5710 code += len;
5711 }
5712
5713 /* Now chain through the pending brackets, and fill in their length
5714 fields (which are holding the chain links pro tem). */
5715
5716 while (bralink != NULL)
5717 {
5718 int oldlinkoffset;
5719 int offset = (int)(code - bralink + 1);
5720 pcre_uchar *bra = code - offset;
5721 oldlinkoffset = GET(bra, 1);
5722 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
5723 *code++ = OP_KET;
5724 PUTINC(code, 0, offset);
5725 PUT(bra, 1, offset);
5726 }
5727 }
5728
5729 /* If the maximum is unlimited, set a repeater in the final copy. For
5730 ONCE brackets, that's all we need to do. However, possessively repeated
5731 ONCE brackets can be converted into non-capturing brackets, as the
5732 behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
5733 deal with possessive ONCEs specially.
5734
5735 Otherwise, when we are doing the actual compile phase, check to see
5736 whether this group is one that could match an empty string. If so,
5737 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
5738 that runtime checking can be done. [This check is also applied to ONCE
5739 groups at runtime, but in a different way.]
5740
5741 Then, if the quantifier was possessive and the bracket is not a
5742 conditional, we convert the BRA code to the POS form, and the KET code to
5743 KETRPOS. (It turns out to be convenient at runtime to detect this kind of
5744 subpattern at both the start and at the end.) The use of special opcodes
5745 makes it possible to reduce greatly the stack usage in pcre_exec(). If
5746 the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
5747
5748 Then, if the minimum number of matches is 1 or 0, cancel the possessive
5749 flag so that the default action below, of wrapping everything inside
5750 atomic brackets, does not happen. When the minimum is greater than 1,
5751 there will be earlier copies of the group, and so we still have to wrap
5752 the whole thing. */
5753
5754 else
5755 {
5756 pcre_uchar *ketcode = code - 1 - LINK_SIZE;
5757 pcre_uchar *bracode = ketcode - GET(ketcode, 1);
5758
5759 /* Convert possessive ONCE brackets to non-capturing */
5760
5761 if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
5762 possessive_quantifier) *bracode = OP_BRA;
5763
5764 /* For non-possessive ONCE brackets, all we need to do is to
5765 set the KET. */
5766
5767 if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
5768 *ketcode = OP_KETRMAX + repeat_type;
5769
5770 /* Handle non-ONCE brackets and possessive ONCEs (which have been
5771 converted to non-capturing above). */
5772
5773 else
5774 {
5775 /* In the compile phase, check for empty string matching. */
5776
5777 if (lengthptr == NULL)
5778 {
5779 pcre_uchar *scode = bracode;
5780 do
5781 {
5782 if (could_be_empty_branch(scode, ketcode, utf, cd, NULL))
5783 {
5784 *bracode += OP_SBRA - OP_BRA;
5785 break;
5786 }
5787 scode += GET(scode, 1);
5788 }
5789 while (*scode == OP_ALT);
5790 }
5791
5792 /* Handle possessive quantifiers. */
5793
5794 if (possessive_quantifier)
5795 {
5796 /* For COND brackets, we wrap the whole thing in a possessively
5797 repeated non-capturing bracket, because we have not invented POS
5798 versions of the COND opcodes. Because we are moving code along, we
5799 must ensure that any pending recursive references are updated. */
5800
5801 if (*bracode == OP_COND || *bracode == OP_SCOND)
5802 {
5803 int nlen = (int)(code - bracode);
5804 *code = OP_END;
5805 adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm);
5806 memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
5807 code += 1 + LINK_SIZE;
5808 nlen += 1 + LINK_SIZE;
5809 *bracode = OP_BRAPOS;
5810 *code++ = OP_KETRPOS;
5811 PUTINC(code, 0, nlen);
5812 PUT(bracode, 1, nlen);
5813 }
5814
5815 /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
5816
5817 else
5818 {
5819 *bracode += 1; /* Switch to xxxPOS opcodes */
5820 *ketcode = OP_KETRPOS;
5821 }
5822
5823 /* If the minimum is zero, mark it as possessive, then unset the
5824 possessive flag when the minimum is 0 or 1. */
5825
5826 if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
5827 if (repeat_min < 2) possessive_quantifier = FALSE;
5828 }
5829
5830 /* Non-possessive quantifier */
5831
5832 else *ketcode = OP_KETRMAX + repeat_type;
5833 }
5834 }
5835 }
5836
5837 /* If previous is OP_FAIL, it was generated by an empty class [] in
5838 JavaScript mode. The other ways in which OP_FAIL can be generated, that is
5839 by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
5840 error above. We can just ignore the repeat in JS case. */
5841
5842 else if (*previous == OP_FAIL) goto END_REPEAT;
5843
5844 /* Else there's some kind of shambles */
5845
5846 else
5847 {
5848 *errorcodeptr = ERR11;
5849 goto FAILED;
5850 }
5851
5852 /* If the character following a repeat is '+', or if certain optimization
5853 tests above succeeded, possessive_quantifier is TRUE. For some opcodes,
5854 there are special alternative opcodes for this case. For anything else, we
5855 wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'
5856 notation is just syntactic sugar, taken from Sun's Java package, but the
5857 special opcodes can optimize it.
5858
5859 Some (but not all) possessively repeated subpatterns have already been
5860 completely handled in the code just above. For them, possessive_quantifier
5861 is always FALSE at this stage.
5862
5863 Note that the repeated item starts at tempcode, not at previous, which
5864 might be the first part of a string whose (former) last char we repeated.
5865
5866 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
5867 an 'upto' may follow. We skip over an 'exact' item, and then test the
5868 length of what remains before proceeding. */
5869
5870 if (possessive_quantifier)
5871 {
5872 int len;
5873
5874 if (*tempcode == OP_TYPEEXACT)
5875 tempcode += PRIV(OP_lengths)[*tempcode] +
5876 ((tempcode[1 + IMM2_SIZE] == OP_PROP
5877 || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
5878
5879 else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
5880 {
5881 tempcode += PRIV(OP_lengths)[*tempcode];
5882 #ifdef SUPPORT_UTF
5883 if (utf && HAS_EXTRALEN(tempcode[-1]))
5884 tempcode += GET_EXTRALEN(tempcode[-1]);
5885 #endif
5886 }
5887
5888 len = (int)(code - tempcode);
5889 if (len > 0) switch (*tempcode)
5890 {
5891 case OP_STAR: *tempcode = OP_POSSTAR; break;
5892 case OP_PLUS: *tempcode = OP_POSPLUS; break;
5893 case OP_QUERY: *tempcode = OP_POSQUERY; break;
5894 case OP_UPTO: *tempcode = OP_POSUPTO; break;
5895
5896 case OP_STARI: *tempcode = OP_POSSTARI; break;
5897 case OP_PLUSI: *tempcode = OP_POSPLUSI; break;
5898 case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
5899 case OP_UPTOI: *tempcode = OP_POSUPTOI; break;
5900
5901 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
5902 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
5903 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
5904 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
5905
5906 case OP_NOTSTARI: *tempcode = OP_NOTPOSSTARI; break;
5907 case OP_NOTPLUSI: *tempcode = OP_NOTPOSPLUSI; break;
5908 case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
5909 case OP_NOTUPTOI: *tempcode = OP_NOTPOSUPTOI; break;
5910
5911 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
5912 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
5913 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
5914 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
5915
5916 /* Because we are moving code along, we must ensure that any
5917 pending recursive references are updated. */
5918
5919 default:
5920 *code = OP_END;
5921 adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
5922 memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
5923 code += 1 + LINK_SIZE;
5924 len += 1 + LINK_SIZE;
5925 tempcode[0] = OP_ONCE;
5926 *code++ = OP_KET;
5927 PUTINC(code, 0, len);
5928 PUT(tempcode, 1, len);
5929 break;
5930 }
5931 }
5932
5933 /* In all cases we no longer have a previous item. We also set the
5934 "follows varying string" flag for subsequently encountered reqchars if
5935 it isn't already set and we have just passed a varying length item. */
5936
5937 END_REPEAT:
5938 previous = NULL;
5939 cd->req_varyopt |= reqvary;
5940 break;
5941
5942
5943 /* ===================================================================*/
5944 /* Start of nested parenthesized sub-expression, or comment or lookahead or
5945 lookbehind or option setting or condition or all the other extended
5946 parenthesis forms. */
5947
5948 case CHAR_LEFT_PARENTHESIS:
5949 newoptions = options;
5950 skipbytes = 0;
5951 bravalue = OP_CBRA;
5952 save_hwm = cd->hwm;
5953 reset_bracount = FALSE;
5954
5955 /* First deal with various "verbs" that can be introduced by '*'. */
5956
5957 ptr++;
5958 if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
5959 || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
5960 {
5961 int i, namelen;
5962 int arglen = 0;
5963 const char *vn = verbnames;
5964 const pcre_uchar *name = ptr + 1;
5965 const pcre_uchar *arg = NULL;
5966 previous = NULL;
5967 ptr++;
5968 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
5969 namelen = (int)(ptr - name);
5970
5971 /* It appears that Perl allows any characters whatsoever, other than
5972 a closing parenthesis, to appear in arguments, so we no longer insist on
5973 letters, digits, and underscores. */
5974
5975 if (*ptr == CHAR_COLON)
5976 {
5977 arg = ++ptr;
5978 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
5979 arglen = (int)(ptr - arg);
5980 if ((unsigned int)arglen > MAX_MARK)
5981 {
5982 *errorcodeptr = ERR75;
5983 goto FAILED;
5984 }
5985 }
5986
5987 if (*ptr != CHAR_RIGHT_PARENTHESIS)
5988 {
5989 *errorcodeptr = ERR60;
5990 goto FAILED;
5991 }
5992
5993 /* Scan the table of verb names */
5994
5995 for (i = 0; i < verbcount; i++)
5996 {
5997 if (namelen == verbs[i].len &&
5998 STRNCMP_UC_C8(name, vn, namelen) == 0)
5999 {
6000 int setverb;
6001
6002 /* Check for open captures before ACCEPT and convert it to
6003 ASSERT_ACCEPT if in an assertion. */
6004
6005 if (verbs[i].op == OP_ACCEPT)
6006 {
6007 open_capitem *oc;
6008 if (arglen != 0)
6009 {
6010 *errorcodeptr = ERR59;
6011 goto FAILED;
6012 }
6013 cd->had_accept = TRUE;
6014 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6015 {
6016 *code++ = OP_CLOSE;
6017 PUT2INC(code, 0, oc->number);
6018 }
6019 setverb = *code++ =
6020 (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6021
6022 /* Do not set firstchar after *ACCEPT */
6023 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
6024 }
6025
6026 /* Handle other cases with/without an argument */
6027
6028 else if (arglen == 0)
6029 {
6030 if (verbs[i].op < 0) /* Argument is mandatory */
6031 {
6032 *errorcodeptr = ERR66;
6033 goto FAILED;
6034 }
6035 setverb = *code++ = verbs[i].op;
6036 }
6037
6038 else
6039 {
6040 if (verbs[i].op_arg < 0) /* Argument is forbidden */
6041 {
6042 *errorcodeptr = ERR59;
6043 goto FAILED;
6044 }
6045 setverb = *code++ = verbs[i].op_arg;
6046 *code++ = arglen;
6047 memcpy(code, arg, IN_UCHARS(arglen));
6048 code += arglen;
6049 *code++ = 0;
6050 }
6051
6052 switch (setverb)
6053 {
6054 case OP_THEN:
6055 case OP_THEN_ARG:
6056 cd->external_flags |= PCRE_HASTHEN;
6057 break;
6058
6059 case OP_PRUNE:
6060 case OP_PRUNE_ARG:
6061 case OP_SKIP:
6062 case OP_SKIP_ARG:
6063 cd->had_pruneorskip = TRUE;
6064 break;
6065 }
6066
6067 break; /* Found verb, exit loop */
6068 }
6069
6070 vn += verbs[i].len + 1;
6071 }
6072
6073 if (i < verbcount) continue; /* Successfully handled a verb */
6074 *errorcodeptr = ERR60; /* Verb not recognized */
6075 goto FAILED;
6076 }
6077
6078 /* Deal with the extended parentheses; all are introduced by '?', and the
6079 appearance of any of them means that this is not a capturing group. */
6080
6081 else if (*ptr == CHAR_QUESTION_MARK)
6082 {
6083 int i, set, unset, namelen;
6084 int *optset;
6085 const pcre_uchar *name;
6086 pcre_uchar *slot;
6087
6088 switch (*(++ptr))
6089 {
6090 case CHAR_NUMBER_SIGN: /* Comment; skip to ket */
6091 ptr++;
6092 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6093 if (*ptr == CHAR_NULL)
6094 {
6095 *errorcodeptr = ERR18;
6096 goto FAILED;
6097 }
6098 continue;
6099
6100
6101 /* ------------------------------------------------------------ */
6102 case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
6103 reset_bracount = TRUE;
6104 /* Fall through */
6105
6106 /* ------------------------------------------------------------ */
6107 case CHAR_COLON: /* Non-capturing bracket */
6108 bravalue = OP_BRA;
6109 ptr++;
6110 break;
6111
6112
6113 /* ------------------------------------------------------------ */
6114 case CHAR_LEFT_PARENTHESIS:
6115 bravalue = OP_COND; /* Conditional group */
6116 tempptr = ptr;
6117
6118 /* A condition can be an assertion, a number (referring to a numbered
6119 group), a name (referring to a named group), or 'R', referring to
6120 recursion. R<digits> and R&name are also permitted for recursion tests.
6121
6122 There are several syntaxes for testing a named group: (?(name)) is used
6123 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
6124
6125 There are two unfortunate ambiguities, caused by history. (a) 'R' can
6126 be the recursive thing or the name 'R' (and similarly for 'R' followed
6127 by digits), and (b) a number could be a name that consists of digits.
6128 In both cases, we look for a name first; if not found, we try the other
6129 cases.
6130
6131 For compatibility with auto-callouts, we allow a callout to be
6132 specified before a condition that is an assertion. First, check for the
6133 syntax of a callout; if found, adjust the temporary pointer that is
6134 used to check for an assertion condition. That's all that is needed! */
6135
6136 if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
6137 {
6138 for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
6139 if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
6140 tempptr += i + 1;
6141 }
6142
6143 /* For conditions that are assertions, check the syntax, and then exit
6144 the switch. This will take control down to where bracketed groups,
6145 including assertions, are processed. */
6146
6147 if (tempptr[1] == CHAR_QUESTION_MARK &&
6148 (tempptr[2] == CHAR_EQUALS_SIGN ||
6149 tempptr[2] == CHAR_EXCLAMATION_MARK ||
6150 tempptr[2] == CHAR_LESS_THAN_SIGN))
6151 break;
6152
6153 /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6154 need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6155
6156 code[1+LINK_SIZE] = OP_CREF;
6157 skipbytes = 1+IMM2_SIZE;
6158 refsign = -1;
6159
6160 /* Check for a test for recursion in a named group. */
6161
6162 if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
6163 {
6164 terminator = -1;
6165 ptr += 2;
6166 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
6167 }
6168
6169 /* Check for a test for a named group's having been set, using the Perl
6170 syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6171 syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). As names may
6172 consist entirely of digits, there is scope for ambiguity. */
6173
6174 else if (ptr[1] == CHAR_LESS_THAN_SIGN)
6175 {
6176 terminator = CHAR_GREATER_THAN_SIGN;
6177 ptr++;
6178 }
6179 else if (ptr[1] == CHAR_APOSTROPHE)
6180 {
6181 terminator = CHAR_APOSTROPHE;
6182 ptr++;
6183 }
6184 else
6185 {
6186 terminator = CHAR_NULL;
6187 if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
6188 }
6189
6190 /* When a name is one of a number of duplicates, a different opcode is
6191 used and it needs more memory. Unfortunately we cannot tell whether a
6192 name is a duplicate in the first pass, so we have to allow for more
6193 memory except when we know it is a relative numerical reference. */
6194
6195 if (refsign < 0 && lengthptr != NULL) *lengthptr += IMM2_SIZE;
6196
6197 /* We now expect to read a name (possibly all digits); anything else
6198 is an error. In the case of all digits, also get it as a number. */
6199
6200 if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)
6201 {
6202 ptr += 1; /* To get the right offset */
6203 *errorcodeptr = ERR28;
6204 goto FAILED;
6205 }
6206
6207 recno = 0;
6208 name = ++ptr;
6209 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
6210 {
6211 if (recno >= 0)
6212 recno = (IS_DIGIT(*ptr))? recno * 10 + (int)(*ptr - CHAR_0) : -1;
6213 ptr++;
6214 }
6215 namelen = (int)(ptr - name);
6216
6217 /* Check the terminator */
6218
6219 if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6220 *ptr++ != CHAR_RIGHT_PARENTHESIS)
6221 {
6222 ptr--; /* Error offset */
6223 *errorcodeptr = ERR26;
6224 goto FAILED;
6225 }
6226
6227 /* Do no further checking in the pre-compile phase. */
6228
6229 if (lengthptr != NULL) break;
6230
6231 /* In the real compile we do the work of looking for the actual
6232 reference. If the string started with "+" or "-" we require the rest to
6233 be digits, in which case recno will be set. */
6234
6235 if (refsign > 0)
6236 {
6237 if (recno <= 0)
6238 {
6239 *errorcodeptr = ERR58;
6240 goto FAILED;
6241 }
6242 recno = (refsign == CHAR_MINUS)?
6243 cd->bracount - recno + 1 : recno +cd->bracount;
6244 if (recno <= 0 || recno > cd->final_bracount)
6245 {
6246 *errorcodeptr = ERR15;
6247 goto FAILED;
6248 }
6249 PUT2(code, 2+LINK_SIZE, recno);
6250 break;
6251 }
6252
6253 /* Otherwise (did not start with "+" or "-"), start by looking for the
6254 name. */
6255
6256 slot = cd->name_table;
6257 for (i = 0; i < cd->names_found; i++)
6258 {
6259 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
6260 slot += cd->name_entry_size;
6261 }
6262
6263 /* Found the named subpattern. If the name is duplicated, add one to
6264 the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6265 appropriate data values. Otherwise, just insert the unique subpattern
6266 number. */
6267
6268 if (i < cd->names_found)
6269 {
6270 int offset = i++;
6271 int count = 1;
6272 recno = GET2(slot, 0); /* Number from first found */
6273 for (; i < cd->names_found; i++)
6274 {
6275 slot += cd->name_entry_size;
6276 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0) break;
6277 count++;
6278 }
6279 if (count > 1)
6280 {
6281 PUT2(code, 2+LINK_SIZE, offset);
6282 PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6283 skipbytes += IMM2_SIZE;
6284 code[1+LINK_SIZE]++;
6285 }
6286 else /* Not a duplicated name */
6287 {
6288 PUT2(code, 2+LINK_SIZE, recno);
6289 }
6290 }
6291
6292 /* If terminator == CHAR_NULL it means that the name followed directly
6293 after the opening parenthesis [e.g. (?(abc)...] and in this case there
6294 are some further alternatives to try. For the cases where terminator !=
6295 0 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
6296 now checked all the possibilities, so give an error. */
6297
6298 else if (terminator != CHAR_NULL)
6299 {
6300 *errorcodeptr = ERR15;
6301 goto FAILED;
6302 }
6303
6304 /* Check for (?(R) for recursion. Allow digits after R to specify a
6305 specific group number. */
6306
6307 else if (*name == CHAR_R)
6308 {
6309 recno = 0;
6310 for (i = 1; i < namelen; i++)
6311 {
6312 if (!IS_DIGIT(name[i]))
6313 {
6314 *errorcodeptr = ERR15;
6315 goto FAILED;
6316 }
6317 recno = recno * 10 + name[i] - CHAR_0;
6318 }
6319 if (recno == 0) recno = RREF_ANY;
6320 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
6321 PUT2(code, 2+LINK_SIZE, recno);
6322 }
6323
6324 /* Similarly, check for the (?(DEFINE) "condition", which is always
6325 false. */
6326
6327 else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
6328 {
6329 code[1+LINK_SIZE] = OP_DEF;
6330 skipbytes = 1;
6331 }
6332
6333 /* Check for the "name" actually being a subpattern number. We are
6334 in the second pass here, so final_bracount is set. */
6335
6336 else if (recno > 0 && recno <= cd->final_bracount)
6337 {
6338 PUT2(code, 2+LINK_SIZE, recno);
6339 }
6340
6341 /* Either an unidentified subpattern, or a reference to (?(0) */
6342
6343 else
6344 {
6345 *errorcodeptr = (recno == 0)? ERR35: ERR15;
6346 goto FAILED;
6347 }
6348 break;
6349
6350
6351 /* ------------------------------------------------------------ */
6352 case CHAR_EQUALS_SIGN: /* Positive lookahead */
6353 bravalue = OP_ASSERT;
6354 cd->assert_depth += 1;
6355 ptr++;
6356 break;
6357
6358
6359 /* ------------------------------------------------------------ */
6360 case CHAR_EXCLAMATION_MARK: /* Negative lookahead */
6361 ptr++;
6362 if (*ptr == CHAR_RIGHT_PARENTHESIS) /* Optimize (?!) */
6363 {
6364 *code++ = OP_FAIL;
6365 previous = NULL;
6366 continue;
6367 }
6368 bravalue = OP_ASSERT_NOT;
6369 cd->assert_depth += 1;
6370 break;
6371
6372
6373 /* ------------------------------------------------------------ */
6374 case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */
6375 switch (ptr[1])
6376 {
6377 case CHAR_EQUALS_SIGN: /* Positive lookbehind */
6378 bravalue = OP_ASSERTBACK;
6379 cd->assert_depth += 1;
6380 ptr += 2;
6381 break;
6382
6383 case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */
6384 bravalue = OP_ASSERTBACK_NOT;
6385 cd->assert_depth += 1;
6386 ptr += 2;
6387 break;
6388
6389 default: /* Could be name define, else bad */
6390 if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
6391 goto DEFINE_NAME;
6392 ptr++; /* Correct offset for error */
6393 *errorcodeptr = ERR24;
6394 goto FAILED;
6395 }
6396 break;
6397
6398
6399 /* ------------------------------------------------------------ */
6400 case CHAR_GREATER_THAN_SIGN: /* One-time brackets */
6401 bravalue = OP_ONCE;
6402 ptr++;
6403 break;
6404
6405
6406 /* ------------------------------------------------------------ */
6407 case CHAR_C: /* Callout - may be followed by digits; */
6408 previous_callout = code; /* Save for later completion */
6409 after_manual_callout = 1; /* Skip one item before completing */
6410 *code++ = OP_CALLOUT;
6411 {
6412 int n = 0;
6413 ptr++;
6414 while(IS_DIGIT(*ptr))
6415 n = n * 10 + *ptr++ - CHAR_0;
6416 if (*ptr != CHAR_RIGHT_PARENTHESIS)
6417 {
6418 *errorcodeptr = ERR39;
6419 goto FAILED;
6420 }
6421 if (n > 255)
6422 {
6423 *errorcodeptr = ERR38;
6424 goto FAILED;
6425 }
6426 *code++ = n;
6427 PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
6428 PUT(code, LINK_SIZE, 0); /* Default length */
6429 code += 2 * LINK_SIZE;
6430 }
6431 previous = NULL;
6432 continue;
6433
6434
6435 /* ------------------------------------------------------------ */
6436 case CHAR_P: /* Python-style named subpattern handling */
6437 if (*(++ptr) == CHAR_EQUALS_SIGN ||
6438 *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */
6439 {
6440 is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
6441 terminator = CHAR_RIGHT_PARENTHESIS;
6442 goto NAMED_REF_OR_RECURSE;
6443 }
6444 else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */
6445 {
6446 *errorcodeptr = ERR41;
6447 goto FAILED;
6448 }
6449 /* Fall through to handle (?P< as (?< is handled */
6450
6451
6452 /* ------------------------------------------------------------ */
6453 DEFINE_NAME: /* Come here from (?< handling */
6454 case CHAR_APOSTROPHE:
6455 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
6456 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6457 name = ++ptr;
6458
6459 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6460 namelen = (int)(ptr - name);
6461
6462 /* In the pre-compile phase, do a syntax check, remember the longest
6463 name, and then remember the group in a vector, expanding it if
6464 necessary. Duplicates for the same number are skipped; other duplicates
6465 are checked for validity. In the actual compile, there is nothing to
6466 do. */
6467
6468 if (lengthptr != NULL)
6469 {
6470 named_group *ng;
6471 pcre_uint32 number = cd->bracount + 1;
6472
6473 if (*ptr != (pcre_uchar)terminator)
6474 {
6475 *errorcodeptr = ERR42;
6476 goto FAILED;
6477 }
6478
6479 if (cd->names_found >= MAX_NAME_COUNT)
6480 {
6481 *errorcodeptr = ERR49;
6482 goto FAILED;
6483 }
6484
6485 if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
6486 {
6487 cd->name_entry_size = namelen + IMM2_SIZE + 1;
6488 if (namelen > MAX_NAME_SIZE)
6489 {
6490 *errorcodeptr = ERR48;
6491 goto FAILED;
6492 }
6493 }
6494
6495 /* Scan the list to check for duplicates. For duplicate names, if the
6496 number is the same, break the loop, which causes the name to be
6497 discarded; otherwise, if DUPNAMES is not set, give an error.
6498 If it is set, allow the name with a different number, but continue
6499 scanning in case this is a duplicate with the same number. For
6500 non-duplicate names, give an error if the number is duplicated. */
6501
6502 ng = cd->named_groups;
6503 for (i = 0; i < cd->names_found; i++, ng++)
6504 {
6505 if (namelen == ng->length &&
6506 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
6507 {
6508 if (ng->number == number) break;
6509 if ((options & PCRE_DUPNAMES) == 0)
6510 {
6511 *errorcodeptr = ERR43;
6512 goto FAILED;
6513 }
6514 cd->dupnames = TRUE; /* Duplicate names exist */
6515 }
6516 else if (ng->number == number)
6517 {
6518 *errorcodeptr = ERR65;
6519 goto FAILED;
6520 }
6521 }
6522
6523 if (i >= cd->names_found) /* Not a duplicate with same number */
6524 {
6525 /* Increase the list size if necessary */
6526
6527 if (cd->names_found >= cd->named_group_list_size)
6528 {
6529 int newsize = cd->named_group_list_size * 2;
6530 named_group *newspace = (PUBL(malloc))
6531 (newsize * sizeof(named_group));
6532
6533 if (newspace == NULL)
6534 {
6535 *errorcodeptr = ERR21;
6536 goto FAILED;
6537 }
6538
6539 memcpy(newspace, cd->named_groups,
6540 cd->named_group_list_size * sizeof(named_group));
6541 if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
6542 (PUBL(free))((void *)cd->named_groups);
6543 cd->named_groups = newspace;
6544 cd->named_group_list_size = newsize;
6545 }
6546
6547 cd->named_groups[cd->names_found].name = name;
6548 cd->named_groups[cd->names_found].length = namelen;
6549 cd->named_groups[cd->names_found].number = number;
6550 cd->names_found++;
6551 }
6552 }
6553
6554 ptr++; /* Move past > or ' in both passes. */
6555 goto NUMBERED_GROUP;
6556
6557
6558 /* ------------------------------------------------------------ */
6559 case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */
6560 terminator = CHAR_RIGHT_PARENTHESIS;
6561 is_recurse = TRUE;
6562 /* Fall through */
6563
6564 /* We come here from the Python syntax above that handles both
6565 references (?P=name) and recursion (?P>name), as well as falling
6566 through from the Perl recursion syntax (?&name). We also come here from
6567 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
6568 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
6569
6570 NAMED_REF_OR_RECURSE:
6571 name = ++ptr;
6572 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6573 namelen = (int)(ptr - name);
6574
6575 /* In the pre-compile phase, do a syntax check. We used to just set
6576 a dummy reference number, because it was not used in the first pass.
6577 However, with the change of recursive back references to be atomic,
6578 we have to look for the number so that this state can be identified, as
6579 otherwise the incorrect length is computed. If it's not a backwards
6580 reference, the dummy number will do. */
6581
6582 if (lengthptr != NULL)
6583 {
6584 named_group *ng;
6585
6586 if (namelen == 0)
6587 {
6588 *errorcodeptr = ERR62;
6589 goto FAILED;
6590 }
6591 if (*ptr != (pcre_uchar)terminator)
6592 {
6593 *errorcodeptr = ERR42;
6594 goto FAILED;
6595 }
6596 if (namelen > MAX_NAME_SIZE)
6597 {
6598 *errorcodeptr = ERR48;
6599 goto FAILED;
6600 }
6601
6602 /* The name table does not exist in the first pass; instead we must
6603 scan the list of names encountered so far in order to get the
6604 number. If the name is not found, set the value to 0 for a forward
6605 reference. */
6606
6607 ng = cd->named_groups;
6608 for (i = 0; i < cd->names_found; i++, ng++)
6609 {
6610 if (namelen == ng->length &&
6611 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
6612 break;
6613 }
6614 recno = (i < cd->names_found)? ng->number : 0;
6615
6616 /* Count named back references. */
6617
6618 if (!is_recurse) cd->namedrefcount++;
6619 }
6620
6621 /* In the real compile, search the name table. We check the name
6622 first, and then check that we have reached the end of the name in the
6623 table. That way, if the name is longer than any in the table, the
6624 comparison will fail without reading beyond the table entry. */
6625
6626 else
6627 {
6628 slot = cd->name_table;
6629 for (i = 0; i < cd->names_found; i++)
6630 {
6631 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
6632 slot[IMM2_SIZE+namelen] == 0)
6633 break;
6634 slot += cd->name_entry_size;
6635 }
6636
6637 if (i < cd->names_found)
6638 {
6639 recno = GET2(slot, 0);
6640 }
6641 else
6642 {
6643 *errorcodeptr = ERR15;
6644 goto FAILED;
6645 }
6646 }
6647
6648 /* In both phases, for recursions, we can now go to the code that
6649 handles numerical recursion. */
6650
6651 if (is_recurse) goto HANDLE_RECURSION;
6652
6653