/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory | Revision Log


Revision 1373 - (show annotations)
Sat Oct 12 14:54:53 2013 UTC (5 years, 11 months ago) by chpe
File MIME type: text/plain
File size: 293836 byte(s)
Fix \o{...} to accept characters between 0x80000000 and 0xffffffff

The 32-bit library in non-UTF-32 mode can accept any 32-bit character, not
just up to 0x7fffffff.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2013 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
57 is also used by pcretest. PCRE_DEBUG is not defined when building a production
58 library. We do not need to select pcre16_printint.c specially, because the
59 COMPILE_PCREx macro will already be appropriately set. */
60
61 #ifdef PCRE_DEBUG
62 /* pcre_printint.c should not include any headers */
63 #define PCRE_INCLUDED
64 #include "pcre_printint.c"
65 #undef PCRE_INCLUDED
66 #endif
67
68
/* Macro for setting individual bits in class bitmaps: it sets bit number b in
the byte-array bitmap a. The bitmap argument and the whole expansion are
parenthesized so that the macro is a single, self-contained expression whatever
is passed as the bitmap (for example, a pointer plus an offset) and wherever it
is used. Note that b is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)&7)))
72
73 /* Maximum length value to check against when making sure that the integer that
74 holds the compiled pattern length does not overflow. We make it a bit less than
75 INT_MAX to allow for adding in group terminating bytes, so that we don't have
76 to check them every time. */
77
78 #define OFLOW_MAX (INT_MAX - 20)
79
80 /* Definitions to allow mutual recursion */
81
82 static int
83 add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
84 const pcre_uint32 *, unsigned int);
85
86 static BOOL
87 compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
88 pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
89 compile_data *, int *);
90
91
92
93 /*************************************************
94 * Code parameters and static tables *
95 *************************************************/
96
97 /* This value specifies the size of stack workspace that is used during the
98 first pre-compile phase that determines how much memory is required. The regex
99 is partly compiled into this space, but the compiled parts are discarded as
100 soon as they can be, so that hopefully there will never be an overrun. The code
101 does, however, check for an overrun. The largest amount I've seen used is 218,
102 so this number is very generous.
103
104 The same workspace is used during the second, actual compile phase for
105 remembering forward references to groups so that they can be filled in at the
106 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
107 is 4 there is plenty of room for most patterns. However, the memory can get
108 filled up by repetitions of forward references, for example patterns like
109 /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
110 that the workspace is expanded using malloc() in this situation. The value
111 below is therefore a minimum, and we put a maximum on it for safety. The
112 minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
113 kicks in at the same number of forward references in all cases. */
114
115 #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
116 #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
117
118 /* This value determines the size of the initial vector that is used for
119 remembering named groups during the pre-compile. It is allocated on the stack,
120 but if it is too small, it is expanded using malloc(), in a similar way to the
121 workspace. The value is the number of slots in the list. */
122
123 #define NAMED_GROUP_LIST_SIZE 20
124
125 /* The overrun tests check for a slightly smaller size so that they detect the
126 overrun before it actually does run off the end of the data block. */
127
128 #define WORK_SIZE_SAFETY_MARGIN (100)
129
130 /* Private flags added to firstchar and reqchar. */
131
132 #define REQ_CASELESS (1 << 0) /* Indicates caselessness */
133 #define REQ_VARY (1 << 1) /* Reqchar followed non-literal item */
134 /* Negative values for the firstchar and reqchar flags */
135 #define REQ_UNSET (-2)
136 #define REQ_NONE (-1)
137
/* Repeated character flags. The constant has type long; the suffix is written
as an upper case L because a lower case l is easily misread as the digit 1. */

#define UTF_LENGTH 0x10000000L /* The char contains its length. */
141
142 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
143 are simple data values; negative values are for special things like \d and so
144 on. Zero means further processing is needed (for things like \x), or the escape
145 is invalid. */
146
147 #ifndef EBCDIC
148
149 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
150 in UTF-8 mode. */
151
152 static const short int escapes[] = {
153 0, 0,
154 0, 0,
155 0, 0,
156 0, 0,
157 0, 0,
158 CHAR_COLON, CHAR_SEMICOLON,
159 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
160 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
161 CHAR_COMMERCIAL_AT, -ESC_A,
162 -ESC_B, -ESC_C,
163 -ESC_D, -ESC_E,
164 0, -ESC_G,
165 -ESC_H, 0,
166 0, -ESC_K,
167 0, 0,
168 -ESC_N, 0,
169 -ESC_P, -ESC_Q,
170 -ESC_R, -ESC_S,
171 0, 0,
172 -ESC_V, -ESC_W,
173 -ESC_X, 0,
174 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
175 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
176 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
177 CHAR_GRAVE_ACCENT, 7,
178 -ESC_b, 0,
179 -ESC_d, ESC_e,
180 ESC_f, 0,
181 -ESC_h, 0,
182 0, -ESC_k,
183 0, 0,
184 ESC_n, 0,
185 -ESC_p, 0,
186 ESC_r, -ESC_s,
187 ESC_tee, 0,
188 -ESC_v, -ESC_w,
189 0, 0,
190 -ESC_z
191 };
192
193 #else
194
195 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
196
197 static const short int escapes[] = {
198 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
199 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
200 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
201 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
202 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
203 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
204 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
205 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
206 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
207 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
208 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
209 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
210 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
211 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
212 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
213 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
214 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
215 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
216 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
217 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
218 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
219 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
220 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
221 };
222 #endif
223
224
225 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
226 searched linearly. Put all the names into a single string, in order to reduce
227 the number of relocations when a shared library is dynamically linked. The
228 string is built from string macros so that it works in UTF-8 mode on EBCDIC
229 platforms. */
230
/* Description of one special "verb" such as (*PRUNE). */

typedef struct verbitem {
  int   len;                 /* Length of verb name */
  int   op;                  /* Op when no arg, or -1 if arg mandatory */
  int   op_arg;              /* Op when arg present, or -1 if not allowed */
} verbitem;
236
237 static const char verbnames[] =
238 "\0" /* Empty name is a shorthand for MARK */
239 STRING_MARK0
240 STRING_ACCEPT0
241 STRING_COMMIT0
242 STRING_F0
243 STRING_FAIL0
244 STRING_PRUNE0
245 STRING_SKIP0
246 STRING_THEN;
247
248 static const verbitem verbs[] = {
249 { 0, -1, OP_MARK },
250 { 4, -1, OP_MARK },
251 { 6, OP_ACCEPT, -1 },
252 { 6, OP_COMMIT, -1 },
253 { 1, OP_FAIL, -1 },
254 { 4, OP_FAIL, -1 },
255 { 5, OP_PRUNE, OP_PRUNE_ARG },
256 { 4, OP_SKIP, OP_SKIP_ARG },
257 { 4, OP_THEN, OP_THEN_ARG }
258 };
259
260 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
261
262
263 /* Tables of names of POSIX character classes and their lengths. The names are
264 now all in a single string, to reduce the number of relocations when a shared
265 library is dynamically loaded. The list of lengths is terminated by a zero
266 length entry. The first three must be alpha, lower, upper, as this is assumed
267 for handling case independence. */
268
269 static const char posix_names[] =
270 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
271 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
272 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
273 STRING_word0 STRING_xdigit;
274
275 static const pcre_uint8 posix_name_lengths[] = {
276 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
277
278 /* Table of class bit maps for each POSIX class. Each class is formed from a
279 base map, with an optional addition or removal of another map. Then, for some
280 classes, there is some additional tweaking: for [:blank:] the vertical space
281 characters are removed, and for [:alpha:] and [:alnum:] the underscore
282 character is removed. The triples in the table consist of the base map offset,
283 second map offset or -1 if no second map, and a non-negative value for map
284 addition or a negative value for map subtraction (if there are two maps). The
285 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
286 remove vertical space characters, 2 => remove underscore. */
287
288 static const int posix_class_maps[] = {
289 cbit_word, cbit_digit, -2, /* alpha */
290 cbit_lower, -1, 0, /* lower */
291 cbit_upper, -1, 0, /* upper */
292 cbit_word, -1, 2, /* alnum - word without underscore */
293 cbit_print, cbit_cntrl, 0, /* ascii */
294 cbit_space, -1, 1, /* blank - a GNU extension */
295 cbit_cntrl, -1, 0, /* cntrl */
296 cbit_digit, -1, 0, /* digit */
297 cbit_graph, -1, 0, /* graph */
298 cbit_print, -1, 0, /* print */
299 cbit_punct, -1, 0, /* punct */
300 cbit_space, -1, 0, /* space */
301 cbit_word, -1, 0, /* word - a Perl extension */
302 cbit_xdigit,-1, 0 /* xdigit */
303 };
304
305 /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
306 substitutes must be in the order of the names, defined above, and there are
307 both positive and negative cases. NULL means no substitute. */
308
309 #ifdef SUPPORT_UCP
310 static const pcre_uchar string_PNd[] = {
311 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
312 CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
313 static const pcre_uchar string_pNd[] = {
314 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
315 CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
316 static const pcre_uchar string_PXsp[] = {
317 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
318 CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
319 static const pcre_uchar string_pXsp[] = {
320 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
321 CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
322 static const pcre_uchar string_PXwd[] = {
323 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
324 CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
325 static const pcre_uchar string_pXwd[] = {
326 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
327 CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
328
329 static const pcre_uchar *substitutes[] = {
330 string_PNd, /* \D */
331 string_pNd, /* \d */
332 string_PXsp, /* \S */ /* NOTE: Xsp is Perl space */
333 string_pXsp, /* \s */
334 string_PXwd, /* \W */
335 string_pXwd /* \w */
336 };
337
338 static const pcre_uchar string_pL[] = {
339 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
340 CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
341 static const pcre_uchar string_pLl[] = {
342 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
343 CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
344 static const pcre_uchar string_pLu[] = {
345 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
346 CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
347 static const pcre_uchar string_pXan[] = {
348 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
349 CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
350 static const pcre_uchar string_h[] = {
351 CHAR_BACKSLASH, CHAR_h, '\0' };
352 static const pcre_uchar string_pXps[] = {
353 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
354 CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
355 static const pcre_uchar string_PL[] = {
356 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
357 CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
358 static const pcre_uchar string_PLl[] = {
359 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
360 CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
361 static const pcre_uchar string_PLu[] = {
362 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
363 CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
364 static const pcre_uchar string_PXan[] = {
365 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
366 CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
367 static const pcre_uchar string_H[] = {
368 CHAR_BACKSLASH, CHAR_H, '\0' };
369 static const pcre_uchar string_PXps[] = {
370 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
371 CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
372
373 static const pcre_uchar *posix_substitutes[] = {
374 string_pL, /* alpha */
375 string_pLl, /* lower */
376 string_pLu, /* upper */
377 string_pXan, /* alnum */
378 NULL, /* ascii */
379 string_h, /* blank */
380 NULL, /* cntrl */
381 string_pNd, /* digit */
382 NULL, /* graph */
383 NULL, /* print */
384 NULL, /* punct */
385 string_pXps, /* space */ /* NOTE: Xps is POSIX space */
386 string_pXwd, /* word */
387 NULL, /* xdigit */
388 /* Negated cases */
389 string_PL, /* ^alpha */
390 string_PLl, /* ^lower */
391 string_PLu, /* ^upper */
392 string_PXan, /* ^alnum */
393 NULL, /* ^ascii */
394 string_H, /* ^blank */
395 NULL, /* ^cntrl */
396 string_PNd, /* ^digit */
397 NULL, /* ^graph */
398 NULL, /* ^print */
399 NULL, /* ^punct */
400 string_PXps, /* ^space */ /* NOTE: Xps is POSIX space */
401 string_PXwd, /* ^word */
402 NULL /* ^xdigit */
403 };
404 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
405 #endif
406
407 #define STRING(a) # a
408 #define XSTRING(s) STRING(s)
409
410 /* The texts of compile-time error messages. These are "char *" because they
411 are passed to the outside world. Do not ever re-use any error number, because
412 they are documented. Always add a new error instead. Messages marked DEAD below
413 are no longer used. This used to be a table of strings, but in order to reduce
414 the number of relocations needed when a shared library is loaded dynamically,
415 it is now one long string. We cannot use a table of offsets, because the
416 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
417 simply count through to the one we want - this isn't a performance issue
418 because these strings are used only when there is a compilation error.
419
420 Each substring ends with \0 to insert a null character. This includes the final
421 substring, so that the whole string ends with \0\0, which can be detected when
422 counting through. */
423
424 static const char error_texts[] =
425 "no error\0"
426 "\\ at end of pattern\0"
427 "\\c at end of pattern\0"
428 "unrecognized character follows \\\0"
429 "numbers out of order in {} quantifier\0"
430 /* 5 */
431 "number too big in {} quantifier\0"
432 "missing terminating ] for character class\0"
433 "invalid escape sequence in character class\0"
434 "range out of order in character class\0"
435 "nothing to repeat\0"
436 /* 10 */
437 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
438 "internal error: unexpected repeat\0"
439 "unrecognized character after (? or (?-\0"
440 "POSIX named classes are supported only within a class\0"
441 "missing )\0"
442 /* 15 */
443 "reference to non-existent subpattern\0"
444 "erroffset passed as NULL\0"
445 "unknown option bit(s) set\0"
446 "missing ) after comment\0"
447 "parentheses nested too deeply\0" /** DEAD **/
448 /* 20 */
449 "regular expression is too large\0"
450 "failed to get memory\0"
451 "unmatched parentheses\0"
452 "internal error: code overflow\0"
453 "unrecognized character after (?<\0"
454 /* 25 */
455 "lookbehind assertion is not fixed length\0"
456 "malformed number or name after (?(\0"
457 "conditional group contains more than two branches\0"
458 "assertion expected after (?(\0"
459 "(?R or (?[+-]digits must be followed by )\0"
460 /* 30 */
461 "unknown POSIX class name\0"
462 "POSIX collating elements are not supported\0"
463 "this version of PCRE is compiled without UTF support\0"
464 "spare error\0" /** DEAD **/
465 "character value in \\x{} or \\o{} is too large\0"
466 /* 35 */
467 "invalid condition (?(0)\0"
468 "\\C not allowed in lookbehind assertion\0"
469 "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
470 "number after (?C is > 255\0"
471 "closing ) for (?C expected\0"
472 /* 40 */
473 "recursive call could loop indefinitely\0"
474 "unrecognized character after (?P\0"
475 "syntax error in subpattern name (missing terminator)\0"
476 "two named subpatterns have the same name\0"
477 "invalid UTF-8 string\0"
478 /* 45 */
479 "support for \\P, \\p, and \\X has not been compiled\0"
480 "malformed \\P or \\p sequence\0"
481 "unknown property name after \\P or \\p\0"
482 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
483 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
484 /* 50 */
485 "repeated subpattern is too long\0" /** DEAD **/
486 "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
487 "internal error: overran compiling workspace\0"
488 "internal error: previously-checked referenced subpattern not found\0"
489 "DEFINE group contains more than one branch\0"
490 /* 55 */
491 "repeating a DEFINE group is not allowed\0" /** DEAD **/
492 "inconsistent NEWLINE options\0"
493 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
494 "a numbered reference must not be zero\0"
495 "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
496 /* 60 */
497 "(*VERB) not recognized or malformed\0"
498 "number is too big\0"
499 "subpattern name expected\0"
500 "digit expected after (?+\0"
501 "] is an invalid data character in JavaScript compatibility mode\0"
502 /* 65 */
503 "different names for subpatterns of the same number are not allowed\0"
504 "(*MARK) must have an argument\0"
505 "this version of PCRE is not compiled with Unicode property support\0"
506 "\\c must be followed by an ASCII character\0"
507 "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
508 /* 70 */
509 "internal error: unknown opcode in find_fixedlength()\0"
510 "\\N is not supported in a class\0"
511 "too many forward references\0"
512 "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
513 "invalid UTF-16 string\0"
514 /* 75 */
515 "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
516 "character value in \\u.... sequence is too large\0"
517 "invalid UTF-32 string\0"
518 "setting UTF is disabled by the application\0"
519 "non-hex character in \\x{} (closing brace missing?)\0"
520 /* 80 */
521 "non-octal character in \\o{} (closing brace missing?)\0"
522 "missing opening brace after \\o\0"
523 ;
524
525 /* Table to identify digits and hex digits. This is used when compiling
526 patterns. Note that the tables in chartables are dependent on the locale, and
527 may mark arbitrary characters as digits - but the PCRE compiling code expects
528 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
529 a private table here. It costs 256 bytes, but it is a lot faster than doing
530 character value tests (at least in some simple cases I timed), and in some
531 applications one wants PCRE to compile efficiently as well as match
532 efficiently.
533
534 For convenience, we use the same bit definitions as in chartables:
535
536 0x04 decimal digit
537 0x08 hexadecimal digit
538
539 Then we can use ctype_digit and ctype_xdigit in the code. */
540
541 /* Using a simple comparison for decimal numbers rather than a memory read
542 is much faster, and the resulting code is simpler (the compiler turns it
543 into a subtraction and unsigned comparison). */
544
545 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
546
547 #ifndef EBCDIC
548
549 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
550 UTF-8 mode. */
551
552 static const pcre_uint8 digitab[] =
553 {
554 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
555 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
556 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
557 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
558 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
559 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
560 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
561 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
562 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
563 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
564 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
565 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
566 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
567 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
568 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
569 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
570 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
571 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
572 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
573 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
574 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
575 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
576 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
577 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
578 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
579 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
580 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
581 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
582 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
583 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
584 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
585 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
586
587 #else
588
589 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
590
591 static const pcre_uint8 digitab[] =
592 {
593 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
594 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
595 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
596 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
597 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
598 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
599 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
600 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
603 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
604 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
605 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
606 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
607 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
608 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
609 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
610 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
611 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
612 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
613 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
614 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
615 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
616 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
617 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
618 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
619 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
620 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
621 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
622 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
623 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
624 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
625
626 static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
627 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
628 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
629 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
630 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
631 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
632 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
633 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
634 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
635 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
636 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
637 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
638 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
639 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
640 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
641 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
642 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
643 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
644 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
645 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
646 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
647 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
648 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
649 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
650 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
651 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
652 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
653 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
654 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
655 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
656 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
657 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
658 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
659 #endif
660
661
662 /* This table is used to check whether auto-possessification is possible
663 between adjacent character-type opcodes. The left-hand (repeated) opcode is
664 used to select the row, and the right-hand opcode is used to select the column.
665 A value of 1 means that auto-possessification is OK. For example, the second
666 value in the first row means that \D+\d can be turned into \D++\d.
667
668 The Unicode property types (\P and \p) have to be present to fill out the table
669 because of what their opcode values are, but the table values should always be
670 zero because property types are handled separately in the code. The last four
671 columns apply to items that cannot be repeated, so there is no need to have
672 rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
673 *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
674
675 #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
676 #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
677
678 static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
679 /* \D \d \S \s \W \w . .+ \C \P \p \R \H \h \V \v \X \Z \z $ $M */
680 { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \D */
681 { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \d */
682 { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \S */
683 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \s */
684 { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \W */
685 { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \w */
686 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* . */
687 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* .+ */
688 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \C */
689 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \P */
690 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \p */
691 { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \R */
692 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \H */
693 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \h */
694 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \V */
695 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 }, /* \v */
696 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 } /* \X */
697 };
698
699
700 /* This table is used to check whether auto-possessification is possible
701 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
702 left-hand (repeated) opcode is used to select the row, and the right-hand
703 opcode is used to select the column. The values are as follows:
704
705 0 Always return FALSE (never auto-possessify)
706 1 Character groups are distinct (possessify if both are OP_PROP)
707 2 Check character categories in the same group (general or particular)
708 3 TRUE if the two opcodes are not the same (PROP vs NOTPROP)
709
710 4 Check left general category vs right particular category
711 5 Check right general category vs left particular category
712
713 6 Left alphanum vs right general category
714 7 Left space vs right general category
715 8 Left word vs right general category
716
717 9 Right alphanum vs left general category
718 10 Right space vs left general category
719 11 Right word vs left general category
720
721 12 Left alphanum vs right particular category
722 13 Left space vs right particular category
723 14 Left word vs right particular category
724
725 15 Right alphanum vs left particular category
726 16 Right space vs left particular category
727 17 Right word vs left particular category
728 */
729
730 static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
731 /* ANY LAMP GC PC SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
732 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_ANY */
733 { 0, 3, 0, 0, 0, 3, 1, 1, 0, 0, 0 }, /* PT_LAMP */
734 { 0, 0, 2, 4, 0, 9, 10, 10, 11, 0, 0 }, /* PT_GC */
735 { 0, 0, 5, 2, 0, 15, 16, 16, 17, 0, 0 }, /* PT_PC */
736 { 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, /* PT_SC */
737 { 0, 3, 6, 12, 0, 3, 1, 1, 0, 0, 0 }, /* PT_ALNUM */
738 { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_SPACE */
739 { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_PXSPACE */
740 { 0, 0, 8, 14, 0, 0, 1, 1, 3, 0, 0 }, /* PT_WORD */
741 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */
742 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 } /* PT_UCNC */
743 };
744
745 /* This table is used to check whether auto-possessification is possible
746 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
747 specifies a general category and the other specifies a particular category. The
748 row is selected by the general category and the column by the particular
749 category. The value is 1 if the particular category is not part of the general
750 category. */
751
752 static const pcre_uint8 catposstab[7][30] = {
753 /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
754 { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* C */
755 { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* L */
756 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* M */
757 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* N */
758 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 }, /* P */
759 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 }, /* S */
760 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 } /* Z */
761 };
762
763 /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
764 a general or particular category. The properties in each row are those
765 that apply to the character set in question. Duplication means that a little
766 unnecessary work is done when checking, but this keeps things much simpler
767 because they can all use the same code. For more details see the comment where
768 this table is used.
769
770 Note: SPACE and PXSPACE used to be different because Perl excluded VT from
771 "space", but from Perl 5.18 it's included, so both categories are treated the
772 same here. */
773
774 static const pcre_uint8 posspropstab[3][4] = {
775 { ucp_L, ucp_N, ucp_N, ucp_Nl }, /* ALNUM, 3rd and 4th values redundant */
776 { ucp_Z, ucp_Z, ucp_C, ucp_Cc }, /* SPACE and PXSPACE, 2nd value redundant */
777 { ucp_L, ucp_N, ucp_P, ucp_Po } /* WORD */
778 };
779
780
781
782 /*************************************************
783 * Find an error text *
784 *************************************************/
785
786 /* The error texts are now all in one long string, to save on relocations. As
787 some of the text is of unknown length, we can't use a table of offsets.
788 Instead, just count through the strings. This is not a performance issue
789 because it happens only when there has been a compilation error.
790
791 Argument: the error number
792 Returns: pointer to the error string
793 */
794
795 static const char *
796 find_error_text(int n)
797 {
798 const char *s = error_texts;
799 for (; n > 0; n--)
800 {
801 while (*s++ != CHAR_NULL) {};
802 if (*s == CHAR_NULL) return "Error text not found (please report)";
803 }
804 return s;
805 }
806
807
808
809 /*************************************************
810 * Expand the workspace *
811 *************************************************/
812
813 /* This function is called during the second compiling phase, if the number of
814 forward references fills the existing workspace, which is originally a block on
815 the stack. A larger block is obtained from malloc() unless the ultimate limit
816 has been reached or the increase will be rather small.
817
818 Argument: pointer to the compile data block
819 Returns: 0 if all went well, else an error number
820 */
821
822 static int
823 expand_workspace(compile_data *cd)
824 {
825 pcre_uchar *newspace;
826 int newsize = cd->workspace_size * 2;
827
828 if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
829 if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
830 newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
831 return ERR72;
832
833 newspace = (PUBL(malloc))(IN_UCHARS(newsize));
834 if (newspace == NULL) return ERR21;
835 memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
836 cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
837 if (cd->workspace_size > COMPILE_WORK_SIZE)
838 (PUBL(free))((void *)cd->start_workspace);
839 cd->start_workspace = newspace;
840 cd->workspace_size = newsize;
841 return 0;
842 }
843
844
845
846 /*************************************************
847 * Check for counted repeat *
848 *************************************************/
849
850 /* This function is called when a '{' is encountered in a place where it might
851 start a quantifier. It looks ahead to see if it really is a quantifier or not.
852 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
853 where the ddds are digits.
854
855 Arguments:
856 p pointer to the first char after '{'
857
858 Returns: TRUE or FALSE
859 */
860
861 static BOOL
862 is_counted_repeat(const pcre_uchar *p)
863 {
864 if (!IS_DIGIT(*p)) return FALSE;
865 p++;
866 while (IS_DIGIT(*p)) p++;
867 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
868
869 if (*p++ != CHAR_COMMA) return FALSE;
870 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
871
872 if (!IS_DIGIT(*p)) return FALSE;
873 p++;
874 while (IS_DIGIT(*p)) p++;
875
876 return (*p == CHAR_RIGHT_CURLY_BRACKET);
877 }
878
879
880
881 /*************************************************
882 * Handle escapes *
883 *************************************************/
884
885 /* This function is called when a \ has been encountered. It either returns a
886 positive value for a simple escape such as \n, or 0 for a data character which
887 will be placed in chptr. A backreference to group n is returned as negative n.
888 When UTF-8 is enabled, a positive value greater than 255 may be returned in
889 chptr. On entry, ptr is pointing at the \. On exit, it is on the final
890 character of the escape sequence.
891
892 Arguments:
893 ptrptr points to the pattern position pointer
894 chptr points to a returned data character
895 errorcodeptr points to the errorcode variable
896 bracount number of previous extracting brackets
897 options the options bits
898 isclass TRUE if inside a character class
899
900 Returns: zero => a data character
901 positive => a special escape sequence
902 negative => a back reference
903 on error, errorcodeptr is set
904 */
905
906 static int
907 check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
908 int bracount, int options, BOOL isclass)
909 {
910 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
911 BOOL utf = (options & PCRE_UTF8) != 0;
912 const pcre_uchar *ptr = *ptrptr + 1;
913 pcre_uint32 c;
914 int escape = 0;
915 int i;
916
917 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
918 ptr--; /* Set pointer back to the last byte */
919
920 /* If backslash is at the end of the pattern, it's an error. */
921
922 if (c == CHAR_NULL) *errorcodeptr = ERR1;
923
924 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
925 in a table. A non-zero result is something that can be returned immediately.
926 Otherwise further processing may be required. */
927
928 #ifndef EBCDIC /* ASCII/UTF-8 coding */
929 /* Not alphanumeric */
930 else if (c < CHAR_0 || c > CHAR_z) {}
931 else if ((i = escapes[c - CHAR_0]) != 0)
932 { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
933
934 #else /* EBCDIC coding */
935 /* Not alphanumeric */
936 else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
937 else if ((i = escapes[c - 0x48]) != 0) { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
938 #endif
939
940 /* Escapes that need further processing, or are illegal. */
941
942 else
943 {
944 const pcre_uchar *oldptr;
945 BOOL braced, negated, overflow;
946 int s;
947
948 switch (c)
949 {
950 /* A number of Perl escapes are not handled by PCRE. We give an explicit
951 error. */
952
953 case CHAR_l:
954 case CHAR_L:
955 *errorcodeptr = ERR37;
956 break;
957
958 case CHAR_u:
959 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
960 {
961 /* In JavaScript, \u must be followed by four hexadecimal numbers.
962 Otherwise it is a lowercase u letter. */
963 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
964 && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
965 && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
966 && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
967 {
968 c = 0;
969 for (i = 0; i < 4; ++i)
970 {
971 register pcre_uint32 cc = *(++ptr);
972 #ifndef EBCDIC /* ASCII/UTF-8 coding */
973 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
974 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
975 #else /* EBCDIC coding */
976 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
977 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
978 #endif
979 }
980
981 #if defined COMPILE_PCRE8
982 if (c > (utf ? 0x10ffffU : 0xffU))
983 #elif defined COMPILE_PCRE16
984 if (c > (utf ? 0x10ffffU : 0xffffU))
985 #elif defined COMPILE_PCRE32
986 if (utf && c > 0x10ffffU)
987 #endif
988 {
989 *errorcodeptr = ERR76;
990 }
991 else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
992 }
993 }
994 else
995 *errorcodeptr = ERR37;
996 break;
997
998 case CHAR_U:
999 /* In JavaScript, \U is an uppercase U letter. */
1000 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
1001 break;
1002
1003 /* In a character class, \g is just a literal "g". Outside a character
1004 class, \g must be followed by one of a number of specific things:
1005
1006 (1) A number, either plain or braced. If positive, it is an absolute
1007 backreference. If negative, it is a relative backreference. This is a Perl
1008 5.10 feature.
1009
1010 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1011 is part of Perl's movement towards a unified syntax for back references. As
1012 this is synonymous with \k{name}, we fudge it up by pretending it really
1013 was \k.
1014
1015 (3) For Oniguruma compatibility we also support \g followed by a name or a
1016 number either in angle brackets or in single quotes. However, these are
1017 (possibly recursive) subroutine calls, _not_ backreferences. Just return
1018 the ESC_g code (cf \k). */
1019
1020 case CHAR_g:
1021 if (isclass) break;
1022 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
1023 {
1024 escape = ESC_g;
1025 break;
1026 }
1027
1028 /* Handle the Perl-compatible cases */
1029
1030 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1031 {
1032 const pcre_uchar *p;
1033 for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
1034 if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
1035 if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
1036 {
1037 escape = ESC_k;
1038 break;
1039 }
1040 braced = TRUE;
1041 ptr++;
1042 }
1043 else braced = FALSE;
1044
1045 if (ptr[1] == CHAR_MINUS)
1046 {
1047 negated = TRUE;
1048 ptr++;
1049 }
1050 else negated = FALSE;
1051
1052 /* The integer range is limited by the machine's int representation. */
1053 s = 0;
1054 overflow = FALSE;
1055 while (IS_DIGIT(ptr[1]))
1056 {
1057 if (s > INT_MAX / 10 - 1) /* Integer overflow */
1058 {
1059 overflow = TRUE;
1060 break;
1061 }
1062 s = s * 10 + (int)(*(++ptr) - CHAR_0);
1063 }
1064 if (overflow) /* Integer overflow */
1065 {
1066 while (IS_DIGIT(ptr[1]))
1067 ptr++;
1068 *errorcodeptr = ERR61;
1069 break;
1070 }
1071
1072 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
1073 {
1074 *errorcodeptr = ERR57;
1075 break;
1076 }
1077
1078 if (s == 0)
1079 {
1080 *errorcodeptr = ERR58;
1081 break;
1082 }
1083
1084 if (negated)
1085 {
1086 if (s > bracount)
1087 {
1088 *errorcodeptr = ERR15;
1089 break;
1090 }
1091 s = bracount - (s - 1);
1092 }
1093
1094 escape = -s;
1095 break;
1096
1097 /* The handling of escape sequences consisting of a string of digits
1098 starting with one that is not zero is not straightforward. Perl has changed
1099 over the years. Nowadays \g{} for backreferences and \o{} for octal are
1100 recommended to avoid the ambiguities in the old syntax.
1101
1102 Outside a character class, the digits are read as a decimal number. If the
1103 number is less than 8 (used to be 10), or if there are that many previous
1104 extracting left brackets, then it is a back reference. Otherwise, up to
1105 three octal digits are read to form an escaped byte. Thus \123 is likely to
1106 be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1107 the octal value is greater than 377, the least significant 8 bits are
1108 taken. \8 and \9 are treated as the literal characters 8 and 9.
1109
1110 Inside a character class, \ followed by a digit is always either a literal
1111 8 or 9 or an octal number. */
1112
1113 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1114 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1115
1116 if (!isclass)
1117 {
1118 oldptr = ptr;
1119 /* The integer range is limited by the machine's int representation. */
1120 s = (int)(c -CHAR_0);
1121 overflow = FALSE;
1122 while (IS_DIGIT(ptr[1]))
1123 {
1124 if (s > INT_MAX / 10 - 1) /* Integer overflow */
1125 {
1126 overflow = TRUE;
1127 break;
1128 }
1129 s = s * 10 + (int)(*(++ptr) - CHAR_0);
1130 }
1131 if (overflow) /* Integer overflow */
1132 {
1133 while (IS_DIGIT(ptr[1]))
1134 ptr++;
1135 *errorcodeptr = ERR61;
1136 break;
1137 }
1138 if (s < 8 || s <= bracount) /* Check for back reference */
1139 {
1140 escape = -s;
1141 break;
1142 }
1143 ptr = oldptr; /* Put the pointer back and fall through */
1144 }
1145
1146 /* Handle a digit following \ when the number is not a back reference. If
1147 the first digit is 8 or 9, Perl used to generate a binary zero byte and
1148 then treat the digit as a following literal. At least by Perl 5.18 this
1149 changed so as not to insert the binary zero. */
1150
1151 if ((c = *ptr) >= CHAR_8) break;
1152
1153 /* Fall through with a digit less than 8 */
1154
1155 /* \0 always starts an octal number, but we may drop through to here with a
1156 larger first octal digit. The original code used just to take the least
1157 significant 8 bits of octal numbers (I think this is what early Perls used
1158 to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1159 but no more than 3 octal digits. */
1160
1161 case CHAR_0:
1162 c -= CHAR_0;
1163 while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1164 c = c * 8 + *(++ptr) - CHAR_0;
1165 #ifdef COMPILE_PCRE8
1166 if (!utf && c > 0xff) *errorcodeptr = ERR51;
1167 #endif
1168 break;
1169
1170 /* \o is a relatively new Perl feature, supporting a more general way of
1171 specifying character codes in octal. The only supported form is \o{ddd}. */
1172
1173 case CHAR_o:
1174 if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
1175 {
1176 ptr += 2;
1177 c = 0;
1178 overflow = FALSE;
1179 while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
1180 {
1181 register pcre_uint32 cc = *ptr++;
1182 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1183 #ifdef COMPILE_PCRE32
1184 if (c >= 0x20000000l) { overflow = TRUE; break; }
1185 #endif
1186 c = (c << 3) + cc - CHAR_0 ;
1187 #if defined COMPILE_PCRE8
1188 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1189 #elif defined COMPILE_PCRE16
1190 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1191 #elif defined COMPILE_PCRE32
1192 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1193 #endif
1194 }
1195 if (overflow)
1196 {
1197 while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1198 *errorcodeptr = ERR34;
1199 }
1200 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1201 {
1202 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1203 }
1204 else *errorcodeptr = ERR80;
1205 }
1206 break;
1207
1208 /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
1209 numbers. Otherwise it is a lowercase x letter. */
1210
1211 case CHAR_x:
1212 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1213 {
1214 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1215 && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1216 {
1217 c = 0;
1218 for (i = 0; i < 2; ++i)
1219 {
1220 register pcre_uint32 cc = *(++ptr);
1221 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1222 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1223 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1224 #else /* EBCDIC coding */
1225 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1226 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1227 #endif
1228 }
1229 }
1230 } /* End JavaScript handling */
1231
1232 /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1233 greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1234 digits. If not, { used to be treated as a data character. However, Perl
1235 seems to read hex digits up to the first non-such, and ignore the rest, so
1236 that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1237 now gives an error. */
1238
1239 else
1240 {
1241 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1242 {
1243 ptr += 2;
1244 c = 0;
1245 overflow = FALSE;
1246 while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
1247 {
1248 register pcre_uint32 cc = *ptr++;
1249 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1250
1251 #ifdef COMPILE_PCRE32
1252 if (c >= 0x10000000l) { overflow = TRUE; break; }
1253 #endif
1254
1255 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1256 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1257 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1258 #else /* EBCDIC coding */
1259 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1260 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1261 #endif
1262
1263 #if defined COMPILE_PCRE8
1264 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1265 #elif defined COMPILE_PCRE16
1266 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1267 #elif defined COMPILE_PCRE32
1268 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1269 #endif
1270 }
1271
1272 if (overflow)
1273 {
1274 while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
1275 *errorcodeptr = ERR34;
1276 }
1277
1278 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1279 {
1280 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1281 }
1282
1283 /* If the sequence of hex digits does not end with '}', give an error.
1284 We used just to recognize this construct and fall through to the normal
1285 \x handling, but nowadays Perl gives an error, which seems much more
1286 sensible, so we do too. */
1287
1288 else *errorcodeptr = ERR79;
1289 } /* End of \x{} processing */
1290
1291 /* Read a single-byte hex-defined char (up to two hex digits after \x) */
1292
1293 else
1294 {
1295 c = 0;
1296 while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1297 {
1298 pcre_uint32 cc; /* Some compilers don't like */
1299 cc = *(++ptr); /* ++ in initializers */
1300 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1301 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1302 c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1303 #else /* EBCDIC coding */
1304 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
1305 c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1306 #endif
1307 }
1308 } /* End of \xdd handling */
1309 } /* End of Perl-style \x handling */
1310 break;
1311
1312 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1313 An error is given if the byte following \c is not an ASCII character. This
1314 coding is ASCII-specific, but then the whole concept of \cx is
1315 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1316
1317 case CHAR_c:
1318 c = *(++ptr);
1319 if (c == CHAR_NULL)
1320 {
1321 *errorcodeptr = ERR2;
1322 break;
1323 }
1324 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1325 if (c > 127) /* Excludes all non-ASCII in either mode */
1326 {
1327 *errorcodeptr = ERR68;
1328 break;
1329 }
1330 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1331 c ^= 0x40;
1332 #else /* EBCDIC coding */
1333 if (c >= CHAR_a && c <= CHAR_z) c += 64;
1334 c ^= 0xC0;
1335 #endif
1336 break;
1337
1338 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1339 other alphanumeric following \ is an error if PCRE_EXTRA was set;
1340 otherwise, for Perl compatibility, it is a literal. This code looks a bit
1341 odd, but there used to be some cases other than the default, and there may
1342 be again in future, so I haven't "optimized" it. */
1343
1344 default:
1345 if ((options & PCRE_EXTRA) != 0) switch(c)
1346 {
1347 default:
1348 *errorcodeptr = ERR3;
1349 break;
1350 }
1351 break;
1352 }
1353 }
1354
1355 /* Perl supports \N{name} for character names, as well as plain \N for "not
1356 newline". PCRE does not support \N{name}. However, it does support
1357 quantification such as \N{2,3}. */
1358
1359 if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1360 !is_counted_repeat(ptr+2))
1361 *errorcodeptr = ERR37;
1362
1363 /* If PCRE_UCP is set, we change the values for \d etc. */
1364
1365 if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
1366 escape += (ESC_DU - ESC_D);
1367
1368 /* Set the pointer to the final character before returning. */
1369
1370 *ptrptr = ptr;
1371 *chptr = c;
1372 return escape;
1373 }
1374
1375
1376
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Read the property name that follows \P or \p and look it up. This is
compiled only when PCRE has Unicode property support. On entry, *ptrptr points
at the P or p; on exit it points at the final character of the escape
sequence.

The name is either a single character, or a string in curly brackets that may
start with a circumflex to request negation. A braced name must be closed by
'}' and must fit in the local buffer (31 characters plus a terminator);
otherwise, or if the pattern ends first, ERR46 is set. A name that is well
formed but is not in the table sets ERR47.

Argument:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  ptypeptr       points to an unsigned int that is set to the type value
  pdataptr       points to an unsigned int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         TRUE if the type value was found, or FALSE for an invalid type
*/

static BOOL
get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
  unsigned int *pdataptr, int *errorcodeptr)
{
  pcre_uchar c;
  int i, lo, hi;
  const pcre_uchar *ptr = *ptrptr;
  pcre_uchar name[32];

  c = *(++ptr);
  if (c == CHAR_NULL) goto ERROR_RETURN;

  *negptr = FALSE;

  if (c != CHAR_LEFT_CURLY_BRACKET) {
    /* The short form: the name is the single following character. */
    name[0] = c;
    name[1] = 0;
  }
  else {
    /* The braced form, optionally starting with ^ for negation. */
    if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT) {
      *negptr = TRUE;
      ptr++;
    }

    /* Copy up to the closing bracket, leaving room for the terminator. */
    for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++) {
      c = *(++ptr);
      if (c == CHAR_NULL) goto ERROR_RETURN;
      if (c == CHAR_RIGHT_CURLY_BRACKET) break;
      name[i] = c;
    }

    /* If the loop ran out of room, the last character read was not '}'. */
    if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
    name[i] = 0;
  }

  *ptrptr = ptr;

  /* Search for a recognized property name using binary chop */

  lo = 0;
  hi = PRIV(utt_size);

  while (lo < hi) {
    int mid = (lo + hi) >> 1;
    int r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[mid].name_offset);

    if (r == 0) {
      *ptypeptr = PRIV(utt)[mid].type;
      *pdataptr = PRIV(utt)[mid].value;
      return TRUE;
    }
    if (r > 0) lo = mid + 1; else hi = mid;
  }

  /* Well formed, but not a known property name. */

  *errorcodeptr = ERR47;
  *ptrptr = ptr;
  return FALSE;

  /* Malformed sequence. */

ERROR_RETURN:
  *errorcodeptr = ERR46;
  *ptrptr = ptr;
  return FALSE;
}
#endif
1471
1472
1473
1474 /*************************************************
1475 * Read repeat counts *
1476 *************************************************/
1477
1478 /* Read an item of the form {n,m} and return the values. This is called only
1479 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1480 so the syntax is guaranteed to be correct, but we need to check the values.
1481
1482 Arguments:
1483 p pointer to first char after '{'
1484 minp pointer to int for min
1485 maxp pointer to int for max
1486 returned as -1 if no max
1487 errorcodeptr points to error code variable
1488
1489 Returns: pointer to '}' on success;
1490 current ptr on error, with errorcodeptr set non-zero
1491 */
1492
1493 static const pcre_uchar *
1494 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1495 {
1496 int min = 0;
1497 int max = -1;
1498
1499 /* Read the minimum value and do a paranoid check: a negative value indicates
1500 an integer overflow. */
1501
1502 while (IS_DIGIT(*p)) min = min * 10 + (int)(*p++ - CHAR_0);
1503 if (min < 0 || min > 65535)
1504 {
1505 *errorcodeptr = ERR5;
1506 return p;
1507 }
1508
1509 /* Read the maximum value if there is one, and again do a paranoid on its size.
1510 Also, max must not be less than min. */
1511
1512 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1513 {
1514 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1515 {
1516 max = 0;
1517 while(IS_DIGIT(*p)) max = max * 10 + (int)(*p++ - CHAR_0);
1518 if (max < 0 || max > 65535)
1519 {
1520 *errorcodeptr = ERR5;
1521 return p;
1522 }
1523 if (max < min)
1524 {
1525 *errorcodeptr = ERR4;
1526 return p;
1527 }
1528 }
1529 }
1530
1531 /* Fill in the required variables, and pass back the pointer to the terminating
1532 '}'. */
1533
1534 *minp = min;
1535 *maxp = max;
1536 return p;
1537 }
1538
1539
1540
1541 /*************************************************
1542 * Find first significant op code *
1543 *************************************************/
1544
1545 /* This is called by several functions that scan a compiled expression looking
1546 for a fixed first character, or an anchoring op code etc. It skips over things
1547 that do not influence this. For some calls, it makes sense to skip negative
1548 forward and all backward assertions, and also the \b assertion; for others it
1549 does not.
1550
1551 Arguments:
1552 code pointer to the start of the group
1553 skipassert TRUE if certain assertions are to be skipped
1554
1555 Returns: pointer to the first significant opcode
1556 */
1557
1558 static const pcre_uchar*
1559 first_significant_code(const pcre_uchar *code, BOOL skipassert)
1560 {
1561 for (;;)
1562 {
1563 switch ((int)*code)
1564 {
1565 case OP_ASSERT_NOT:
1566 case OP_ASSERTBACK:
1567 case OP_ASSERTBACK_NOT:
1568 if (!skipassert) return code;
1569 do code += GET(code, 1); while (*code == OP_ALT);
1570 code += PRIV(OP_lengths)[*code];
1571 break;
1572
1573 case OP_WORD_BOUNDARY:
1574 case OP_NOT_WORD_BOUNDARY:
1575 if (!skipassert) return code;
1576 /* Fall through */
1577
1578 case OP_CALLOUT:
1579 case OP_CREF:
1580 case OP_DNCREF:
1581 case OP_RREF:
1582 case OP_DNRREF:
1583 case OP_DEF:
1584 code += PRIV(OP_lengths)[*code];
1585 break;
1586
1587 default:
1588 return code;
1589 }
1590 }
1591 /* Control never reaches here */
1592 }
1593
1594
1595
1596 /*************************************************
1597 * Find the fixed length of a branch *
1598 *************************************************/
1599
1600 /* Scan a branch and compute the fixed length of subject that will match it,
1601 if the length is fixed. This is needed for dealing with backward assertions.
1602 In UTF8 mode, the result is in characters rather than bytes. The branch is
1603 temporarily terminated with OP_END when this function is called.
1604
1605 This function is called when a backward assertion is encountered, so that if it
1606 fails, the error message can point to the correct place in the pattern.
1607 However, we cannot do this when the assertion contains subroutine calls,
1608 because they can be forward references. We solve this by remembering this case
1609 and doing the check at the end; a flag specifies which mode we are running in.
1610
1611 Arguments:
1612 code points to the start of the pattern (the bracket)
1613 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
1614 atend TRUE if called when the pattern is complete
1615 cd the "compile data" structure
1616
1617 Returns: the fixed length,
1618 or -1 if there is no fixed length,
1619 or -2 if \C was encountered (in UTF-8 mode only)
1620 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1621 or -4 if an unknown opcode was encountered (internal error)
1622 */
1623
1624 static int
1625 find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
1626 {
1627 int length = -1;
1628
1629 register int branchlength = 0;
1630 register pcre_uchar *cc = code + 1 + LINK_SIZE;
1631
1632 /* Scan along the opcodes for this branch. If we get to the end of the
1633 branch, check the length against that of the other branches. */
1634
1635 for (;;)
1636 {
1637 int d;
1638 pcre_uchar *ce, *cs;
1639 register pcre_uchar op = *cc;
1640
1641 switch (op)
1642 {
1643 /* We only need to continue for OP_CBRA (normal capturing bracket) and
1644 OP_BRA (normal non-capturing bracket) because the other variants of these
1645 opcodes are all concerned with unlimited repeated groups, which of course
1646 are not of fixed length. */
1647
1648 case OP_CBRA:
1649 case OP_BRA:
1650 case OP_ONCE:
1651 case OP_ONCE_NC:
1652 case OP_COND:
1653 d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
1654 if (d < 0) return d;
1655 branchlength += d;
1656 do cc += GET(cc, 1); while (*cc == OP_ALT);
1657 cc += 1 + LINK_SIZE;
1658 break;
1659
1660 /* Reached end of a branch; if it's a ket it is the end of a nested call.
1661 If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1662 an ALT. If it is END it's the end of the outer call. All can be handled by
1663 the same code. Note that we must not include the OP_KETRxxx opcodes here,
1664 because they all imply an unlimited repeat. */
1665
1666 case OP_ALT:
1667 case OP_KET:
1668 case OP_END:
1669 case OP_ACCEPT:
1670 case OP_ASSERT_ACCEPT:
1671 if (length < 0) length = branchlength;
1672 else if (length != branchlength) return -1;
1673 if (*cc != OP_ALT) return length;
1674 cc += 1 + LINK_SIZE;
1675 branchlength = 0;
1676 break;
1677
1678 /* A true recursion implies not fixed length, but a subroutine call may
1679 be OK. If the subroutine is a forward reference, we can't deal with
1680 it until the end of the pattern, so return -3. */
1681
1682 case OP_RECURSE:
1683 if (!atend) return -3;
1684 cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1685 do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1686 if (cc > cs && cc < ce) return -1; /* Recursion */
1687 d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
1688 if (d < 0) return d;
1689 branchlength += d;
1690 cc += 1 + LINK_SIZE;
1691 break;
1692
1693 /* Skip over assertive subpatterns */
1694
1695 case OP_ASSERT:
1696 case OP_ASSERT_NOT:
1697 case OP_ASSERTBACK:
1698 case OP_ASSERTBACK_NOT:
1699 do cc += GET(cc, 1); while (*cc == OP_ALT);
1700 cc += PRIV(OP_lengths)[*cc];
1701 break;
1702
1703 /* Skip over things that don't match chars */
1704
1705 case OP_MARK:
1706 case OP_PRUNE_ARG:
1707 case OP_SKIP_ARG:
1708 case OP_THEN_ARG:
1709 cc += cc[1] + PRIV(OP_lengths)[*cc];
1710 break;
1711
1712 case OP_CALLOUT:
1713 case OP_CIRC:
1714 case OP_CIRCM:
1715 case OP_CLOSE:
1716 case OP_COMMIT:
1717 case OP_CREF:
1718 case OP_DEF:
1719 case OP_DNCREF:
1720 case OP_DNRREF:
1721 case OP_DOLL:
1722 case OP_DOLLM:
1723 case OP_EOD:
1724 case OP_EODN:
1725 case OP_FAIL:
1726 case OP_NOT_WORD_BOUNDARY:
1727 case OP_PRUNE:
1728 case OP_REVERSE:
1729 case OP_RREF:
1730 case OP_SET_SOM:
1731 case OP_SKIP:
1732 case OP_SOD:
1733 case OP_SOM:
1734 case OP_THEN:
1735 case OP_WORD_BOUNDARY:
1736 cc += PRIV(OP_lengths)[*cc];
1737 break;
1738
1739 /* Handle literal characters */
1740
1741 case OP_CHAR:
1742 case OP_CHARI:
1743 case OP_NOT:
1744 case OP_NOTI:
1745 branchlength++;
1746 cc += 2;
1747 #ifdef SUPPORT_UTF
1748 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1749 #endif
1750 break;
1751
1752 /* Handle exact repetitions. The count is already in characters, but we
1753 need to skip over a multibyte character in UTF8 mode. */
1754
1755 case OP_EXACT:
1756 case OP_EXACTI:
1757 case OP_NOTEXACT:
1758 case OP_NOTEXACTI:
1759 branchlength += (int)GET2(cc,1);
1760 cc += 2 + IMM2_SIZE;
1761 #ifdef SUPPORT_UTF
1762 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1763 #endif
1764 break;
1765
1766 case OP_TYPEEXACT:
1767 branchlength += GET2(cc,1);
1768 if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1769 cc += 2;
1770 cc += 1 + IMM2_SIZE + 1;
1771 break;
1772
1773 /* Handle single-char matchers */
1774
1775 case OP_PROP:
1776 case OP_NOTPROP:
1777 cc += 2;
1778 /* Fall through */
1779
1780 case OP_HSPACE:
1781 case OP_VSPACE:
1782 case OP_NOT_HSPACE:
1783 case OP_NOT_VSPACE:
1784 case OP_NOT_DIGIT:
1785 case OP_DIGIT:
1786 case OP_NOT_WHITESPACE:
1787 case OP_WHITESPACE:
1788 case OP_NOT_WORDCHAR:
1789 case OP_WORDCHAR:
1790 case OP_ANY:
1791 case OP_ALLANY:
1792 branchlength++;
1793 cc++;
1794 break;
1795
1796 /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1797 otherwise \C is coded as OP_ALLANY. */
1798
1799 case OP_ANYBYTE:
1800 return -2;
1801
1802 /* Check a class for variable quantification */
1803
1804 case OP_CLASS:
1805 case OP_NCLASS:
1806 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1807 case OP_XCLASS:
1808 /* The original code caused an unsigned overflow in 64 bit systems,
1809 so now we use a conditional statement. */
1810 if (op == OP_XCLASS)
1811 cc += GET(cc, 1);
1812 else
1813 cc += PRIV(OP_lengths)[OP_CLASS];
1814 #else
1815 cc += PRIV(OP_lengths)[OP_CLASS];
1816 #endif
1817
1818 switch (*cc)
1819 {
1820 case OP_CRPLUS:
1821 case OP_CRMINPLUS:
1822 case OP_CRSTAR:
1823 case OP_CRMINSTAR:
1824 case OP_CRQUERY:
1825 case OP_CRMINQUERY:
1826 return -1;
1827
1828 case OP_CRRANGE:
1829 case OP_CRMINRANGE:
1830 if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1831 branchlength += (int)GET2(cc,1);
1832 cc += 1 + 2 * IMM2_SIZE;
1833 break;
1834
1835 default:
1836 branchlength++;
1837 }
1838 break;
1839
1840 /* Anything else is variable length */
1841
1842 case OP_ANYNL:
1843 case OP_BRAMINZERO:
1844 case OP_BRAPOS:
1845 case OP_BRAPOSZERO:
1846 case OP_BRAZERO:
1847 case OP_CBRAPOS:
1848 case OP_EXTUNI:
1849 case OP_KETRMAX:
1850 case OP_KETRMIN:
1851 case OP_KETRPOS:
1852 case OP_MINPLUS:
1853 case OP_MINPLUSI:
1854 case OP_MINQUERY:
1855 case OP_MINQUERYI:
1856 case OP_MINSTAR:
1857 case OP_MINSTARI:
1858 case OP_MINUPTO:
1859 case OP_MINUPTOI:
1860 case OP_NOTMINPLUS:
1861 case OP_NOTMINPLUSI:
1862 case OP_NOTMINQUERY:
1863 case OP_NOTMINQUERYI:
1864 case OP_NOTMINSTAR:
1865 case OP_NOTMINSTARI:
1866 case OP_NOTMINUPTO:
1867 case OP_NOTMINUPTOI:
1868 case OP_NOTPLUS:
1869 case OP_NOTPLUSI:
1870 case OP_NOTPOSPLUS:
1871 case OP_NOTPOSPLUSI:
1872 case OP_NOTPOSQUERY:
1873 case OP_NOTPOSQUERYI:
1874 case OP_NOTPOSSTAR:
1875 case OP_NOTPOSSTARI:
1876 case OP_NOTPOSUPTO:
1877 case OP_NOTPOSUPTOI:
1878 case OP_NOTQUERY:
1879 case OP_NOTQUERYI:
1880 case OP_NOTSTAR:
1881 case OP_NOTSTARI:
1882 case OP_NOTUPTO:
1883 case OP_NOTUPTOI:
1884 case OP_PLUS:
1885 case OP_PLUSI:
1886 case OP_POSPLUS:
1887 case OP_POSPLUSI:
1888 case OP_POSQUERY:
1889 case OP_POSQUERYI:
1890 case OP_POSSTAR:
1891 case OP_POSSTARI:
1892 case OP_POSUPTO:
1893 case OP_POSUPTOI:
1894 case OP_QUERY:
1895 case OP_QUERYI:
1896 case OP_REF:
1897 case OP_REFI:
1898 case OP_DNREF:
1899 case OP_DNREFI:
1900 case OP_SBRA:
1901 case OP_SBRAPOS:
1902 case OP_SCBRA:
1903 case OP_SCBRAPOS:
1904 case OP_SCOND:
1905 case OP_SKIPZERO:
1906 case OP_STAR:
1907 case OP_STARI:
1908 case OP_TYPEMINPLUS:
1909 case OP_TYPEMINQUERY:
1910 case OP_TYPEMINSTAR:
1911 case OP_TYPEMINUPTO:
1912 case OP_TYPEPLUS:
1913 case OP_TYPEPOSPLUS:
1914 case OP_TYPEPOSQUERY:
1915 case OP_TYPEPOSSTAR:
1916 case OP_TYPEPOSUPTO:
1917 case OP_TYPEQUERY:
1918 case OP_TYPESTAR:
1919 case OP_TYPEUPTO:
1920 case OP_UPTO:
1921 case OP_UPTOI:
1922 return -1;
1923
1924 /* Catch unrecognized opcodes so that when new ones are added they
1925 are not forgotten, as has happened in the past. */
1926
1927 default:
1928 return -4;
1929 }
1930 }
1931 /* Control never gets here */
1932 }
1933
1934
1935
1936 /*************************************************
1937 * Scan compiled regex for specific bracket *
1938 *************************************************/
1939
1940 /* This little function scans through a compiled pattern until it finds a
1941 capturing bracket with the given number, or, if the number is negative, an
1942 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1943 so that it can be called from pcre_study() when finding the minimum matching
1944 length.
1945
1946 Arguments:
1947 code points to start of expression
1948 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
1949 number the required bracket number or negative to find a lookbehind
1950
1951 Returns: pointer to the opcode for the bracket, or NULL if not found
1952 */
1953
1954 const pcre_uchar *
1955 PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
1956 {
1957 for (;;)
1958 {
1959 register pcre_uchar c = *code;
1960
1961 if (c == OP_END) return NULL;
1962
1963 /* XCLASS is used for classes that cannot be represented just by a bit
1964 map. This includes negated single high-valued characters. The length in
1965 the table is zero; the actual length is stored in the compiled code. */
1966
1967 if (c == OP_XCLASS) code += GET(code, 1);
1968
1969 /* Handle recursion */
1970
1971 else if (c == OP_REVERSE)
1972 {
1973 if (number < 0) return (pcre_uchar *)code;
1974 code += PRIV(OP_lengths)[c];
1975 }
1976
1977 /* Handle capturing bracket */
1978
1979 else if (c == OP_CBRA || c == OP_SCBRA ||
1980 c == OP_CBRAPOS || c == OP_SCBRAPOS)
1981 {
1982 int n = (int)GET2(code, 1+LINK_SIZE);
1983 if (n == number) return (pcre_uchar *)code;
1984 code += PRIV(OP_lengths)[c];
1985 }
1986
1987 /* Otherwise, we can get the item's length from the table, except that for
1988 repeated character types, we have to test for \p and \P, which have an extra
1989 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1990 must add in its length. */
1991
1992 else
1993 {
1994 switch(c)
1995 {
1996 case OP_TYPESTAR:
1997 case OP_TYPEMINSTAR:
1998 case OP_TYPEPLUS:
1999 case OP_TYPEMINPLUS:
2000 case OP_TYPEQUERY:
2001 case OP_TYPEMINQUERY:
2002 case OP_TYPEPOSSTAR:
2003 case OP_TYPEPOSPLUS:
2004 case OP_TYPEPOSQUERY:
2005 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2006 break;
2007
2008 case OP_TYPEUPTO:
2009 case OP_TYPEMINUPTO:
2010 case OP_TYPEEXACT:
2011 case OP_TYPEPOSUPTO:
2012 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2013 code += 2;
2014 break;
2015
2016 case OP_MARK:
2017 case OP_PRUNE_ARG:
2018 case OP_SKIP_ARG:
2019 case OP_THEN_ARG:
2020 code += code[1];
2021 break;
2022 }
2023
2024 /* Add in the fixed length from the table */
2025
2026 code += PRIV(OP_lengths)[c];
2027
2028 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2029 a multi-byte character. The length in the table is a minimum, so we have to
2030 arrange to skip the extra bytes. */
2031
2032 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2033 if (utf) switch(c)
2034 {
2035 case OP_CHAR:
2036 case OP_CHARI:
2037 case OP_EXACT:
2038 case OP_EXACTI:
2039 case OP_UPTO:
2040 case OP_UPTOI:
2041 case OP_MINUPTO:
2042 case OP_MINUPTOI:
2043 case OP_POSUPTO:
2044 case OP_POSUPTOI:
2045 case OP_STAR:
2046 case OP_STARI:
2047 case OP_MINSTAR:
2048 case OP_MINSTARI:
2049 case OP_POSSTAR:
2050 case OP_POSSTARI:
2051 case OP_PLUS:
2052 case OP_PLUSI:
2053 case OP_MINPLUS:
2054 case OP_MINPLUSI:
2055 case OP_POSPLUS:
2056 case OP_POSPLUSI:
2057 case OP_QUERY:
2058 case OP_QUERYI:
2059 case OP_MINQUERY:
2060 case OP_MINQUERYI:
2061 case OP_POSQUERY:
2062 case OP_POSQUERYI:
2063 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2064 break;
2065 }
2066 #else
2067 (void)(utf); /* Keep compiler happy by referencing function argument */
2068 #endif
2069 }
2070 }
2071 }
2072
2073
2074
2075 /*************************************************
2076 * Scan compiled regex for recursion reference *
2077 *************************************************/
2078
2079 /* This little function scans through a compiled pattern until it finds an
2080 instance of OP_RECURSE.
2081
2082 Arguments:
2083 code points to start of expression
2084 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2085
2086 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
2087 */
2088
2089 static const pcre_uchar *
2090 find_recurse(const pcre_uchar *code, BOOL utf)
2091 {
2092 for (;;)
2093 {
2094 register pcre_uchar c = *code;
2095 if (c == OP_END) return NULL;
2096 if (c == OP_RECURSE) return code;
2097
2098 /* XCLASS is used for classes that cannot be represented just by a bit
2099 map. This includes negated single high-valued characters. The length in
2100 the table is zero; the actual length is stored in the compiled code. */
2101
2102 if (c == OP_XCLASS) code += GET(code, 1);
2103
2104 /* Otherwise, we can get the item's length from the table, except that for
2105 repeated character types, we have to test for \p and \P, which have an extra
2106 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2107 must add in its length. */
2108
2109 else
2110 {
2111 switch(c)
2112 {
2113 case OP_TYPESTAR:
2114 case OP_TYPEMINSTAR:
2115 case OP_TYPEPLUS:
2116 case OP_TYPEMINPLUS:
2117 case OP_TYPEQUERY:
2118 case OP_TYPEMINQUERY:
2119 case OP_TYPEPOSSTAR:
2120 case OP_TYPEPOSPLUS:
2121 case OP_TYPEPOSQUERY:
2122 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2123 break;
2124
2125 case OP_TYPEPOSUPTO:
2126 case OP_TYPEUPTO:
2127 case OP_TYPEMINUPTO:
2128 case OP_TYPEEXACT:
2129 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2130 code += 2;
2131 break;
2132
2133 case OP_MARK:
2134 case OP_PRUNE_ARG:
2135 case OP_SKIP_ARG:
2136 case OP_THEN_ARG:
2137 code += code[1];
2138 break;
2139 }
2140
2141 /* Add in the fixed length from the table */
2142
2143 code += PRIV(OP_lengths)[c];
2144
2145 /* In UTF-8 mode, opcodes that are followed by a character may be followed
2146 by a multi-byte character. The length in the table is a minimum, so we have
2147 to arrange to skip the extra bytes. */
2148
2149 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2150 if (utf) switch(c)
2151 {
2152 case OP_CHAR:
2153 case OP_CHARI:
2154 case OP_NOT:
2155 case OP_NOTI:
2156 case OP_EXACT:
2157 case OP_EXACTI:
2158 case OP_NOTEXACT:
2159 case OP_NOTEXACTI:
2160 case OP_UPTO:
2161 case OP_UPTOI:
2162 case OP_NOTUPTO:
2163 case OP_NOTUPTOI:
2164 case OP_MINUPTO:
2165 case OP_MINUPTOI:
2166 case OP_NOTMINUPTO:
2167 case OP_NOTMINUPTOI:
2168 case OP_POSUPTO:
2169 case OP_POSUPTOI:
2170 case OP_NOTPOSUPTO:
2171 case OP_NOTPOSUPTOI:
2172 case OP_STAR:
2173 case OP_STARI:
2174 case OP_NOTSTAR:
2175 case OP_NOTSTARI:
2176 case OP_MINSTAR:
2177 case OP_MINSTARI:
2178 case OP_NOTMINSTAR:
2179 case OP_NOTMINSTARI:
2180 case OP_POSSTAR:
2181 case OP_POSSTARI:
2182 case OP_NOTPOSSTAR:
2183 case OP_NOTPOSSTARI:
2184 case OP_PLUS:
2185 case OP_PLUSI:
2186 case OP_NOTPLUS:
2187 case OP_NOTPLUSI:
2188 case OP_MINPLUS:
2189 case OP_MINPLUSI:
2190 case OP_NOTMINPLUS:
2191 case OP_NOTMINPLUSI:
2192 case OP_POSPLUS:
2193 case OP_POSPLUSI:
2194 case OP_NOTPOSPLUS:
2195 case OP_NOTPOSPLUSI:
2196 case OP_QUERY:
2197 case OP_QUERYI:
2198 case OP_NOTQUERY:
2199 case OP_NOTQUERYI:
2200 case OP_MINQUERY:
2201 case OP_MINQUERYI:
2202 case OP_NOTMINQUERY:
2203 case OP_NOTMINQUERYI:
2204 case OP_POSQUERY:
2205 case OP_POSQUERYI:
2206 case OP_NOTPOSQUERY:
2207 case OP_NOTPOSQUERYI:
2208 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2209 break;
2210 }
2211 #else
2212 (void)(utf); /* Keep compiler happy by referencing function argument */
2213 #endif
2214 }
2215 }
2216 }
2217
2218
2219
2220 /*************************************************
2221 * Scan compiled branch for non-emptiness *
2222 *************************************************/
2223
2224 /* This function scans through a branch of a compiled pattern to see whether it
2225 can match the empty string or not. It is called from could_be_empty()
2226 below and from compile_branch() when checking for an unlimited repeat of a
2227 group that can match nothing. Note that first_significant_code() skips over
2228 backward and negative forward assertions when its final argument is TRUE. If we
2229 hit an unclosed bracket, we return "empty" - this means we've struck an inner
2230 bracket whose current branch will already have been scanned.
2231
2232 Arguments:
2233 code points to start of search
2234 endcode points to where to stop
2235 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2236 cd contains pointers to tables etc.
2237 recurses chain of recurse_check to catch mutual recursion
2238
2239 Returns: TRUE if what is matched could be empty
2240 */
2241
/* One link in a chain of subroutine groups that are currently being scanned;
could_be_empty_branch() builds the chain on the stack and walks it to detect
mutual recursion. */
2242 typedef struct recurse_check {
2243 struct recurse_check *prev; /* Link added by the enclosing scan, or NULL */
2244 const pcre_uchar *group; /* Start of the referenced group being scanned */
2245 } recurse_check;
2246
2247 static BOOL
2248 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2249 BOOL utf, compile_data *cd, recurse_check *recurses)
2250 {
2251 register pcre_uchar c;
2252 recurse_check this_recurse;
2253
2254 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2255 code < endcode;
2256 code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2257 {
2258 const pcre_uchar *ccode;
2259
2260 c = *code;
2261
2262 /* Skip over forward assertions; the other assertions are skipped by
2263 first_significant_code() with a TRUE final argument. */
2264
2265 if (c == OP_ASSERT)
2266 {
2267 do code += GET(code, 1); while (*code == OP_ALT);
2268 c = *code;
2269 continue;
2270 }
2271
2272 /* For a recursion/subroutine call, if its end has been reached, which
2273 implies a backward reference subroutine call, we can scan it. If it's a
2274 forward reference subroutine call, we can't. To detect forward reference
2275 we have to scan up the list that is kept in the workspace. This function is
2276 called only when doing the real compile, not during the pre-compile that
2277 measures the size of the compiled pattern. */
2278
2279 if (c == OP_RECURSE)
2280 {
2281 const pcre_uchar *scode = cd->start_code + GET(code, 1);
2282 BOOL empty_branch;
2283
2284 /* Test for forward reference or uncompleted reference. This is disabled
2285 when called to scan a completed pattern by setting cd->start_workspace to
2286 NULL. */
2287
2288 if (cd->start_workspace != NULL)
2289 {
2290 const pcre_uchar *tcode;
2291 for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2292 if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2293 if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
2294 }
2295
2296 /* If we are scanning a completed pattern, there are no forward references
2297 and all groups are complete. We need to detect whether this is a recursive
2298 call, as otherwise there will be an infinite loop. If it is a recursion,
2299 just skip over it. Simple recursions are easily detected. For mutual
2300 recursions we keep a chain on the stack. */
2301
2302 else
2303 {
2304 recurse_check *r = recurses;
2305 const pcre_uchar *endgroup = scode;
2306
2307 do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2308 if (code >= scode && code <= endgroup) continue; /* Simple recursion */
2309
2310 for (r = recurses; r != NULL; r = r->prev)
2311 if (r->group == scode) break;
2312 if (r != NULL) continue; /* Mutual recursion */
2313 }
2314
2315 /* Completed reference; scan the referenced group, remembering it on the
2316 stack chain to detect mutual recursions. */
2317
2318 empty_branch = FALSE;
2319 this_recurse.prev = recurses;
2320 this_recurse.group = scode;
2321
2322 do
2323 {
2324 if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
2325 {
2326 empty_branch = TRUE;
2327 break;
2328 }
2329 scode += GET(scode, 1);
2330 }
2331 while (*scode == OP_ALT);
2332
2333 if (!empty_branch) return FALSE; /* All branches are non-empty */
2334 continue;
2335 }
2336
2337 /* Groups with zero repeats can of course be empty; skip them. */
2338
2339 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2340 c == OP_BRAPOSZERO)
2341 {
2342 code += PRIV(OP_lengths)[c];
2343 do code += GET(code, 1); while (*code == OP_ALT);
2344 c = *code;
2345 continue;
2346 }
2347
2348 /* A nested group that is already marked as "could be empty" can just be
2349 skipped. */
2350
2351 if (c == OP_SBRA || c == OP_SBRAPOS ||
2352 c == OP_SCBRA || c == OP_SCBRAPOS)
2353 {
2354 do code += GET(code, 1); while (*code == OP_ALT);
2355 c = *code;
2356 continue;
2357 }
2358
2359 /* For other groups, scan the branches. */
2360
2361 if (c == OP_BRA || c == OP_BRAPOS ||
2362 c == OP_CBRA || c == OP_CBRAPOS ||
2363 c == OP_ONCE || c == OP_ONCE_NC ||
2364 c == OP_COND)
2365 {
2366 BOOL empty_branch;
2367 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
2368
2369 /* If a conditional group has only one branch, there is a second, implied,
2370 empty branch, so just skip over the conditional, because it could be empty.
2371 Otherwise, scan the individual branches of the group. */
2372
2373 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2374 code += GET(code, 1);
2375 else
2376 {
2377 empty_branch = FALSE;
2378 do
2379 {
2380 if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd, NULL))
2381 empty_branch = TRUE;
2382 code += GET(code, 1);
2383 }
2384 while (*code == OP_ALT);
2385 if (!empty_branch) return FALSE; /* All branches are non-empty */
2386 }
2387
2388 c = *code;
2389 continue;
2390 }
2391
2392 /* Handle the other opcodes */
2393
2394 switch (c)
2395 {
2396 /* Check for quantifiers after a class. XCLASS is used for classes that
2397 cannot be represented just by a bit map. This includes negated single
2398 high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2399 actual length is stored in the compiled code, so we must update "code"
2400 here. */
2401
2402 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2403 case OP_XCLASS:
2404 ccode = code += GET(code, 1);
2405 goto CHECK_CLASS_REPEAT;
2406 #endif
2407
2408 case OP_CLASS:
2409 case OP_NCLASS:
2410 ccode = code + PRIV(OP_lengths)[OP_CLASS];
2411
2412 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2413 CHECK_CLASS_REPEAT:
2414 #endif
2415
2416 switch (*ccode)
2417 {
2418 case OP_CRSTAR: /* These could be empty; continue */
2419 case OP_CRMINSTAR:
2420 case OP_CRQUERY:
2421 case OP_CRMINQUERY:
2422 break;
2423
2424 default: /* Non-repeat => class must match */
2425 case OP_CRPLUS: /* These repeats aren't empty */
2426 case OP_CRMINPLUS:
2427 return FALSE;
2428
2429 case OP_CRRANGE:
2430 case OP_CRMINRANGE:
2431 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2432 break;
2433 }
2434 break;
2435
2436 /* Opcodes that must match a character */
2437
2438 case OP_ANY:
2439 case OP_ALLANY:
2440 case OP_ANYBYTE:
2441
2442 case OP_PROP:
2443 case OP_NOTPROP:
2444 case OP_ANYNL:
2445
2446 case OP_NOT_HSPACE:
2447 case OP_HSPACE:
2448 case OP_NOT_VSPACE:
2449 case OP_VSPACE:
2450 case OP_EXTUNI:
2451
2452 case OP_NOT_DIGIT:
2453 case OP_DIGIT:
2454 case OP_NOT_WHITESPACE:
2455 case OP_WHITESPACE:
2456 case OP_NOT_WORDCHAR:
2457 case OP_WORDCHAR:
2458
2459 case OP_CHAR:
2460 case OP_CHARI:
2461 case OP_NOT:
2462 case OP_NOTI:
2463
2464 case OP_PLUS:
2465 case OP_PLUSI:
2466 case OP_MINPLUS:
2467 case OP_MINPLUSI:
2468
2469 case OP_NOTPLUS:
2470 case OP_NOTPLUSI:
2471 case OP_NOTMINPLUS:
2472 case OP_NOTMINPLUSI:
2473
2474 case OP_POSPLUS:
2475 case OP_POSPLUSI:
2476 case OP_NOTPOSPLUS:
2477 case OP_NOTPOSPLUSI:
2478
2479 case OP_EXACT:
2480 case OP_EXACTI:
2481 case OP_NOTEXACT:
2482 case OP_NOTEXACTI:
2483
2484 case OP_TYPEPLUS:
2485 case OP_TYPEMINPLUS:
2486 case OP_TYPEPOSPLUS:
2487 case OP_TYPEEXACT:
2488
2489 return FALSE;
2490
2491 /* These are going to continue, as they may be empty, but we have to
2492 fudge the length for the \p and \P cases. */
2493
2494 case OP_TYPESTAR:
2495 case OP_TYPEMINSTAR:
2496 case OP_TYPEPOSSTAR:
2497 case OP_TYPEQUERY:
2498 case OP_TYPEMINQUERY:
2499 case OP_TYPEPOSQUERY:
2500 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2501 break;
2502
2503 /* Same for these */
2504
2505 case OP_TYPEUPTO:
2506 case OP_TYPEMINUPTO:
2507 case OP_TYPEPOSUPTO:
2508 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2509 code += 2;
2510 break;
2511
2512 /* End of branch */
2513
2514 case OP_KET:
2515 case OP_KETRMAX:
2516 case OP_KETRMIN:
2517 case OP_KETRPOS:
2518 case OP_ALT:
2519 return TRUE;
2520
2521 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2522 MINUPTO, and POSUPTO and their caseless and negative versions may be
2523 followed by a multibyte character. */
2524
2525 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2526 case OP_STAR:
2527 case OP_STARI:
2528 case OP_NOTSTAR:
2529 case OP_NOTSTARI:
2530
2531 case OP_MINSTAR:
2532 case OP_MINSTARI:
2533 case OP_NOTMINSTAR:
2534 case OP_NOTMINSTARI:
2535
2536 case OP_POSSTAR:
2537 case OP_POSSTARI:
2538 case OP_NOTPOSSTAR:
2539 case OP_NOTPOSSTARI:
2540
2541 case OP_QUERY:
2542 case OP_QUERYI:
2543 case OP_NOTQUERY:
2544 case OP_NOTQUERYI:
2545
2546 case OP_MINQUERY:
2547 case OP_MINQUERYI:
2548 case OP_NOTMINQUERY:
2549 case OP_NOTMINQUERYI:
2550
2551 case OP_POSQUERY:
2552 case OP_POSQUERYI:
2553 case OP_NOTPOSQUERY:
2554 case OP_NOTPOSQUERYI:
2555
2556 if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2557 break;
2558
2559 case OP_UPTO:
2560 case OP_UPTOI:
2561 case OP_NOTUPTO:
2562 case OP_NOTUPTOI:
2563
2564 case OP_MINUPTO:
2565 case OP_MINUPTOI:
2566 case OP_NOTMINUPTO:
2567 case OP_NOTMINUPTOI:
2568
2569 case OP_POSUPTO:
2570 case OP_POSUPTOI:
2571 case OP_NOTPOSUPTO:
2572 case OP_NOTPOSUPTOI:
2573
2574 if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2575 break;
2576 #endif
2577
2578 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2579 string. */
2580
2581 case OP_MARK:
2582 case OP_PRUNE_ARG:
2583 case OP_SKIP_ARG:
2584 case OP_THEN_ARG:
2585 code += code[1];
2586 break;
2587
2588 /* None of the remaining opcodes are required to match a character. */
2589
2590 default:
2591 break;
2592 }
2593 }
2594
2595 return TRUE;
2596 }
2597
2598
2599
2600 /*************************************************
2601 * Scan compiled regex for non-emptiness *
2602 *************************************************/
2603
2604 /* This function is called to check for left recursive calls. We want to check
2605 the current branch of the current pattern to see if it could match the empty
2606 string. If it could, we must look outwards for branches at other levels,
2607 stopping when we pass beyond the bracket which is the subject of the recursion.
2608 This function is called only during the real compile, not during the
2609 pre-compile.
2610
2611 Arguments:
2612 code points to start of the recursion
2613 endcode points to where to stop (current RECURSE item)
2614 bcptr points to the chain of current (unclosed) branch starts
2615 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2616 cd pointers to tables etc
2617
2618 Returns: TRUE if what is matched could be empty
2619 */
2620
2621 static BOOL
2622 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2623 branch_chain *bcptr, BOOL utf, compile_data *cd)
2624 {
2625 while (bcptr != NULL && bcptr->current_branch >= code)
2626 {
2627 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL))
2628 return FALSE;
2629 bcptr = bcptr->outer;
2630 }
2631 return TRUE;
2632 }
2633
2634
2635
2636 /*************************************************
2637 * Base opcode of repeated opcodes *
2638 *************************************************/
2639
2640 /* Returns the base opcode for repeated single character type opcodes. If the
2641 opcode is not a repeated character type, it returns with the original value.
2642
2643 Arguments: c opcode
2644 Returns: base opcode for the type
2645 */
2646
2647 static pcre_uchar
2648 get_repeat_base(pcre_uchar c)
2649 {
2650 return (c > OP_TYPEPOSUPTO)? c :
2651 (c >= OP_TYPESTAR)? OP_TYPESTAR :
2652 (c >= OP_NOTSTARI)? OP_NOTSTARI :
2653 (c >= OP_NOTSTAR)? OP_NOTSTAR :
2654 (c >= OP_STARI)? OP_STARI :
2655 OP_STAR;
2656 }
2657
2658
2659
2660 #ifdef SUPPORT_UCP
2661 /*************************************************
2662 * Check a character and a property *
2663 *************************************************/
2664
2665 /* This function is called by check_auto_possessive() when a property item
2666 is adjacent to a fixed character.
2667
2668 Arguments:
2669 c the character
2670 ptype the property type
2671 pdata the data for the type
2672 negated TRUE if it's a negated property (\P or \p{^)
2673
2674 Returns: TRUE if auto-possessifying is OK
2675 */
2676
2677 static BOOL
2678 check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
2679 BOOL negated)
2680 {
2681 const pcre_uint32 *p;
2682 const ucd_record *prop = GET_UCD(c);
2683
2684 switch(ptype)
2685 {
2686 case PT_LAMP:
2687 return (prop->chartype == ucp_Lu ||
2688 prop->chartype == ucp_Ll ||
2689 prop->chartype == ucp_Lt) == negated;
2690
2691 case PT_GC:
2692 return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2693
2694 case PT_PC:
2695 return (pdata == prop->chartype) == negated;
2696
2697 case PT_SC:
2698 return (pdata == prop->script) == negated;
2699
2700 /* These are specials */
2701
2702 case PT_ALNUM:
2703 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2704 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2705
2706 /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
2707 means that Perl space and POSIX space are now identical. PCRE was changed
2708 at release 8.34. */
2709
2710 case PT_SPACE: /* Perl space */
2711 case PT_PXSPACE: /* POSIX space */
2712 return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2713 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2714 c == CHAR_FF || c == CHAR_CR)
2715 == negated;
2716
2717 case PT_WORD:
2718 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2719 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2720 c == CHAR_UNDERSCORE) == negated;
2721
2722 case PT_CLIST:
2723 p = PRIV(ucd_caseless_sets) + prop->caseset;
2724 for (;;)
2725 {
2726 if (c < *p) return !negated;
2727 if (c == *p++) return negated;
2728 }
2729 break; /* Control never reaches here */
2730 }
2731
2732 return FALSE;
2733 }
2734 #endif /* SUPPORT_UCP */
2735
2736
2737
2738 /*************************************************
2739 * Fill the character property list *
2740 *************************************************/
2741
2742 /* Checks whether the code points to an opcode that can take part in auto-
2743 possessification, and if so, fills a list with its properties.
2744
2745 Arguments:
2746 code points to start of expression
2747 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2748 fcc points to case-flipping table
2749 list points to output list
2750 list[0] will be filled with the opcode
2751 list[1] will be non-zero if this opcode
2752 can match an empty character string
2753 list[2..7] depends on the opcode
2754
2755 Returns: points to the start of the next opcode if *code is accepted
2756 NULL if *code is not accepted
2757 */
2758
2759 static const pcre_uchar *
2760 get_chr_property_list(const pcre_uchar *code, BOOL utf,
2761 const pcre_uint8 *fcc, pcre_uint32 *list)
2762 {
2763 pcre_uchar c = *code;
2764 const pcre_uchar *end;
2765 const pcre_uint32 *clist_src;
2766 pcre_uint32 *clist_dest;
2767 pcre_uint32 chr;
2768 pcre_uchar base;
2769
2770 list[0] = c;
2771 list[1] = FALSE;
2772 code++;
2773
2774 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
2775 {
2776 base = get_repeat_base(c);
2777 c -= (base - OP_STAR);
2778
2779 if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
2780 code += IMM2_SIZE;
2781
2782 list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
2783
2784 switch(base)
2785 {
2786 case OP_STAR:
2787 list[0] = OP_CHAR;
2788 break;
2789
2790 case OP_STARI:
2791 list[0] = OP_CHARI;
2792 break;
2793
2794 case OP_NOTSTAR:
2795 list[0] = OP_NOT;
2796 break;
2797
2798 case OP_NOTSTARI:
2799 list[0] = OP_NOTI;
2800 break;
2801
2802 case OP_TYPESTAR:
2803 list[0] = *code;
2804 code++;
2805 break;
2806 }
2807 c = list[0];
2808 }
2809
2810 switch(c)
2811 {
2812 case OP_NOT_DIGIT:
2813 case OP_DIGIT:
2814 case OP_NOT_WHITESPACE:
2815 case OP_WHITESPACE:
2816 case OP_NOT_WORDCHAR:
2817 case OP_WORDCHAR:
2818 case OP_ANY:
2819 case OP_ALLANY:
2820 case OP_ANYNL:
2821 case OP_NOT_HSPACE:
2822 case OP_HSPACE:
2823 case OP_NOT_VSPACE:
2824 case OP_VSPACE:
2825 case OP_EXTUNI:
2826 case OP_EODN:
2827 case OP_EOD:
2828 case OP_DOLL:
2829 case OP_DOLLM:
2830 return code;
2831
2832 case OP_CHAR:
2833 case OP_NOT:
2834 GETCHARINCTEST(chr, code);
2835 list[2] = chr;
2836 list[3] = NOTACHAR;
2837 return code;
2838
2839 case OP_CHARI:
2840 case OP_NOTI:
2841 list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
2842 GETCHARINCTEST(chr, code);
2843 list[2] = chr;
2844
2845 #ifdef SUPPORT_UCP
2846 if (chr < 128 || (chr < 256 && !utf))
2847 list[3] = fcc[chr];
2848 else
2849 list[3] = UCD_OTHERCASE(chr);
2850 #elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
2851 list[3] = (chr < 256) ? fcc[chr] : chr;
2852 #else
2853 list[3] = fcc[chr];
2854 #endif
2855
2856 /* The othercase might be the same value. */
2857
2858 if (chr == list[3])
2859 list[3] = NOTACHAR;
2860 else
2861 list[4] = NOTACHAR;
2862 return code;
2863
2864 #ifdef SUPPORT_UCP
2865 case OP_PROP:
2866 case OP_NOTPROP:
2867 if (code[0] != PT_CLIST)
2868 {
2869 list[2] = code[0];
2870 list[3] = code[1];
2871 return code + 2;
2872 }
2873
2874 /* Convert only if we have anough space. */
2875
2876 clist_src = PRIV(ucd_caseless_sets) + code[1];
2877 clist_dest = list + 2;
2878 code += 2;
2879
2880 do {
2881 /* Early return if there is not enough space. */
2882 if (clist_dest >= list + 8)
2883 {
2884 list[2] = code[0];
2885 list[3] = code[1];
2886 return code;
2887 }
2888 *clist_dest++ = *clist_src;
2889 }
2890 while(*clist_src++ != NOTACHAR);
2891
2892 /* Enough space to store all characters. */
2893
2894 list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
2895 return code;
2896 #endif
2897
2898 case OP_NCLASS:
2899 case OP_CLASS:
2900 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2901 case OP_XCLASS:
2902
2903 if (c == OP_XCLASS)
2904 end = code + GET(code, 0);
2905 else
2906 #endif
2907 end = code + 32 / sizeof(pcre_uchar);
2908
2909 switch(*end)
2910 {
2911 case OP_CRSTAR:
2912 case OP_CRMINSTAR:
2913 case OP_CRQUERY:
2914 case OP_CRMINQUERY:
2915 list[1] = TRUE;
2916 end++;
2917 break;
2918
2919 case OP_CRRANGE:
2920 case OP_CRMINRANGE:
2921 list[1] = (GET2(end, 1) == 0);
2922 end += 1 + 2 * IMM2_SIZE;
2923 break;
2924 }
2925 list[2] = end - code;
2926 return end;
2927 }
2928 return NULL; /* Opcode not accepted */
2929 }
2930
2931
2932
2933 /*************************************************
2934 * Scan further character sets for match *
2935 *************************************************/
2936
2937 /* Checks whether the base and the current opcode have a common character, in
2938 which case the base cannot be possessified.
2939
2940 Arguments:
2941 code points to the byte code
2942 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2943 cd static compile data
2944 base_list the data list of the base opcode
2945
2946 Returns: TRUE if the auto-possessification is possible
2947 */
2948
2949 static BOOL
2950 compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
2951 const pcre_uint32* base_list)
2952 {
2953 pcre_uchar c;
2954 pcre_uint32 list[8];
2955 const pcre_uint32* chr_ptr;
2956 const pcre_uint32* ochr_ptr;
2957 const pcre_uint32* list_ptr;
2958 pcre_uint32 chr;
2959
2960 for(;;)
2961 {
2962 c = *code;
2963
2964 /* Skip over callouts */
2965
2966 if (c == OP_CALLOUT)
2967 {
2968 code += PRIV(OP_lengths)[c];
2969 continue;
2970 }
2971
2972 if (c == OP_ALT)
2973 {
2974 do code += GET(code, 1); while (*code == OP_ALT);
2975 c = *code;
2976 }
2977
2978 switch(c)
2979 {
2980 case OP_END:
2981 /* TRUE only in greedy case. The non-greedy case could be replaced by an
2982 OP_EXACT, but it is probably not worth it. (And note that OP_EXACT uses
2983 more memory, which we cannot get at this stage.) */
2984
2985 return base_list[1] != 0;
2986
2987 case OP_KET:
2988 /* If the bracket is capturing, and referenced by an OP_RECURSE, the
2989 non-greedy case cannot be converted to a possessive form. We do not test
2990 the bracket type at the moment, but we might do it in the future to improve
2991 this condition. (But note that recursive calls are always atomic.) */
2992
2993 if (base_list[1] == 0) return FALSE;
2994 code += PRIV(OP_lengths)[c];
2995 continue;
2996 }
2997
2998 /* Check for a supported opcode, and load its properties. */
2999
3000 code = get_chr_property_list(code, utf, cd->fcc, list);
3001 if (code == NULL) return FALSE; /* Unsupported */
3002
3003 /* If either opcode is a small character list, set pointers for comparing
3004 characters from that list with another list, or with a property. */
3005
3006 if (base_list[0] == OP_CHAR)
3007 {
3008 chr_ptr = base_list + 2;
3009 list_ptr = list;
3010 }
3011 else if (list[0] == OP_CHAR)
3012 {
3013 chr_ptr = list + 2;
3014 list_ptr = base_list;
3015 }
3016
3017 /* Some property combinations also acceptable. Unicode property opcodes are
3018 processed specially; the rest can be handled with a lookup table. */
3019
3020 else
3021 {
3022 pcre_uint32 leftop, rightop;
3023
3024 if (list[1] != 0) return FALSE; /* Must match at least one character */
3025 leftop = base_list[0];
3026 rightop = list[0];
3027
3028 #ifdef SUPPORT_UCP
3029 if (leftop == OP_PROP || leftop == OP_NOTPROP)
3030 {
3031 if (rightop == OP_EOD) return TRUE;
3032 if (rightop == OP_PROP || rightop == OP_NOTPROP)
3033 {
3034 int n;
3035 const pcre_uint8 *p;
3036 BOOL same = leftop == rightop;
3037 BOOL lisprop = leftop == OP_PROP;
3038 BOOL risprop = rightop == OP_PROP;
3039 BOOL bothprop = lisprop && risprop;
3040
3041 /* There's a table that specifies how each combination is to be
3042 processed:
3043 0 Always return FALSE (never auto-possessify)
3044 1 Character groups are distinct (possessify if both are OP_PROP)
3045 2 Check character categories in the same group (general or particular)
3046 3 Return TRUE if the two opcodes are not the same
3047 ... see comments below
3048 */
3049
3050 n = propposstab[base_list[2]][list[2]];
3051 switch(n)
3052 {
3053 case 0: return FALSE;
3054 case 1: return bothprop;
3055 case 2: return (base_list[3] == list[3]) != same;
3056 case 3: return !same;
3057
3058 case 4: /* Left general category, right particular category */
3059 return risprop && catposstab[base_list[3]][list[3]] == same;
3060
3061 case 5: /* Right general category, left particular category */
3062 return lisprop && catposstab[list[3]][base_list[3]] == same;
3063
3064 /* This code is logically tricky. Think hard before fiddling with it.
3065 The posspropstab table has four entries per row. Each row relates to
3066 one of PCRE's special properties such as ALNUM or SPACE or WORD.
3067 Only WORD actually needs all four entries, but using repeats for the
3068 others means they can all use the same code below.
3069
3070 The first two entries in each row are Unicode general categories, and
3071 apply always, because all the characters they include are part of the
3072 PCRE character set. The third and fourth entries are a general and a
3073 particular category, respectively, that include one or more relevant
3074 characters. One or the other is used, depending on whether the check
3075 is for a general or a particular category. However, in both cases the
3076 category contains more characters than the specials that are defined
3077 for the property being tested against. Therefore, it cannot be used
3078 in a NOTPROP case.
3079
3080 Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3081 Underscore is covered by ucp_P or ucp_Po. */
3082
3083 case 6: /* Left alphanum vs right general category */
3084 case 7: /* Left space vs right general category */
3085 case 8: /* Left word vs right general category */
3086 p = posspropstab[n-6];
3087 return risprop && lisprop ==
3088 (list[3] != p[0] &&
3089 list[3] != p[1] &&
3090 (list[3] != p[2] || !lisprop));
3091
3092 case 9: /* Right alphanum vs left general category */
3093 case 10: /* Right space vs left general category */
3094 case 11: /* Right word vs left general category */
3095 p = posspropstab[n-9];
3096 return lisprop && risprop ==
3097 (base_list[3] != p[0] &&
3098 base_list[3] != p[1] &&
3099 (base_list[3] != p[2] || !risprop));
3100
3101 case 12: /* Left alphanum vs right particular category */
3102 case 13: /* Left space vs right particular category */
3103 case 14: /* Left word vs right particular category */
3104 p = posspropstab[n-12];
3105 return risprop && lisprop ==
3106 (catposstab[p[0]][list[3]] &&
3107 catposstab[p[1]][list[3]] &&
3108 (list[3] != p[3] || !lisprop));
3109
3110 case 15: /* Right alphanum vs left particular category */
3111 case 16: /* Right space vs left particular category */
3112 case 17: /* Right word vs left particular category */
3113 p = posspropstab[n-15];
3114 return lisprop && risprop ==
3115 (catposstab[p[0]][base_list[3]] &&
3116 catposstab[p[1]][base_list[3]] &&
3117 (base_list[3] != p[3] || !risprop));
3118 }
3119 }
3120 return FALSE;
3121 }
3122
3123 else
3124 #endif /* SUPPORT_UCP */
3125
3126 return leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3127 rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3128 autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3129 }
3130
3131 /* Control reaches here only if one of the items is a small character list.
3132 All characters are checked against the other side. */
3133
3134 do
3135 {
3136 chr = *chr_ptr;
3137
3138 switch(list_ptr[0])
3139 {
3140 case OP_CHAR:
3141 ochr_ptr = list_ptr + 2;
3142 do
3143 {
3144 if (chr == *ochr_ptr) return FALSE;
3145 ochr_ptr++;
3146 }
3147 while(*ochr_ptr != NOTACHAR);
3148 break;
3149
3150 case OP_NOT:
3151 ochr_ptr = list_ptr + 2;
3152 do
3153 {
3154 if (chr == *ochr_ptr)
3155 break;
3156 ochr_ptr++;
3157 }
3158 while(*ochr_ptr != NOTACHAR);
3159 if (*ochr_ptr == NOTACHAR) return FALSE; /* Not found */
3160 break;
3161
3162 /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
3163 set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3164
3165 case OP_DIGIT:
3166 if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
3167 break;
3168
3169 case OP_NOT_DIGIT:
3170 if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
3171 break;
3172
3173 case OP_WHITESPACE:
3174 if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
3175 break;
3176
3177 case OP_NOT_WHITESPACE:
3178 if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
3179 break;
3180
3181 case OP_WORDCHAR:
3182 if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
3183 break;
3184
3185 case OP_NOT_WORDCHAR:
3186 if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
3187 break;
3188
3189 case OP_HSPACE:
3190 switch(chr)
3191 {
3192 HSPACE_CASES: return FALSE;
3193 default: break;
3194 }
3195 break;
3196
3197 case OP_NOT_HSPACE:
3198 switch(chr)
3199 {
3200 HSPACE_CASES: break;
3201 default: return FALSE;
3202 }
3203 break;
3204
3205 case OP_ANYNL:
3206 case OP_VSPACE:
3207 switch(chr)
3208 {
3209 VSPACE_CASES: return FALSE;
3210 default: break;
3211 }
3212 break;
3213
3214 case OP_NOT_VSPACE:
3215 switch(chr)
3216 {
3217 VSPACE_CASES: break;
3218 default: return FALSE;
3219 }
3220 break;
3221
3222 case OP_DOLL:
3223 case OP_EODN:
3224 switch (chr)
3225 {
3226 case CHAR_CR:
3227 case CHAR_LF:
3228 case CHAR_VT:
3229 case CHAR_FF:
3230 case CHAR_NEL:
3231 #ifndef EBCDIC
3232 case 0x2028:
3233 case 0x2029:
3234 #endif /* Not EBCDIC */
3235 return FALSE;
3236 }
3237 break;
3238
3239 case OP_EOD: /* Can always possessify before \z */
3240 break;
3241
3242 case OP_PROP:
3243 case OP_NOTPROP:
3244 if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3245 list_ptr[0] == OP_NOTPROP))
3246 return FALSE;
3247 break;
3248
3249 /* The class comparisons work only when the class is the second item
3250 of the pair, because there are at present no possessive forms of the
3251 class opcodes. Note also that the "code" variable that is used below
3252 points after the second item, and that the pointer for the first item
3253 is not available, so even if there were possessive forms of the class
3254 opcodes, the correct comparison could not be done. */
3255
3256 case OP_NCLASS:
3257 if (chr > 255) return FALSE;
3258 /* Fall through */
3259
3260 case OP_CLASS:
3261 if (list_ptr != list) return FALSE; /* Class is first opcode */
3262 if (chr > 255) break;
3263 if ((((pcre_uint8 *)(code - list_ptr[2] + 1))[chr >> 3] & (1 << (chr & 7))) != 0)
3264 return FALSE;
3265 break;
3266
3267 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3268 case OP_XCLASS:
3269 if (list_ptr != list) return FALSE; /* Class is first opcode */
3270 if (PRIV(xclass)(chr, code - list_ptr[2] + 1 + LINK_SIZE, utf))
3271 return FALSE;
3272 break;
3273 #endif
3274
3275 default:
3276 return FALSE;
3277 }
3278
3279 chr_ptr++;
3280 }
3281 while(*chr_ptr != NOTACHAR);
3282
3283 /* At least one character must be matched from this opcode. */
3284
3285 if (list[1] == 0) return TRUE;
3286 }
3287
3288 return FALSE;
3289 }
3290
3291
3292
3293 /*************************************************
3294 * Scan compiled regex for auto-possession *
3295 *************************************************/
3296
3297 /* Replaces single character iterations with their possessive alternatives
3298 if appropriate. This function modifies the compiled opcode!
3299
3300 Arguments:
3301 code points to start of the byte code
3302 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3303 cd static compile data
3304
3305 Returns: nothing
3306 */
3307
3308 static void
3309 auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
3310 {
3311 register pcre_uchar c;
3312 const pcre_uchar *end;
3313 pcre_uint32 list[8];
3314
3315 for (;;)
3316 {
3317 c = *code;
3318
3319 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
3320 {
3321 c -= get_repeat_base(c) - OP_STAR;
3322 end = (c <= OP_MINUPTO) ?
3323 get_chr_property_list(code, utf, cd->fcc, list) : NULL;
3324 list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3325
3326 if (end != NULL && compare_opcodes(end, utf, cd, list))
3327 {
3328 switch(c)
3329 {
3330 case OP_STAR:
3331 *code += OP_POSSTAR - OP_STAR;
3332 break;
3333
3334 case OP_MINSTAR:
3335 *code += OP_POSSTAR - OP_MINSTAR;
3336 break;
3337
3338 case OP_PLUS:
3339 *code += OP_POSPLUS - OP_PLUS;
3340 break;
3341
3342 case OP_MINPLUS:
3343 *code += OP_POSPLUS - OP_MINPLUS;
3344 break;
3345
3346 case OP_QUERY:
3347 *code += OP_POSQUERY - OP_QUERY;
3348 break;
3349
3350 case OP_MINQUERY:
3351 *code += OP_POSQUERY - OP_MINQUERY;
3352 break;
3353
3354 case OP_UPTO:
3355 *code += OP_POSUPTO - OP_UPTO;
3356 break;
3357
3358 case OP_MINUPTO:
3359 *code += OP_MINUPTO - OP_UPTO;
3360 break;
3361 }
3362 }
3363 c = *code;
3364 }
3365
3366 switch(c)
3367 {
3368 case OP_END:
3369 return;
3370
3371 case OP_TYPESTAR:
3372 case OP_TYPEMINSTAR:
3373 case OP_TYPEPLUS:
3374 case OP_TYPEMINPLUS:
3375 case OP_TYPEQUERY:
3376 case OP_TYPEMINQUERY:
3377 case OP_TYPEPOSSTAR:
3378 case OP_TYPEPOSPLUS:
3379 case OP_TYPEPOSQUERY:
3380 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
3381 break;
3382
3383 case OP_TYPEUPTO:
3384 case OP_TYPEMINUPTO:
3385 case OP_TYPEEXACT:
3386 case OP_TYPEPOSUPTO:
3387 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
3388 code += 2;
3389 break;
3390
3391 case OP_XCLASS:
3392 code += GET(code, 1);
3393 break;
3394
3395 case OP_MARK:
3396 case OP_PRUNE_ARG:
3397 case OP_SKIP_ARG:
3398 case OP_THEN_ARG:
3399 code += code[1];
3400 break;
3401 }
3402
3403 /* Add in the fixed length from the table */
3404
3405 code += PRIV(OP_lengths)[c];
3406
3407 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
3408 a multi-byte character. The length in the table is a minimum, so we have to
3409 arrange to skip the extra bytes. */
3410
3411 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
3412 if (utf) switch(c)
3413 {
3414 case OP_CHAR:
3415 case OP_CHARI:
3416 case OP_NOT:
3417 case OP_NOTI:
3418 case OP_STAR:
3419 case OP_MINSTAR:
3420 case OP_PLUS:
3421 case OP_MINPLUS:
3422 case OP_QUERY:
3423 case OP_MINQUERY:
3424 case OP_UPTO:
3425 case OP_MINUPTO:
3426 case OP_EXACT:
3427 case OP_POSSTAR:
3428 case OP_POSPLUS:
3429 case OP_POSQUERY:
3430 case OP_POSUPTO:
3431 case OP_STARI:
3432 case OP_MINSTARI:
3433 case OP_PLUSI:
3434 case OP_MINPLUSI:
3435 case OP_QUERYI:
3436 case OP_MINQUERYI:
3437 case OP_UPTOI:
3438 case OP_MINUPTOI:
3439 case OP_EXACTI:
3440 case OP_POSSTARI:
3441 case OP_POSPLUSI:
3442 case OP_POSQUERYI:
3443 case OP_POSUPTOI:
3444 case OP_NOTSTAR:
3445 case OP_NOTMINSTAR:
3446 case OP_NOTPLUS:
3447 case OP_NOTMINPLUS:
3448 case OP_NOTQUERY:
3449 case OP_NOTMINQUERY:
3450 case OP_NOTUPTO:
3451 case OP_NOTMINUPTO:
3452 case OP_NOTEXACT:
3453 case OP_NOTPOSSTAR:
3454 case OP_NOTPOSPLUS:
3455 case OP_NOTPOSQUERY:
3456 case OP_NOTPOSUPTO:
3457 case OP_NOTSTARI:
3458 case OP_NOTMINSTARI:
3459 case OP_NOTPLUSI:
3460 case OP_NOTMINPLUSI:
3461 case OP_NOTQUERYI:
3462 case OP_NOTMINQUERYI:
3463 case OP_NOTUPTOI:
3464 case OP_NOTMINUPTOI:
3465 case OP_NOTEXACTI:
3466 case OP_NOTPOSSTARI:
3467 case OP_NOTPOSPLUSI:
3468 case OP_NOTPOSQUERYI:
3469 case OP_NOTPOSUPTOI:
3470 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
3471 break;
3472 }
3473 #else
3474 (void)(utf); /* Keep compiler happy by referencing function argument */
3475 #endif
3476 }
3477 }
3478
3479
3480
3481 /*************************************************
3482 * Check for POSIX class syntax *
3483 *************************************************/
3484
3485 /* This function is called when the sequence "[:" or "[." or "[=" is
3486 encountered in a character class. It checks whether this is followed by a
3487 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
3488 reach an unescaped ']' without the special preceding character, return FALSE.
3489
3490 Originally, this function only recognized a sequence of letters between the
3491 terminators, but it seems that Perl recognizes any sequence of characters,
3492 though of course unknown POSIX names are subsequently rejected. Perl gives an
3493 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
3494 didn't consider this to be a POSIX class. Likewise for [:1234:].
3495
3496 The problem in trying to be exactly like Perl is in the handling of escapes. We
3497 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
3498 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
3499 below handles the special case of \], but does not try to do any other escape
3500 processing. This makes it different from Perl for cases such as [:l\ower:]
3501 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
3502 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
3503 I think.
3504
3505 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
3506 It seems that the appearance of a nested POSIX class supersedes an apparent
3507 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
3508 a digit.
3509
3510 In Perl, unescaped square brackets may also appear as part of class names. For
3511 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
3512 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
3513 seem right at all. PCRE does not allow closing square brackets in POSIX class
3514 names.
3515
3516 Arguments:
3517 ptr pointer to the initial [
3518 endptr where to return the end pointer
3519
3520 Returns: TRUE or FALSE
3521 */
3522
3523 static BOOL
3524 check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
3525 {
3526 pcre_uchar terminator; /* Don't combine these lines; the Solaris cc */
3527 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
3528 for (++ptr; *ptr != CHAR_NULL; ptr++)
3529 {
3530 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3531 ptr++;
3532 else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
3533 else
3534 {
3535 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3536 {
3537 *endptr = ptr;
3538 return TRUE;
3539 }
3540 if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
3541 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3542 ptr[1] == CHAR_EQUALS_SIGN) &&
3543 check_posix_syntax(ptr, endptr))
3544 return FALSE;
3545 }
3546 }
3547 return FALSE;
3548 }
3549
3550
3551
3552
3553 /*************************************************
3554 * Check POSIX class name *
3555 *************************************************/
3556
3557 /* This function is called to check the name given in a POSIX-style class entry
3558 such as [:alnum:].
3559
3560 Arguments:
3561 ptr points to the first letter
3562 len the length of the name
3563
3564 Returns: a value representing the name, or -1 if unknown
3565 */
3566
3567 static int
3568 check_posix_name(const pcre_uchar *ptr, int len)
3569 {
3570 const char *pn = posix_names;
3571 register int yield = 0;
3572 while (posix_name_lengths[yield] != 0)
3573 {
3574 if (len == posix_name_lengths[yield] &&
3575 STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
3576 pn += posix_name_lengths[yield] + 1;
3577 yield++;
3578 }
3579 return -1;
3580 }
3581
3582
3583 /*************************************************
3584 * Adjust OP_RECURSE items in repeated group *
3585 *************************************************/
3586
3587 /* OP_RECURSE items contain an offset from the start of the regex to the group
3588 that is referenced. This means that groups can be replicated for fixed
3589 repetition simply by copying (because the recursion is allowed to refer to
3590 earlier groups that are outside the current group). However, when a group is
3591 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
3592 inserted before it, after it has been compiled. This means that any OP_RECURSE
3593 items within it that refer to the group itself or any contained groups have to
3594 have their offsets adjusted. That is one of the jobs of this function. Before it
3595 is called, the partially compiled regex must be temporarily terminated with
3596 OP_END.
3597
3598 This function has been extended with the possibility of forward references for
3599 recursions and subroutine calls. It must also check the list of such references
3600 for the group we are dealing with. If it finds that one of the recursions in
3601 the current group is on this list, it adjusts the offset in the list, not the
3602 value in the reference (which is a group number).
3603
3604 Arguments:
3605 group points to the start of the group
3606 adjust the amount by which the group is to be moved
3607 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3608 cd contains pointers to tables etc.
3609 save_hwm the hwm forward reference pointer at the start of the group
3610
3611 Returns: nothing
3612 */
3613
3614 static void
3615 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
3616 pcre_uchar *save_hwm)
3617 {
3618 pcre_uchar *ptr = group;
3619
3620 while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
3621 {
3622 int offset;
3623 pcre_uchar *hc;
3624
3625 /* See if this recursion is on the forward reference list. If so, adjust the
3626 reference. */
3627
3628 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
3629 {
3630 offset = (int)GET(hc, 0);
3631 if (cd->start_code + offset == ptr + 1)
3632 {
3633 PUT(hc, 0, offset + adjust);
3634 break;
3635 }
3636 }
3637
3638 /* Otherwise, adjust the recursion offset if it's after the start of this
3639 group. */
3640
3641 if (hc >= cd->hwm)
3642 {
3643 offset = (int)GET(ptr, 1);
3644 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
3645 }
3646
3647 ptr += 1 + LINK_SIZE;
3648 }
3649 }
3650
3651
3652
3653 /*************************************************
3654 * Insert an automatic callout point *
3655 *************************************************/
3656
3657 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
3658 callout points before each pattern item.
3659
3660 Arguments:
3661 code current code pointer
3662 ptr current pattern pointer
3663 cd pointers to tables etc
3664
3665 Returns: new code pointer
3666 */
3667
3668 static pcre_uchar *
3669 auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
3670 {
3671 *code++ = OP_CALLOUT;
3672 *code++ = 255;
3673 PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
3674 PUT(code, LINK_SIZE, 0); /* Default length */
3675 return code + 2 * LINK_SIZE;
3676 }
3677
3678
3679
3680 /*************************************************
3681 * Complete a callout item *
3682 *************************************************/
3683
3684 /* A callout item contains the length of the next item in the pattern, which
3685 we can't fill in till after we have reached the relevant point. This is used
3686 for both automatic and manual callouts.
3687
3688 Arguments:
3689 previous_callout points to previous callout item
3690 ptr current pattern pointer
3691 cd pointers to tables etc
3692
3693 Returns: nothing
3694 */
3695
3696 static void
3697 complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
3698 {
3699 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
3700 PUT(previous_callout, 2 + LINK_SIZE, length);
3701 }
3702
3703
3704
3705 #ifdef SUPPORT_UCP
3706 /*************************************************
3707 * Get othercase range *
3708 *************************************************/
3709
3710 /* This function is passed the start and end of a class range, in UTF-8 mode
3711 with UCP support. It searches up the characters, looking for ranges of
3712 characters in the "other" case. Each call returns the next one, updating the
3713 start address. A character with multiple other cases is returned on its own
3714 with a special return value.
3715
3716 Arguments:
3717 cptr points to starting character value; updated
3718 d end value
3719 ocptr where to put start of othercase range
3720 odptr where to put end of othercase range
3721
3722 Yield: -1 when no more
3723 0 when a range is returned
3724 >0 the CASESET offset for char with multiple other cases
3725 in this case, ocptr contains the original
3726 */
3727
3728 static int
3729 get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
3730 pcre_uint32 *odptr)
3731 {
3732 pcre_uint32 c, othercase, next;
3733 unsigned int co;
3734
3735 /* Find the first character that has an other case. If it has multiple other
3736 cases, return its case offset value. */
3737
3738 for (c = *cptr; c <= d; c++)
3739 {
3740 if ((co = UCD_CASESET(c)) != 0)
3741 {
3742 *ocptr = c++; /* Character that has the set */
3743 *cptr = c; /* Rest of input range */
3744 return (int)co;
3745 }
3746 if ((othercase = UCD_OTHERCASE(c)) != c) break;
3747 }
3748
3749 if (c > d) return -1; /* Reached end of range */
3750
3751 *ocptr = othercase;
3752 next = othercase + 1;
3753
3754 for (++c; c <= d; c++)
3755 {
3756 if (UCD_OTHERCASE(c) != next) break;
3757 next++;
3758 }
3759
3760 *odptr = next - 1; /* End of othercase range */
3761 *cptr = c; /* Rest of input range */
3762 return 0;
3763 }
3764 #endif /* SUPPORT_UCP */
3765
3766
3767
3768 /*************************************************
3769 * Add a character or range to a class *
3770 *************************************************/
3771
3772 /* This function packages up the logic of adding a character or range of
3773 characters to a class. The character values in the arguments will be within the
3774 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
3775 mutually recursive with the function immediately below.
3776
3777 Arguments:
3778 classbits the bit map for characters < 256
3779 uchardptr points to the pointer for extra data
3780 options the options word
3781 cd contains pointers to tables etc.
3782 start start of range character
3783 end end of range character
3784
3785 Returns: the number of < 256 characters added
3786 the pointer to extra data is updated
3787 */
3788
3789 static int
3790 add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
3791 compile_data *cd, pcre_uint32 start, pcre_uint32 end)
3792 {
3793 pcre_uint32 c;
3794 int n8 = 0;
3795
3796 /* If caseless matching is required, scan the range and process alternate
3797 cases. In Unicode, there are 8-bit characters that have alternate cases that
3798 are greater than 255 and vice-versa. Sometimes we can just extend the original
3799 range. */
3800
3801 if ((options & PCRE_CASELESS) != 0)
3802 {
3803 #ifdef SUPPORT_UCP
3804 if ((options & PCRE_UTF8) != 0)
3805 {
3806 int rc;
3807 pcre_uint32 oc, od;
3808
3809 options &= ~PCRE_CASELESS; /* Remove for recursive calls */
3810 c = start;
3811
3812 while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
3813 {
3814 /* Handle a single character that has more than one other case. */
3815
3816 if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
3817 PRIV(ucd_caseless_sets) + rc, oc);
3818
3819 /* Do nothing if the other case range is within the original range. */
3820
3821 else if (oc >= start && od <= end) continue;
3822
3823 /* Extend the original range if there is overlap, noting that if oc < c, we
3824 can't have od > end because a subrange is always shorter than the basic
3825 range. Otherwise, use a recursive call to add the additional range. */
3826
3827 else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
3828 else if (od > end && oc <= end + 1) end = od; /* Extend upwards */
3829 else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
3830 }
3831 }
3832 else
3833 #endif /* SUPPORT_UCP */
3834
3835 /* Not UTF-mode, or no UCP */
3836
3837 for (c = start; c <= end && c < 256; c++)
3838 {
3839 SETBIT(classbits, cd->fcc[c]);
3840 n8++;
3841 }
3842 }
3843
3844 /* Now handle the original range. Adjust the final value according to the bit
3845 length - this means that the same lists of (e.g.) horizontal spaces can be used
3846 in all cases. */
3847
3848 #if defined COMPILE_PCRE8
3849 #ifdef SUPPORT_UTF
3850 if ((options & PCRE_UTF8) == 0)
3851 #endif
3852 if (end > 0xff) end = 0xff;
3853
3854 #elif defined COMPILE_PCRE16
3855 #ifdef SUPPORT_UTF
3856 if ((options & PCRE_UTF16) == 0)
3857 #endif
3858 if (end > 0xffff) end = 0xffff;
3859
3860 #endif /* COMPILE_PCRE[8|16] */
3861
3862 /* If all characters are less than 256, use the bit map. Otherwise use extra
3863 data. */
3864
3865 if (end < 0x100)
3866 {
3867 for (c = start; c <= end; c++)
3868 {
3869 n8++;
3870 SETBIT(classbits, c);
3871 }
3872 }
3873
3874 else
3875 {
3876 pcre_uchar *uchardata = *uchardptr;
3877
3878 #ifdef SUPPORT_UTF
3879 if ((options & PCRE_UTF8) != 0) /* All UTFs use the same flag bit */
3880 {
3881 if (start < end)
3882 {
3883 *uchardata++ = XCL_RANGE;
3884 uchardata += PRIV(ord2utf)(start, uchardata);
3885 uchardata += PRIV(ord2utf)(end, uchardata);
3886 }
3887 else if (start == end)
3888 {
3889 *uchardata++ = XCL_SINGLE;
3890 uchardata += PRIV(ord2utf)(start, uchardata);
3891 }
3892 }
3893 else
3894 #endif /* SUPPORT_UTF */
3895
3896 /* Without UTF support, character values are constrained by the bit length,
3897 and can only be > 256 for 16-bit and 32-bit libraries. */
3898
3899 #ifdef COMPILE_PCRE8
3900 {}
3901 #else
3902 if (start < end)
3903 {
3904 *uchardata++ = XCL_RANGE;
3905 *uchardata++ = start;
3906 *uchardata++ = end;
3907 }
3908 else if (start == end)
3909 {
3910 *uchardata++ = XCL_SINGLE;
3911 *uchardata++ = start;
3912 }
3913 #endif
3914
3915 *uchardptr = uchardata; /* Updata extra data pointer */
3916 }
3917
3918 return n8; /* Number of 8-bit characters */
3919 }
3920
3921
3922
3923
3924 /*************************************************
3925 * Add a list of characters to a class *
3926 *************************************************/
3927
3928 /* This function is used for adding a list of case-equivalent characters to a
3929 class, and also for adding a list of horizontal or vertical whitespace. If the
3930 list is in order (which it should be), ranges of characters are detected and
3931 handled appropriately. This function is mutually recursive with the function
3932 above.
3933
3934 Arguments:
3935 classbits the bit map for characters < 256
3936 uchardptr points to the pointer for extra data
3937 options the options word
3938 cd contains pointers to tables etc.
3939 p points to row of 32-bit values, terminated by NOTACHAR
3940 except character to omit; this is used when adding lists of
3941 case-equivalent characters to avoid including the one we
3942 already know about
3943
3944 Returns: the number of < 256 characters added
3945 the pointer to extra data is updated
3946 */
3947
3948 static int
3949 add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
3950 compile_data *cd, const pcre_uint32 *p, unsigned int except)
3951 {
3952 int n8 = 0;
3953 while (p[0] < NOTACHAR)
3954 {
3955 int n = 0;
3956 if (p[0] != except)
3957 {
3958 while(p[n+1] == p[0] + n + 1) n++;
3959 n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
3960 }
3961 p += n + 1;
3962 }
3963 return n8;
3964 }
3965
3966
3967
3968 /*************************************************
3969 * Add characters not in a list to a class *
3970 *************************************************/
3971
3972 /* This function is used for adding the complement of a list of horizontal or
3973 vertical whitespace to a class. The list must be in order.
3974
3975 Arguments:
3976 classbits the bit map for characters < 256
3977 uchardptr points to the pointer for extra data
3978 options the options word
3979 cd contains pointers to tables etc.
3980 p points to row of 32-bit values, terminated by NOTACHAR
3981
3982 Returns: the number of < 256 characters added
3983 the pointer to extra data is updated
3984 */
3985
3986 static int
3987 add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
3988 int options, compile_data *cd, const pcre_uint32 *p)
3989 {
3990 BOOL utf = (options & PCRE_UTF8) != 0;
3991 int n8 = 0;
3992 if (p[0] > 0)
3993 n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
3994 while (p[0] < NOTACHAR)
3995 {
3996 while (p[1] == p[0] + 1) p++;
3997 n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
3998 (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
3999 p++;
4000 }
4001 return n8;
4002 }
4003
4004
4005
4006 /*************************************************
4007 * Compile one branch *
4008 *************************************************/
4009
4010 /* Scan the pattern, compiling it into a vector. If the options are
4011 changed during the branch, the pointer is used to change the external options
4012 bits. This function is used during the pre-compile phase when we are trying
4013 to find out the amount of memory needed, as well as during the real compile
4014 phase. The value of lengthptr distinguishes the two phases.
4015
4016 Arguments:
4017 optionsptr pointer to the option bits
4018 codeptr points to the pointer to the current code point
4019 ptrptr points to the current pattern pointer
4020 errorcodeptr points to error code variable
4021 firstcharptr place to put the first required character
4022 firstcharflagsptr place to put the first character flags, or a negative number
4023 reqcharptr place to put the last required character
4024 reqcharflagsptr place to put the last required character flags, or a negative number
4025 bcptr points to current branch chain
4026 cond_depth conditional nesting depth
4027 cd contains pointers to tables etc.
4028 lengthptr NULL during the real compile phase
4029 points to length accumulator during pre-compile phase
4030
4031 Returns: TRUE on success
4032 FALSE, with *errorcodeptr set non-zero on error
4033 */
4034
4035 static BOOL
4036 compile_branch(int *optionsptr, pcre_uchar **codeptr,
4037 const pcre_uchar **ptrptr, int *errorcodeptr,
4038 pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
4039 pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
4040 branch_chain *bcptr, int cond_depth,
4041 compile_data *cd, int *lengthptr)
4042 {
4043 int repeat_type, op_type;
4044 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
4045 int bravalue = 0;
4046 int greedy_default, greedy_non_default;
4047 pcre_uint32 firstchar, reqchar;
4048 pcre_int32 firstcharflags, reqcharflags;
4049 pcre_uint32 zeroreqchar, zerofirstchar;
4050 pcre_int32 zeroreqcharflags, zerofirstcharflags;
4051 pcre_int32 req_caseopt, reqvary, tempreqvary;
4052 int options = *optionsptr; /* May change dynamically */
4053 int after_manual_callout = 0;
4054 int length_prevgroup = 0;
4055 register pcre_uint32 c;
4056 int escape;
4057 register pcre_uchar *code = *codeptr;
4058 pcre_uchar *last_code = code;
4059 pcre_uchar *orig_code = code;
4060 pcre_uchar *tempcode;
4061 BOOL inescq = FALSE;
4062 BOOL groupsetfirstchar = FALSE;
4063 const pcre_uchar *ptr = *ptrptr;
4064 const pcre_uchar *tempptr;
4065 const pcre_uchar *nestptr = NULL;
4066 pcre_uchar *previous = NULL;
4067 pcre_uchar *previous_callout = NULL;
4068 pcre_uchar *save_hwm = NULL;
4069 pcre_uint8 classbits[32];
4070
4071 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
4072 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
4073 dynamically as we process the pattern. */
4074
4075 #ifdef SUPPORT_UTF
4076 /* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
4077 BOOL utf = (options & PCRE_UTF8) != 0;
4078 #ifndef COMPILE_PCRE32
4079 pcre_uchar utf_chars[6];
4080 #endif
4081 #else
4082 BOOL utf = FALSE;
4083 #endif
4084
4085 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
4086 class_uchardata always so that it can be passed to add_to_class() always,
4087 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
4088 alternative calls for the different cases. */
4089
4090 pcre_uchar *class_uchardata;
4091 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4092 BOOL xclass;
4093 pcre_uchar *class_uchardata_base;
4094 #endif
4095
4096 #ifdef PCRE_DEBUG
4097 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
4098 #endif
4099
4100 /* Set up the default and non-default settings for greediness */
4101
4102 greedy_default = ((options & PCRE_UNGREEDY) != 0);
4103 greedy_non_default = greedy_default ^ 1;
4104
4105 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
4106 matching encountered yet". It gets changed to REQ_NONE if we hit something that
4107 matches a non-fixed char first char; reqchar just remains unset if we never
4108 find one.
4109
4110 When we hit a repeat whose minimum is zero, we may have to adjust these values
4111 to take the zero repeat into account. This is implemented by setting them to
4112 zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
4113 item types that can be repeated set these backoff variables appropriately. */
4114
4115 firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
4116 firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
4117
4118 /* The variable req_caseopt contains either the REQ_CASELESS value
4119 or zero, according to the current setting of the caseless flag. The
4120 REQ_CASELESS leaves the lower 28 bit empty. It is added into the
4121 firstchar or reqchar variables to record the case status of the
4122 value. This is used only for ASCII characters. */
4123
4124 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
4125
4126 /* Switch on next character until the end of the branch */
4127
4128 for (;; ptr++)
4129 {
4130 BOOL negate_class;
4131 BOOL should_flip_negation;
4132 BOOL possessive_quantifier;
4133 BOOL is_quantifier;
4134 BOOL is_recurse;
4135 BOOL reset_bracount;
4136 int class_has_8bitchar;
4137 int class_one_char;
4138 int newoptions;
4139 int recno;
4140 int refsign;
4141 int skipbytes;
4142 pcre_uint32 subreqchar, subfirstchar;
4143 pcre_int32 subreqcharflags, subfirstcharflags;
4144 int terminator;
4145 unsigned int mclength;
4146 unsigned int tempbracount;
4147 pcre_uint32 ec;
4148 pcre_uchar mcbuffer[8];
4149
4150 /* Get next character in the pattern */
4151
4152 c = *ptr;
4153
4154 /* If we are at the end of a nested substitution, revert to the outer level
4155 string. Nesting only happens one level deep. */
4156
4157 if (c == CHAR_NULL && nestptr != NULL)
4158 {
4159 ptr = nestptr;
4160 nestptr = NULL;
4161 c = *ptr;
4162 }
4163
4164 /* If we are in the pre-compile phase, accumulate the length used for the
4165 previous cycle of this loop. */
4166
4167 if (lengthptr != NULL)
4168 {
4169 #ifdef PCRE_DEBUG
4170 if (code > cd->hwm) cd->hwm = code; /* High water info */
4171 #endif
4172 if (code > cd->start_workspace + cd->workspace_size -
4173 WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
4174 {
4175 *errorcodeptr = ERR52;
4176 goto FAILED;
4177 }
4178
4179 /* There is at least one situation where code goes backwards: this is the
4180 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
4181 the class is simply eliminated. However, it is created first, so we have to
4182 allow memory for it. Therefore, don't ever reduce the length at this point.
4183 */
4184
4185 if (code < last_code) code = last_code;
4186
4187 /* Paranoid check for integer overflow */
4188
4189 if (OFLOW_MAX - *lengthptr < code - last_code)
4190 {
4191 *errorcodeptr = ERR20;
4192 goto FAILED;
4193 }
4194
4195 *lengthptr += (int)(code - last_code);
4196 DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
4197 (int)(code - last_code), c, c));
4198
4199 /* If "previous" is set and it is not at the start of the work space, move
4200 it back to there, in order to avoid filling up the work space. Otherwise,
4201 if "previous" is NULL, reset the current code pointer to the start. */
4202
4203 if (previous != NULL)
4204 {
4205 if (previous > orig_code)
4206 {
4207 memmove(orig_code, previous, IN_UCHARS(code - previous));
4208 code -= previous - orig_code;
4209 previous = orig_code;
4210 }
4211 }
4212 else code = orig_code;
4213
4214 /* Remember where this code item starts so we can pick up the length
4215 next time round. */
4216
4217 last_code = code;
4218 }
4219
4220 /* In the real compile phase, just check the workspace used by the forward
4221 reference list. */
4222
4223 else if (cd->hwm > cd->start_workspace + cd->workspace_size -
4224 WORK_SIZE_SAFETY_MARGIN)
4225 {
4226 *errorcodeptr = ERR52;
4227 goto FAILED;
4228 }
4229
4230 /* If in \Q...\E, check for the end; if not, we have a literal */
4231
4232 if (inescq && c != CHAR_NULL)
4233 {
4234 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4235 {
4236 inescq = FALSE;
4237 ptr++;
4238 continue;
4239 }
4240 else
4241 {
4242 if (previous_callout != NULL)
4243 {
4244 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
4245 complete_callout(previous_callout, ptr, cd);
4246 previous_callout = NULL;
4247 }
4248 if ((options & PCRE_AUTO_CALLOUT) != 0)
4249 {
4250 previous_callout = code;
4251 code = auto_callout(code, ptr, cd);
4252 }
4253 goto NORMAL_CHAR;
4254 }
4255 }
4256
4257 is_quantifier =
4258 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4259 (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4260
4261 /* Fill in length of a previous callout, except when the next thing is a
4262 quantifier or when processing a property substitution string in UCP mode. */
4263
4264 if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4265 after_manual_callout-- <= 0)
4266 {
4267 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
4268 complete_callout(previous_callout, ptr, cd);
4269 previous_callout = NULL;
4270 }
4271
4272 /* In extended mode, skip white space and comments. */
4273
4274 if ((options & PCRE_EXTENDED) != 0)
4275 {
4276 if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;
4277 if (c == CHAR_NUMBER_SIGN)
4278 {
4279 ptr++;
4280 while (*ptr != CHAR_NULL)
4281 {
4282 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
4283 ptr++;
4284 #ifdef SUPPORT_UTF
4285 if (utf) FORWARDCHAR(ptr);
4286 #endif
4287 }
4288 if (*ptr != CHAR_NULL) continue;
4289
4290 /* Else fall through to handle end of string */
4291 c = 0;
4292 }
4293 }
4294
4295 /* No auto callout for quantifiers, or while processing property strings that
4296 are substituted for \w etc in UCP mode. */
4297
4298 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
4299 {
4300 previous_callout = code;
4301 code = auto_callout(code, ptr, cd);
4302 }
4303
4304 switch(c)
4305 {
4306 /* ===================================================================*/
4307 case 0: /* The branch terminates at string end */
4308 case CHAR_VERTICAL_LINE: /* or | or ) */
4309 case CHAR_RIGHT_PARENTHESIS:
4310 *firstcharptr = firstchar;
4311 *firstcharflagsptr = firstcharflags;
4312 *reqcharptr = reqchar;
4313 *reqcharflagsptr = reqcharflags;
4314 *codeptr = code;
4315 *ptrptr = ptr;
4316 if (lengthptr != NULL)
4317 {
4318 if (OFLOW_MAX - *lengthptr < code - last_code)
4319 {
4320 *errorcodeptr = ERR20;
4321 goto FAILED;
4322 }
4323 *lengthptr += (int)(code - last_code); /* To include callout length */
4324 DPRINTF((">> end branch\n"));
4325 }
4326 return TRUE;
4327
4328
4329 /* ===================================================================*/
4330 /* Handle single-character metacharacters. In multiline mode, ^ disables
4331 the setting of any following char as a first character. */
4332
4333 case CHAR_CIRCUMFLEX_ACCENT:
4334 previous = NULL;
4335 if ((options & PCRE_MULTILINE) != 0)
4336 {
4337 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4338 *code++ = OP_CIRCM;
4339 }
4340 else *code++ = OP_CIRC;
4341 break;
4342
4343 case CHAR_DOLLAR_SIGN:
4344 previous = NULL;
4345 *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
4346 break;
4347
4348 /* There can never be a first char if '.' is first, whatever happens about
4349 repeats. The value of reqchar doesn't change either. */
4350
4351 case CHAR_DOT:
4352 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4353 zerofirstchar = firstchar;
4354 zerofirstcharflags = firstcharflags;
4355 zeroreqchar = reqchar;
4356 zeroreqcharflags = reqcharflags;
4357 previous = code;
4358 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
4359 break;
4360
4361
4362 /* ===================================================================*/
4363 /* Character classes. If the included characters are all < 256, we build a
4364 32-byte bitmap of the permitted characters, except in the special case
4365 where there is only one such character. For negated classes, we build the
4366 map as usual, then invert it at the end. However, we use a different opcode
4367 so that data characters > 255 can be handled correctly.
4368
4369 If the class contains characters outside the 0-255 range, a different
4370 opcode is compiled. It may optionally have a bit map for characters < 256,
4371 but those above are explicitly listed afterwards. A flag byte tells
4372 whether the bitmap is present, and whether this is a negated class or not.
4373
4374 In JavaScript compatibility mode, an isolated ']' causes an error. In
4375 default (Perl) mode, it is treated as a data character. */
4376
4377 case CHAR_RIGHT_SQUARE_BRACKET:
4378 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4379 {
4380 *errorcodeptr = ERR64;
4381 goto FAILED;
4382 }
4383 goto NORMAL_CHAR;
4384
4385 case CHAR_LEFT_SQUARE_BRACKET:
4386 previous = code;
4387
4388 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
4389 they are encountered at the top level, so we'll do that too. */
4390
4391 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4392 ptr[1] == CHAR_EQUALS_SIGN) &&
4393 check_posix_syntax(ptr, &tempptr))
4394 {
4395 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
4396 goto FAILED;
4397 }
4398
4399 /* If the first character is '^', set the negation flag and skip it. Also,
4400 if the first few characters (either before or after ^) are \Q\E or \E we
4401 skip them too. This makes for compatibility with Perl. */
4402
4403 negate_class = FALSE;
4404 for (;;)
4405 {
4406 c = *(++ptr);
4407 if (c == CHAR_BACKSLASH)
4408 {
4409 if (ptr[1] == CHAR_E)
4410 ptr++;
4411 else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4412 ptr += 3;
4413 else
4414 break;
4415 }
4416 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4417 negate_class = TRUE;
4418 else break;
4419 }
4420
4421 /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
4422 an initial ']' is taken as a data character -- the code below handles
4423 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
4424 [^] must match any character, so generate OP_ALLANY. */
4425
4426 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4427 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4428 {
4429 *code++ = negate_class? OP_ALLANY : OP_FAIL;
4430 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4431 zerofirstchar = firstchar;
4432 zerofirstcharflags = firstcharflags;
4433 break;
4434 }
4435
4436 /* If a class contains a negative special such as \S, we need to flip the
4437 negation flag at the end, so that support for characters > 255 works
4438 correctly (they are all included in the class). */
4439
4440 should_flip_negation = FALSE;
4441
4442 /* For optimization purposes, we track some properties of the class:
4443 class_has_8bitchar will be non-zero if the class contains at least one <
4444 256 character; class_one_char will be 1 if the class contains just one
4445 character. */
4446
4447 class_has_8bitchar = 0;
4448 class_one_char = 0;
4449
4450 /* Initialize the 32-char bit map to all zeros. We build the map in a
4451 temporary bit of memory, in case the class contains fewer than two
4452 8-bit characters because in that case the compiled code doesn't use the bit
4453 map. */
4454
4455 memset(classbits, 0, 32 * sizeof(pcre_uint8));
4456
4457 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4458 xclass = FALSE;
4459 class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
4460 class_uchardata_base = class_uchardata; /* Save the start */
4461 #endif
4462
4463 /* Process characters until ] is reached. By writing this as a "do" it
4464 means that an initial ] is taken as a data character. At the start of the
4465 loop, c contains the first byte of the character. */
4466
4467 if (c != CHAR_NULL) do
4468 {
4469 const pcre_uchar *oldptr;
4470
4471 #ifdef SUPPORT_UTF
4472 if (utf && HAS_EXTRALEN(c))
4473 { /* Braces are required because the */
4474 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
4475 }
4476 #endif
4477
4478 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4479 /* In the pre-compile phase, accumulate the length of any extra
4480 data and reset the pointer. This is so that very large classes that
4481 contain a zillion > 255 characters no longer overwrite the work space
4482 (which is on the stack). We have to remember that there was XCLASS data,
4483 however. */
4484
4485 if (lengthptr != NULL && class_uchardata > class_uchardata_base)
4486 {
4487 xclass = TRUE;
4488 *lengthptr += class_uchardata - class_uchardata_base;
4489 class_uchardata = class_uchardata_base;
4490 }
4491 #endif
4492
4493 /* Inside \Q...\E everything is literal except \E */
4494
4495 if (inescq)
4496 {
4497 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
4498 {
4499 inescq = FALSE; /* Reset literal state */
4500 ptr++; /* Skip the 'E' */
4501 continue; /* Carry on with next */
4502 }
4503 goto CHECK_RANGE; /* Could be range if \E follows */
4504 }
4505
4506 /* Handle POSIX class names. Perl allows a negation extension of the
4507 form [:^name:]. A square bracket that doesn't match the syntax is
4508 treated as a literal. We also recognize the POSIX constructions
4509 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
4510 5.6 and 5.8 do. */
4511
4512 if (c == CHAR_LEFT_SQUARE_BRACKET &&
4513 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4514 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
4515 {
4516 BOOL local_negate = FALSE;
4517 int posix_class, taboffset, tabopt;
4518 register const pcre_uint8 *cbits = cd->cbits;
4519 pcre_uint8 pbits[32];
4520
4521 if (ptr[1] != CHAR_COLON)
4522 {
4523 *errorcodeptr = ERR31;
4524 goto FAILED;
4525 }
4526
4527 ptr += 2;
4528 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
4529 {
4530 local_negate = TRUE;
4531 should_flip_negation = TRUE; /* Note negative special */
4532 ptr++;
4533 }
4534
4535 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
4536 if (posix_class < 0)
4537 {
4538 *errorcodeptr = ERR30;
4539 goto FAILED;
4540 }
4541
4542 /* If matching is caseless, upper and lower are converted to
4543 alpha. This relies on the fact that the class table starts with
4544 alpha, lower, upper as the first 3 entries. */
4545
4546 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
4547 posix_class = 0;
4548
4549 /* When PCRE_UCP is set, some of the POSIX classes are converted to
4550 different escape sequences that use Unicode properties. */
4551
4552 #ifdef SUPPORT_UCP
4553 if ((options & PCRE_UCP) != 0)
4554 {
4555 int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
4556 if (posix_substitutes[pc] != NULL)
4557 {
4558 nestptr = tempptr + 1;
4559 ptr = posix_substitutes[pc] - 1;
4560 continue;
4561 }
4562 }
4563 #endif
4564 /* In the non-UCP case, we build the bit map for the POSIX class in a
4565 chunk of local store because we may be adding and subtracting from it,
4566 and we don't want to subtract bits that may be in the main map already.
4567 At the end we or the result into the bit map that is being built. */
4568
4569 posix_class *= 3;
4570
4571 /* Copy in the first table (always present) */
4572
4573 memcpy(pbits, cbits + posix_class_maps[posix_class],
4574 32 * sizeof(pcre_uint8));
4575
4576 /* If there is a second table, add or remove it as required. */
4577
4578 taboffset = posix_class_maps[posix_class + 1];
4579 tabopt = posix_class_maps[posix_class + 2];
4580
4581 if (taboffset >= 0)
4582 {
4583 if (tabopt >= 0)
4584 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
4585 else
4586 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
4587 }
4588
4589 /* Now see if we need to remove any special characters. An option
4590 value of 1 removes vertical space and 2 removes underscore. */
4591
4592 if (tabopt < 0) tabopt = -tabopt;
4593 if (tabopt == 1) pbits[1] &= ~0x3c;
4594 else if (tabopt == 2) pbits[11] &= 0x7f;
4595
4596 /* Add the POSIX table or its complement into the main table that is
4597 being built and we are done. */
4598
4599 if (local_negate)
4600 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
4601 else
4602 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
4603
4604 ptr = tempptr + 1;
4605 /* Every class contains at least one < 256 character. */
4606 class_has_8bitchar = 1;
4607 /* Every class contains at least two characters. */
4608 class_one_char = 2;
4609 continue; /* End of POSIX syntax handling */
4610 }
4611
4612 /* Backslash may introduce a single character, or it may introduce one
4613 of the specials, which just set a flag. The sequence \b is a special
4614 case. Inside a class (and only there) it is treated as backspace. We
4615 assume that other escapes have more than one character in them, so
4616 speculatively set both class_has_8bitchar and class_one_char bigger
4617 than one. Unrecognized escapes fall through and are either treated
4618 as literal characters (by default), or are faulted if
4619 PCRE_EXTRA is set. */
4620
4621 if (c == CHAR_BACKSLASH)
4622 {
4623 escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
4624 TRUE);
4625 if (*errorcodeptr != 0) goto FAILED;
4626 if (escape == 0) c = ec;
4627 else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
4628 else if (escape == ESC_N) /* \N is not supported in a class */
4629 {
4630 *errorcodeptr = ERR71;
4631 goto FAILED;
4632 }
4633 else if (escape == ESC_Q) /* Handle start of quoted string */
4634 {
4635 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4636 {
4637 ptr += 2; /* avoid empty string */
4638 }
4639 else inescq = TRUE;
4640 continue;
4641 }
4642 else if (escape == ESC_E) continue; /* Ignore orphan \E */
4643
4644 else
4645 {
4646 register const pcre_uint8 *cbits = cd->cbits;
4647 /* Every class contains at least two < 256 characters. */
4648 class_has_8bitchar++;
4649 /* Every class contains at least two characters. */
4650 class_one_char += 2;
4651
4652 switch (escape)
4653 {
4654 #ifdef SUPPORT_UCP
4655 case ESC_du: /* These are the values given for \d etc */
4656 case ESC_DU: /* when PCRE_UCP is set. We replace the */
4657 case ESC_wu: /* escape sequence with an appropriate \p */
4658 case ESC_WU: /* or \P to test Unicode properties instead */
4659 case ESC_su: /* of the default ASCII testing. */
4660 case ESC_SU:
4661 nestptr = ptr;
4662 ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */
4663 class_has_8bitchar--; /* Undo! */
4664 continue;
4665 #endif
4666 case ESC_d:
4667 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
4668 continue;
4669
4670 case ESC_D:
4671 should_flip_negation = TRUE;
4672 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
4673 continue;
4674
4675 case ESC_w:
4676 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
4677 continue;
4678
4679 case ESC_W:
4680 should_flip_negation = TRUE;
4681 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
4682 continue;
4683
4684 /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
4685 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
4686 previously set by something earlier in the character class.
4687 Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
4688 we could just adjust the appropriate bit. From PCRE 8.34 we no
4689 longer treat \s and \S specially. */
4690
4691 case ESC_s:
4692 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
4693 continue;
4694
4695 case ESC_S:
4696 should_flip_negation = TRUE;
4697 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
4698 continue;
4699
4700 /* The rest apply in both UCP and non-UCP cases. */
4701
4702 case ESC_h:
4703 (void)add_list_to_class(classbits, &class_uchardata, options, cd,
4704 PRIV(hspace_list), NOTACHAR);
4705 continue;
4706
4707 case ESC_H:
4708 (void)add_not_list_to_class(classbits, &class_uchardata, options,
4709 cd, PRIV(hspace_list));
4710 continue;
4711
4712 case ESC_v:
4713 (void)add_list_to_class(classbits, &class_uchardata, options, cd,
4714 PRIV(vspace_list), NOTACHAR);
4715 continue;
4716
4717 case ESC_V:
4718 (void)add_not_list_to_class(classbits, &class_uchardata, options,
4719 cd, PRIV(vspace_list));
4720 continue;
4721
4722 #ifdef SUPPORT_UCP
4723 case ESC_p:
4724 case ESC_P:
4725 {
4726 BOOL negated;
4727 unsigned int ptype = 0, pdata = 0;
4728 if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
4729 goto FAILED;
4730 *class_uchardata++ = ((escape == ESC_p) != negated)?
4731 XCL_PROP : XCL_NOTPROP;
4732 *class_uchardata++ = ptype;
4733 *class_uchardata++ = pdata;
4734 class_has_8bitchar--; /* Undo! */
4735 continue;
4736 }
4737 #endif
4738 /* Unrecognized escapes are faulted if PCRE is running in its
4739 strict mode. By default, for compatibility with Perl, they are
4740 treated as literals. */
4741
4742 default:
4743 if ((options & PCRE_EXTRA) != 0)
4744 {
4745 *errorcodeptr = ERR7;
4746 goto FAILED;
4747 }
4748 class_has_8bitchar--; /* Undo the speculative increase. */
4749 class_one_char -= 2; /* Undo the speculative increase. */
4750 c = *ptr; /* Get the final character and fall through */
4751 break;
4752 }
4753 }
4754
4755 /* Fall through if the escape just defined a single character (c >= 0).
4756 This may be greater than 256. */
4757
4758 escape = 0;
4759
4760 } /* End of backslash handling */
4761
4762 /* A character may be followed by '-' to form a range. However, Perl does
4763 not permit ']' to be the end of the range. A '-' character at the end is
4764 treated as a literal. Perl ignores orphaned \E sequences entirely. The
4765 code for handling \Q and \E is messy. */
4766
4767 CHECK_RANGE:
4768 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4769 {
4770 inescq = FALSE;
4771 ptr += 2;
4772 }
4773 oldptr = ptr;
4774
4775 /* Remember if \r or \n were explicitly used */
4776
4777 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4778
4779 /* Check for range */
4780
4781 if (!inescq && ptr[1] == CHAR_MINUS)
4782 {
4783 pcre_uint32 d;
4784 ptr += 2;
4785 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
4786
4787 /* If we hit \Q (not followed by \E) at this point, go into escaped
4788 mode. */
4789
4790 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
4791 {
4792 ptr += 2;
4793 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4794 { ptr += 2; continue; }
4795 inescq = TRUE;
4796 break;
4797 }
4798
4799 /* Minus (hyphen) at the end of a class is treated as a literal, so put
4800 back the pointer and jump to handle the character that preceded it. */
4801
4802 if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
4803 {
4804 ptr = oldptr;
4805 goto CLASS_SINGLE_CHARACTER;
4806 }
4807
4808 /* Otherwise, we have a potential range; pick up the next character */
4809
4810 #ifdef SUPPORT_UTF
4811 if (utf)
4812 { /* Braces are required because the */
4813 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
4814 }
4815 else
4816 #endif
4817 d = *ptr; /* Not UTF-8 mode */
4818
4819 /* The second part of a range can be a single-character escape, but
4820 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
4821 in such circumstances. */
4822
4823 if (!inescq && d == CHAR_BACKSLASH)
4824 {
4825 int descape;
4826 descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
4827 if (*errorcodeptr != 0) goto FAILED;
4828
4829 /* \b is backspace; any other special means the '-' was literal. */
4830
4831 if (descape != 0)
4832 {
4833 if (descape == ESC_b) d = CHAR_BS; else
4834 {
4835 ptr = oldptr;
4836 goto CLASS_SINGLE_CHARACTER; /* A few lines below */
4837 }
4838 }
4839 }
4840
4841 /* Check that the two values are in the correct order. Optimize
4842 one-character ranges. */
4843
4844 if (d < c)
4845 {
4846 *errorcodeptr = ERR8;
4847 goto FAILED;
4848 }
4849 if (d == c) goto CLASS_SINGLE_CHARACTER; /* A few lines below */
4850
4851 /* We have found a character range, so single character optimizations
4852 cannot be done anymore. Any value greater than 1 indicates that there
4853 is more than one character. */
4854
4855 class_one_char = 2;
4856
4857 /* Remember an explicit \r or \n, and add the range to the class. */
4858
4859 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4860
4861 class_has_8bitchar +=
4862 add_to_class(classbits, &class_uchardata, options, cd, c, d);
4863
4864 continue; /* Go get the next char in the class */
4865 }
4866
4867 /* Handle a single character - we can get here for a normal non-escape
4868 char, or after \ that introduces a single character or for an apparent
4869 range that isn't. Only the value 1 matters for class_one_char, so don't
4870 increase it if it is already 2 or more ... just in case there's a class
4871 with a zillion characters in it. */
4872
4873 CLASS_SINGLE_CHARACTER:
4874 if (class_one_char < 2) class_one_char++;
4875
4876 /* If class_one_char is 1, we have the first single character in the
4877 class, and there have been no prior ranges, or XCLASS items generated by
4878 escapes. If this is the final character in the class, we can optimize by
4879 turning the item into a 1-character OP_CHAR[I] if it's positive, or
4880 OP_NOT[I] if it's negative. In the positive case, it can cause firstchar
4881 to be set. Otherwise, there can be no first char if this item is first,
4882 whatever repeat count may follow. In the case of reqchar, save the
4883 previous value for reinstating. */
4884
4885 if (class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
4886 {
4887 ptr++;
4888 zeroreqchar = reqchar;
4889 zeroreqcharflags = reqcharflags;
4890
4891 if (negate_class)
4892 {
4893 #ifdef SUPPORT_UCP
4894 int d;
4895 #endif
4896 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4897 zerofirstchar = firstchar;
4898 zerofirstcharflags = firstcharflags;
4899
4900 /* For caseless UTF-8 mode when UCP support is available, check
4901 whether this character has more than one other case. If so, generate
4902 a special OP_NOTPROP item instead of OP_NOTI. */
4903
4904 #ifdef SUPPORT_UCP
4905 if (utf && (options & PCRE_CASELESS) != 0 &&
4906 (d = UCD_CASESET(c)) != 0)
4907 {
4908 *code++ = OP_NOTPROP;
4909 *code++ = PT_CLIST;
4910 *code++ = d;
4911 }
4912 else
4913 #endif
4914 /* Char has only one other case, or UCP not available */
4915
4916 {
4917 *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4918 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
4919 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4920 code += PRIV(ord2utf)(c, code);
4921 else
4922 #endif
4923 *code++ = c;
4924 }
4925
4926 /* We are finished with this character class */
4927
4928 goto END_CLASS;
4929 }
4930
4931 /* For a single, positive character, get the value into mcbuffer, and
4932 then we can handle this with the normal one-character code. */
4933
4934 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
4935 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4936 mclength = PRIV(ord2utf)(c, mcbuffer);
4937 else
4938 #endif
4939 {
4940 mcbuffer[0] = c;
4941 mclength = 1;
4942 }
4943 goto ONE_CHAR;
4944 } /* End of 1-char optimization */
4945
4946 /* There is more than one character in the class, or an XCLASS item
4947 has been generated. Add this character to the class. */
4948
4949 class_has_8bitchar +=
4950 add_to_class(classbits, &class_uchardata, options, cd, c, c);
4951 }
4952
4953 /* Loop until ']' reached. This "while" is the end of the "do" far above.
4954 If we are at the end of an internal nested string, revert to the outer
4955 string. */
4956
4957 while (((c = *(++ptr)) != CHAR_NULL ||
4958 (nestptr != NULL &&
4959 (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
4960 (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
4961
4962 /* Check for missing terminating ']' */
4963
4964 if (c == CHAR_NULL)
4965 {
4966 *errorcodeptr = ERR6;
4967 goto FAILED;
4968 }
4969
4970 /* We will need an XCLASS if data has been placed in class_uchardata. In
4971 the second phase this is a sufficient test. However, in the pre-compile
4972 phase, class_uchardata gets emptied to prevent workspace overflow, so
4973 only if the very last character in the class needs XCLASS will it contain
4974 anything at this point. For this reason, xclass gets set TRUE above when
4975 class_uchardata is emptied, and that's why this code is the way it is here
4976 instead of just doing a test on class_uchardata below. */
4977
4978 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4979 if (class_uchardata > class_uchardata_base) xclass = TRUE;
4980 #endif
4981
4982 /* If this is the first thing in the branch, there can be no first char
4983 setting, whatever the repeat count. Any reqchar setting must remain
4984 unchanged after any kind of repeat. */
4985
4986 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4987 zerofirstchar = firstchar;
4988 zerofirstcharflags = firstcharflags;
4989 zeroreqchar = reqchar;
4990 zeroreqcharflags = reqcharflags;
4991
4992 /* If there are characters with values > 255, we have to compile an
4993 extended class, with its own opcode, unless there was a negated special
4994 such as \S in the class, and PCRE_UCP is not set, because in that case all
4995 characters > 255 are in the class, so any that were explicitly given as
4996 well can be ignored. If (when there are explicit characters > 255 that must
4997 be listed) there are no characters < 256, we can omit the bitmap in the
4998 actual compiled code. */
4999
5000 #ifdef SUPPORT_UTF
5001 if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
5002 #elif !defined COMPILE_PCRE8
5003 if (xclass && !should_flip_negation)
5004 #endif
5005 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5006 {
5007 *class_uchardata++ = XCL_END; /* Marks the end of extra data */
5008 *code++ = OP_XCLASS;
5009 code += LINK_SIZE;
5010 *code = negate_class? XCL_NOT:0;
5011
5012 /* If the map is required, move up the extra data to make room for it;
5013 otherwise just move the code pointer to the end of the extra data. */
5014
5015 if (class_has_8bitchar > 0)
5016 {
5017 *code++ |= XCL_MAP;
5018 memmove(code + (32 / sizeof(pcre_uchar)), code,
5019 IN_UCHARS(class_uchardata - code));
5020 memcpy(code, classbits, 32);
5021 code = class_uchardata + (32 / sizeof(pcre_uchar));
5022 }
5023 else code = class_uchardata;
5024
5025 /* Now fill in the complete length of the item */
5026
5027 PUT(previous, 1, (int)(code - previous));
5028 break; /* End of class handling */
5029 }
5030 #endif
5031
5032 /* If there are no characters > 255, or they are all to be included or
5033 excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5034 whole class was negated and whether there were negative specials such as \S
5035 (non-UCP) in the class. Then copy the 32-byte map into the code vector,
5036 negating it if necessary. */
5037
5038 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5039 if (lengthptr == NULL) /* Save time in the pre-compile phase */
5040 {
5041 if (negate_class)
5042 for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5043 memcpy(code, classbits, 32);
5044 }
5045 code += 32 / sizeof(pcre_uchar);
5046
5047 END_CLASS:
5048 break;
5049
5050
5051 /* ===================================================================*/
5052 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
5053 has been tested above. */
5054
5055 case CHAR_LEFT_CURLY_BRACKET:
5056 if (!is_quantifier) goto NORMAL_CHAR;
5057 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
5058 if (*errorcodeptr != 0) goto FAILED;
5059 goto REPEAT;
5060
5061 case CHAR_ASTERISK:
5062 repeat_min = 0;
5063 repeat_max = -1;
5064 goto REPEAT;
5065
5066 case CHAR_PLUS:
5067 repeat_min = 1;
5068 repeat_max = -1;
5069 goto REPEAT;
5070
5071 case CHAR_QUESTION_MARK:
5072 repeat_min = 0;
5073 repeat_max = 1;
5074
5075 REPEAT:
5076 if (previous == NULL)
5077 {
5078 *errorcodeptr = ERR9;
5079 goto FAILED;
5080 }
5081
5082 if (repeat_min == 0)
5083 {
5084 firstchar = zerofirstchar; /* Adjust for zero repeat */
5085 firstcharflags = zerofirstcharflags;
5086 reqchar = zeroreqchar; /* Ditto */
5087 reqcharflags = zeroreqcharflags;
5088 }
5089
5090 /* Remember whether this is a variable length repeat */
5091
5092 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
5093
5094 op_type = 0; /* Default single-char op codes */
5095 possessive_quantifier = FALSE; /* Default not possessive quantifier */
5096
5097 /* Save start of previous item, in case we have to move it up in order to
5098 insert something before it. */
5099
5100 tempcode = previous;
5101
5102 /* If the next character is '+', we have a possessive quantifier. This
5103 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
5104 If the next character is '?' this is a minimizing repeat, by default,
5105 but if PCRE_UNGREEDY is set, it works the other way round. We change the
5106 repeat type to the non-default. */
5107
5108 if (ptr[1] == CHAR_PLUS)
5109 {
5110 repeat_type = 0; /* Force greedy */
5111 possessive_quantifier = TRUE;
5112 ptr++;
5113 }
5114 else if (ptr[1] == CHAR_QUESTION_MARK)
5115 {
5116 repeat_type = greedy_non_default;
5117 ptr++;
5118 }
5119 else repeat_type = greedy_default;
5120
5121 /* If previous was a recursion call, wrap it in atomic brackets so that
5122 previous becomes the atomic group. All recursions were so wrapped in the
5123 past, but it no longer happens for non-repeated recursions. In fact, the
5124 repeated ones could be re-implemented independently so as not to need this,
5125 but for the moment we rely on the code for repeating groups. */
5126
5127 if (*previous == OP_RECURSE)
5128 {
5129 memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
5130 *previous = OP_ONCE;
5131 PUT(previous, 1, 2 + 2*LINK_SIZE);
5132 previous[2 + 2*LINK_SIZE] = OP_KET;
5133 PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
5134 code += 2 + 2 * LINK_SIZE;
5135 length_prevgroup = 3 + 3*LINK_SIZE;
5136
5137 /* When actually compiling, we need to check whether this was a forward
5138 reference, and if so, adjust the offset. */
5139
5140 if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
5141 {
5142 int offset = GET(cd->hwm, -LINK_SIZE);
5143 if (offset == previous + 1 - cd->start_code)
5144 PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
5145 }
5146 }
5147
5148 /* Now handle repetition for the different types of item. */
5149
5150 /* If previous was a character or negated character match, abolish the item
5151 and generate a repeat item instead. If a char item has a minimum of more
5152 than one, ensure that it is set in reqchar - it might not be if a sequence
5153 such as x{3} is the first thing in a branch because the x will have gone
5154 into firstchar instead. */
5155
5156 if (*previous == OP_CHAR || *previous == OP_CHARI
5157 || *previous == OP_NOT || *previous == OP_NOTI)
5158 {
5159 switch (*previous)
5160 {
5161 default: /* Make compiler happy. */
5162 case OP_CHAR: op_type = OP_STAR - OP_STAR; break;
5163 case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
5164 case OP_NOT: op_type = OP_NOTSTAR - OP_STAR; break;
5165 case OP_NOTI: op_type = OP_NOTSTARI - OP_STAR; break;
5166 }
5167
5168 /* Deal with UTF characters that take up more than one character. It's
5169 easier to write this out separately than try to macrify it. Use c to
5170 hold the length of the character in bytes, plus UTF_LENGTH to flag that
5171 it's a length rather than a small character. */
5172
5173 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5174 if (utf && NOT_FIRSTCHAR(code[-1]))
5175 {
5176 pcre_uchar *lastchar = code - 1;
5177 BACKCHAR(lastchar);
5178 c = (int)(code - lastchar); /* Length of UTF-8 character */
5179 memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
5180 c |= UTF_LENGTH; /* Flag c as a length */
5181 }
5182 else
5183 #endif /* SUPPORT_UTF */
5184
5185 /* Handle the case of a single character - either with no UTF support, or
5186 with UTF disabled, or for a single character UTF character. */
5187 {
5188 c = code[-1];
5189 if (*previous <= OP_CHARI && repeat_min > 1)
5190 {
5191 reqchar = c;
5192 reqcharflags = req_caseopt | cd->req_varyopt;
5193 }
5194 }
5195
5196 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
5197 }
5198
5199 /* If previous was a character type match (\d or similar), abolish it and
5200 create a suitable repeat item. The code is shared with single-character
5201 repeats by setting op_type to add a suitable offset into repeat_type. Note
5202 that the Unicode property types will be present only when SUPPORT_UCP is
5203 defined, but we don't wrap the little bits of code here because it just
5204 makes it horribly messy. */
5205
5206 else if (*previous < OP_EODN)
5207 {
5208 pcre_uchar *oldcode;
5209 int prop_type, prop_value;
5210 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
5211 c = *previous;
5212
5213 OUTPUT_SINGLE_REPEAT:
5214 if (*previous == OP_PROP || *previous == OP_NOTPROP)
5215 {
5216 prop_type = previous[1];
5217 prop_value = previous[2];
5218 }
5219 else prop_type = prop_value = -1;
5220
5221 oldcode = code;
5222 code = previous; /* Usually overwrite previous item */
5223
5224 /* If the maximum is zero then the minimum must also be zero; Perl allows
5225 this case, so we do too - by simply omitting the item altogether. */
5226
5227 if (repeat_max == 0) goto END_REPEAT;
5228
5229 /* Combine the op_type with the repeat_type */
5230
5231 repeat_type += op_type;
5232
5233 /* A minimum of zero is handled either as the special case * or ?, or as
5234 an UPTO, with the maximum given. */
5235
5236 if (repeat_min == 0)
5237 {
5238 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
5239 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
5240 else
5241 {
5242 *code++ = OP_UPTO + repeat_type;
5243 PUT2INC(code, 0, repeat_max);
5244 }
5245 }
5246
5247 /* A repeat minimum of 1 is optimized into some special cases. If the
5248 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
5249 left in place and, if the maximum is greater than 1, we use OP_UPTO with
5250 one less than the maximum. */
5251
5252 else if (repeat_min == 1)
5253 {
5254 if (repeat_max == -1)
5255 *code++ = OP_PLUS + repeat_type;
5256 else
5257 {
5258 code = oldcode; /* leave previous item in place */
5259 if (repeat_max == 1) goto END_REPEAT;
5260 *code++ = OP_UPTO + repeat_type;
5261 PUT2INC(code, 0, repeat_max - 1);
5262 }
5263 }
5264
5265 /* The case {n,n} is just an EXACT, while the general case {n,m} is
5266 handled as an EXACT followed by an UPTO. */
5267
5268 else
5269 {
5270 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
5271 PUT2INC(code, 0, repeat_min);
5272
5273 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
5274 we have to insert the character for the previous code. For a repeated
5275 Unicode property match, there are two extra bytes that define the
5276 required property. In UTF-8 mode, long characters have their length in
5277 c, with the UTF_LENGTH bit as a flag. */
5278
5279 if (repeat_max < 0)
5280 {
5281 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5282 if (utf && (c & UTF_LENGTH) != 0)
5283 {
5284 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5285 code += c & 7;
5286 }
5287 else
5288 #endif
5289 {
5290 *code++ = c;
5291 if (prop_type >= 0)
5292 {
5293 *code++ = prop_type;
5294 *code++ = prop_value;
5295 }
5296 }
5297 *code++ = OP_STAR + repeat_type;
5298 }
5299
5300 /* Else insert an UPTO if the max is greater than the min, again
5301 preceded by the character, for the previously inserted code. If the
5302 UPTO is just for 1 instance, we can use QUERY instead. */
5303
5304 else if (repeat_max != repeat_min)
5305 {
5306 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5307 if (utf && (c & UTF_LENGTH) != 0)
5308 {
5309 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5310 code += c & 7;
5311 }
5312 else
5313 #endif
5314 *code++ = c;
5315 if (prop_type >= 0)
5316 {
5317 *code++ = prop_type;
5318 *code++ = prop_value;
5319 }
5320 repeat_max -= repeat_min;
5321
5322 if (repeat_max == 1)
5323 {
5324 *code++ = OP_QUERY + repeat_type;
5325 }
5326 else
5327 {
5328 *code++ = OP_UPTO + repeat_type;
5329 PUT2INC(code, 0, repeat_max);
5330 }
5331 }
5332 }
5333
5334 /* The character or character type itself comes last in all cases. */
5335
5336 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5337 if (utf && (c & UTF_LENGTH) != 0)
5338 {
5339 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5340 code += c & 7;
5341 }
5342 else
5343 #endif
5344 *code++ = c;
5345
5346 /* For a repeated Unicode property match, there are two extra bytes that
5347 define the required property. */
5348
5349 #ifdef SUPPORT_UCP
5350 if (prop_type >= 0)
5351 {
5352 *code++ = prop_type;
5353 *code++ = prop_value;
5354 }
5355 #endif
5356 }
5357
5358 /* If previous was a character class or a back reference, we put the repeat
5359 stuff after it, but just skip the item if the repeat was {0,0}. */
5360
5361 else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
5362 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5363 *previous == OP_XCLASS ||
5364 #endif
5365 *previous == OP_REF || *previous == OP_REFI ||
5366 *previous == OP_DNREF || *previous == OP_DNREFI)
5367 {
5368 if (repeat_max == 0)
5369 {
5370 code = previous;
5371 goto END_REPEAT;
5372 }
5373
5374 if (repeat_min == 0 && repeat_max == -1)
5375 *code++ = OP_CRSTAR + repeat_type;
5376 else if (repeat_min == 1 && repeat_max == -1)
5377 *code++ = OP_CRPLUS + repeat_type;
5378 else if (repeat_min == 0 && repeat_max == 1)
5379 *code++ = OP_CRQUERY + repeat_type;
5380 else
5381 {
5382 *code++ = OP_CRRANGE + repeat_type;
5383 PUT2INC(code, 0, repeat_min);
5384 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
5385 PUT2INC(code, 0, repeat_max);
5386 }
5387 }
5388
5389 /* If previous was a bracket group, we may have to replicate it in certain
5390 cases. Note that at this point we can encounter only the "basic" bracket
5391 opcodes such as BRA and CBRA, as this is the place where they get converted
5392 into the more special varieties such as BRAPOS and SBRA. A test for >=
5393 OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5394 ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow
5395 repetition of assertions, but now it does, for Perl compatibility. */
5396
5397 else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5398 {
5399 register int i;
5400 int len = (int)(code - previous);
5401 pcre_uchar *bralink = NULL;
5402 pcre_uchar *brazeroptr = NULL;
5403
5404 /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
5405 we just ignore the repeat. */
5406
5407 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
5408 goto END_REPEAT;
5409
5410 /* There is no sense in actually repeating assertions. The only potential
5411 use of repetition is in cases when the assertion is optional. Therefore,
5412 if the minimum is greater than zero, just ignore the repeat. If the
5413 maximum is not zero or one, set it to 1. */
5414
5415 if (*previous < OP_ONCE) /* Assertion */
5416 {
5417 if (repeat_min > 0) goto END_REPEAT;
5418 if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
5419 }
5420
5421 /* The case of a zero minimum is special because of the need to stick
5422 OP_BRAZERO in front of it, and because the group appears once in the
5423 data, whereas in other cases it appears the minimum number of times. For
5424 this reason, it is simplest to treat this case separately, as otherwise
5425 the code gets far too messy. There are several special subcases when the
5426 minimum is zero. */
5427
5428 if (repeat_min == 0)
5429 {
5430 /* If the maximum is also zero, we used to just omit the group from the
5431 output altogether, like this:
5432
5433 ** if (repeat_max == 0)
5434 ** {
5435 ** code = previous;
5436 ** goto END_REPEAT;
5437 ** }
5438
5439 However, that fails when a group or a subgroup within it is referenced
5440 as a subroutine from elsewhere in the pattern, so now we stick in
5441 OP_SKIPZERO in front of it so that it is skipped on execution. As we
5442 don't have a list of which groups are referenced, we cannot do this
5443 selectively.
5444
5445 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
5446 and do no more at this point. However, we do need to adjust any
5447 OP_RECURSE calls inside the group that refer to the group itself or any
5448 internal or forward referenced group, because the offset is from the
5449 start of the whole regex. Temporarily terminate the pattern while doing
5450 this. */
5451
5452 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
5453 {
5454 *code = OP_END;
5455 adjust_recurse(previous, 1, utf, cd, save_hwm);
5456 memmove(previous + 1, previous, IN_UCHARS(len));
5457 code++;
5458 if (repeat_max == 0)
5459 {
5460 *previous++ = OP_SKIPZERO;
5461 goto END_REPEAT;
5462 }
5463 brazeroptr = previous; /* Save for possessive optimizing */
5464 *previous++ = OP_BRAZERO + repeat_type;
5465 }
5466
5467 /* If the maximum is greater than 1 and limited, we have to replicate
5468 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
5469 The first one has to be handled carefully because it's the original
5470 copy, which has to be moved up. The remainder can be handled by code
5471 that is common with the non-zero minimum case below. We have to
5472 adjust the value of repeat_max, since one less copy is required. Once
5473 again, we may have to adjust any OP_RECURSE calls inside the group. */
5474
5475 else
5476 {
5477 int offset;
5478 *code = OP_END;
5479 adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);
5480 memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
5481 code += 2 + LINK_SIZE;
5482 *previous++ = OP_BRAZERO + repeat_type;
5483 *previous++ = OP_BRA;
5484
5485 /* We chain together the bracket offset fields that have to be
5486 filled in later when the ends of the brackets are reached. */
5487
5488 offset = (bralink == NULL)? 0 : (int)(previous - bralink);
5489 bralink = previous;
5490 PUTINC(previous, 0, offset);
5491 }
5492
5493 repeat_max--;
5494 }
5495
5496 /* If the minimum is greater than zero, replicate the group as many
5497 times as necessary, and adjust the maximum to the number of subsequent
5498 copies that we need. If we set a first char from the group, and didn't
5499 set a required char, copy the latter from the former. If there are any
5500 forward reference subroutine calls in the group, there will be entries on
5501 the workspace list; replicate these with an appropriate increment. */
5502
5503 else
5504 {
5505 if (repeat_min > 1)
5506 {
5507 /* In the pre-compile phase, we don't actually do the replication. We
5508 just adjust the length as if we had. Do some paranoid checks for
5509 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
5510 integer type when available, otherwise double. */
5511
5512 if (lengthptr != NULL)
5513 {
5514 int delta = (repeat_min - 1)*length_prevgroup;
5515 if ((INT64_OR_DOUBLE)(repeat_min - 1)*
5516 (INT64_OR_DOUBLE)length_prevgroup >
5517 (INT64_OR_DOUBLE)INT_MAX ||
5518 OFLOW_MAX - *lengthptr < delta)
5519 {
5520 *errorcodeptr = ERR20;
5521 goto FAILED;
5522 }
5523 *lengthptr += delta;
5524 }
5525
5526 /* This is compiling for real. If there is a set first byte for
5527 the group, and we have not yet set a "required byte", set it. Make
5528 sure there is enough workspace for copying forward references before
5529 doing the copy. */
5530
5531 else
5532 {
5533 if (groupsetfirstchar && reqcharflags < 0)
5534 {
5535 reqchar = firstchar;
5536 reqcharflags = firstcharflags;
5537 }
5538
5539 for (i = 1; i < repeat_min; i++)
5540 {
5541 pcre_uchar *hc;
5542 pcre_uchar *this_hwm = cd->hwm;
5543 memcpy(code, previous, IN_UCHARS(len));
5544
5545 while (cd->hwm > cd->start_workspace + cd->workspace_size -
5546 WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5547 {
5548 int save_offset = save_hwm - cd->start_workspace;
5549 int this_offset = this_hwm - cd->start_workspace;
5550 *errorcodeptr = expand_workspace(cd);
5551 if (*errorcodeptr != 0) goto FAILED;
5552 save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5553 this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5554 }
5555
5556 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5557 {
5558 PUT(cd->hwm, 0, GET(hc, 0) + len);
5559 cd->hwm += LINK_SIZE;
5560 }
5561 save_hwm = this_hwm;
5562 code += len;
5563 }
5564 }
5565 }
5566
5567 if (repeat_max > 0) repeat_max -= repeat_min;
5568 }
5569
5570 /* This code is common to both the zero and non-zero minimum cases. If
5571 the maximum is limited, it replicates the group in a nested fashion,
5572 remembering the bracket starts on a stack. In the case of a zero minimum,
5573 the first one was set up above. In all cases the repeat_max now specifies
5574 the number of additional copies needed. Again, we must remember to
5575 replicate entries on the forward reference list. */
5576
5577 if (repeat_max >= 0)
5578 {
5579 /* In the pre-compile phase, we don't actually do the replication. We
5580 just adjust the length as if we had. For each repetition we must add 1
5581 to the length for BRAZERO and for all but the last repetition we must
5582 add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
5583 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
5584 a 64-bit integer type when available, otherwise double. */
5585
5586 if (lengthptr != NULL && repeat_max > 0)
5587 {
5588 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
5589 2 - 2*LINK_SIZE; /* Last one doesn't nest */
5590 if ((INT64_OR_DOUBLE)repeat_max *
5591 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
5592 > (INT64_OR_DOUBLE)INT_MAX ||
5593 OFLOW_MAX - *lengthptr < delta)
5594 {
5595 *errorcodeptr = ERR20;
5596 goto FAILED;
5597 }
5598 *lengthptr += delta;
5599 }
5600
5601 /* This is compiling for real */
5602
5603 else for (i = repeat_max - 1; i >= 0; i--)
5604 {
5605 pcre_uchar *hc;
5606 pcre_uchar *this_hwm = cd->hwm;
5607
5608 *code++ = OP_BRAZERO + repeat_type;
5609
5610 /* All but the final copy start a new nesting, maintaining the
5611 chain of brackets outstanding. */
5612
5613 if (i != 0)
5614 {
5615 int offset;
5616 *code++ = OP_BRA;
5617 offset = (bralink == NULL)? 0 : (int)(code - bralink);
5618 bralink = code;
5619 PUTINC(code, 0, offset);
5620 }
5621
5622 memcpy(code, previous, IN_UCHARS(len));
5623
5624 /* Ensure there is enough workspace for forward references before
5625 copying them. */
5626
5627 while (cd->hwm > cd->start_workspace + cd->workspace_size -
5628 WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5629 {
5630 int save_offset = save_hwm - cd->start_workspace;
5631 int this_offset = this_hwm - cd->start_workspace;
5632 *errorcodeptr = expand_workspace(cd);
5633 if (*errorcodeptr != 0) goto FAILED;
5634 save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5635 this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5636 }
5637
5638 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5639 {
5640 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
5641 cd->hwm += LINK_SIZE;
5642 }
5643 save_hwm = this_hwm;
5644 code += len;
5645 }
5646
5647 /* Now chain through the pending brackets, and fill in their length
5648 fields (which are holding the chain links pro tem). */
5649
5650 while (bralink != NULL)
5651 {
5652 int oldlinkoffset;
5653 int offset = (int)(code - bralink + 1);
5654 pcre_uchar *bra = code - offset;
5655 oldlinkoffset = GET(bra, 1);
5656 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
5657 *code++ = OP_KET;
5658 PUTINC(code, 0, offset);
5659 PUT(bra, 1, offset);
5660 }
5661 }
5662
5663 /* If the maximum is unlimited, set a repeater in the final copy. For
5664 ONCE brackets, that's all we need to do. However, possessively repeated
5665 ONCE brackets can be converted into non-capturing brackets, as the
5666 behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
5667 deal with possessive ONCEs specially.
5668
5669 Otherwise, when we are doing the actual compile phase, check to see
5670 whether this group is one that could match an empty string. If so,
5671 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
5672 that runtime checking can be done. [This check is also applied to ONCE
5673 groups at runtime, but in a different way.]
5674
5675 Then, if the quantifier was possessive and the bracket is not a
5676 conditional, we convert the BRA code to the POS form, and the KET code to
5677 KETRPOS. (It turns out to be convenient at runtime to detect this kind of
5678 subpattern at both the start and at the end.) The use of special opcodes
5679 makes it possible to reduce greatly the stack usage in pcre_exec(). If
5680 the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
5681
5682 Then, if the minimum number of matches is 1 or 0, cancel the possessive
5683 flag so that the default action below, of wrapping everything inside
5684 atomic brackets, does not happen. When the minimum is greater than 1,
5685 there will be earlier copies of the group, and so we still have to wrap
5686 the whole thing. */
5687
5688 else
5689 {
5690 pcre_uchar *ketcode = code - 1 - LINK_SIZE;
5691 pcre_uchar *bracode = ketcode - GET(ketcode, 1);
5692
5693 /* Convert possessive ONCE brackets to non-capturing */
5694
5695 if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
5696 possessive_quantifier) *bracode = OP_BRA;
5697
5698 /* For non-possessive ONCE brackets, all we need to do is to
5699 set the KET. */
5700
5701 if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
5702 *ketcode = OP_KETRMAX + repeat_type;
5703
5704 /* Handle non-ONCE brackets and possessive ONCEs (which have been
5705 converted to non-capturing above). */
5706
5707 else
5708 {
5709 /* In the compile phase, check for empty string matching. */
5710
5711 if (lengthptr == NULL)
5712 {
5713 pcre_uchar *scode = bracode;
5714 do
5715 {
5716 if (could_be_empty_branch(scode, ketcode, utf, cd, NULL))
5717 {
5718 *bracode += OP_SBRA - OP_BRA;
5719 break;
5720 }
5721 scode += GET(scode, 1);
5722 }
5723 while (*scode == OP_ALT);
5724 }
5725
5726 /* Handle possessive quantifiers. */
5727
5728 if (possessive_quantifier)
5729 {
5730 /* For COND brackets, we wrap the whole thing in a possessively
5731 repeated non-capturing bracket, because we have not invented POS
5732 versions of the COND opcodes. Because we are moving code along, we
5733 must ensure that any pending recursive references are updated. */
5734
5735 if (*bracode == OP_COND || *bracode == OP_SCOND)
5736 {
5737 int nlen = (int)(code - bracode);
5738 *code = OP_END;
5739 adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm);
5740 memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
5741 code += 1 + LINK_SIZE;
5742 nlen += 1 + LINK_SIZE;
5743 *bracode = OP_BRAPOS;
5744 *code++ = OP_KETRPOS;
5745 PUTINC(code, 0, nlen);
5746 PUT(bracode, 1, nlen);
5747 }
5748
5749 /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
5750
5751 else
5752 {
5753 *bracode += 1; /* Switch to xxxPOS opcodes */
5754 *ketcode = OP_KETRPOS;
5755 }
5756
5757 /* If the minimum is zero, mark it as possessive, then unset the
5758 possessive flag when the minimum is 0 or 1. */
5759
5760 if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
5761 if (repeat_min < 2) possessive_quantifier = FALSE;
5762 }
5763
5764 /* Non-possessive quantifier */
5765
5766 else *ketcode = OP_KETRMAX + repeat_type;
5767 }
5768 }
5769 }
5770
5771 /* If previous is OP_FAIL, it was generated by an empty class [] in
5772 JavaScript mode. The other ways in which OP_FAIL can be generated, that is
5773 by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
5774 error above. We can just ignore the repeat in JS case. */
5775
5776 else if (*previous == OP_FAIL) goto END_REPEAT;
5777
5778 /* Else there's some kind of shambles */
5779
5780 else
5781 {
5782 *errorcodeptr = ERR11;
5783 goto FAILED;
5784 }
5785
5786 /* If the character following a repeat is '+', or if certain optimization
5787 tests above succeeded, possessive_quantifier is TRUE. For some opcodes,
5788 there are special alternative opcodes for this case. For anything else, we
5789 wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'
5790 notation is just syntactic sugar, taken from Sun's Java package, but the
5791 special opcodes can optimize it.
5792
5793 Some (but not all) possessively repeated subpatterns have already been
5794 completely handled in the code just above. For them, possessive_quantifier
5795 is always FALSE at this stage.
5796
5797 Note that the repeated item starts at tempcode, not at previous, which
5798 might be the first part of a string whose (former) last char we repeated.
5799
5800 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
5801 an 'upto' may follow. We skip over an 'exact' item, and then test the
5802 length of what remains before proceeding. */
5803
5804 if (possessive_quantifier)
5805 {
5806 int len;
5807
5808 if (*tempcode == OP_TYPEEXACT)
5809 tempcode += PRIV(OP_lengths)[*tempcode] +
5810 ((tempcode[1 + IMM2_SIZE] == OP_PROP
5811 || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
5812
5813 else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
5814 {
5815 tempcode += PRIV(OP_lengths)[*tempcode];
5816 #ifdef SUPPORT_UTF
5817 if (utf && HAS_EXTRALEN(tempcode[-1]))
5818 tempcode += GET_EXTRALEN(tempcode[-1]);
5819 #endif
5820 }
5821
5822 len = (int)(code - tempcode);
5823 if (len > 0) switch (*tempcode)
5824 {
5825 case OP_STAR: *tempcode = OP_POSSTAR; break;
5826 case OP_PLUS: *tempcode = OP_POSPLUS; break;
5827 case OP_QUERY: *tempcode = OP_POSQUERY; break;
5828 case OP_UPTO: *tempcode = OP_POSUPTO; break;
5829
5830 case OP_STARI: *tempcode = OP_POSSTARI; break;
5831 case OP_PLUSI: *tempcode = OP_POSPLUSI; break;
5832 case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
5833 case OP_UPTOI: *tempcode = OP_POSUPTOI; break;
5834
5835 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
5836 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
5837 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
5838 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
5839
5840 case OP_NOTSTARI: *tempcode = OP_NOTPOSSTARI; break;
5841 case OP_NOTPLUSI: *tempcode = OP_NOTPOSPLUSI; break;
5842 case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
5843 case OP_NOTUPTOI: *tempcode = OP_NOTPOSUPTOI; break;
5844
5845 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
5846 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
5847 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
5848 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
5849
5850 /* Because we are moving code along, we must ensure that any
5851 pending recursive references are updated. */
5852
5853 default:
5854 *code = OP_END;
5855 adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
5856 memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
5857 code += 1 + LINK_SIZE;
5858 len += 1 + LINK_SIZE;
5859 tempcode[0] = OP_ONCE;
5860 *code++ = OP_KET;
5861 PUTINC(code, 0, len);
5862 PUT(tempcode, 1, len);
5863 break;
5864 }
5865 }
5866
5867 /* In all cases we no longer have a previous item. We also set the
5868 "follows varying string" flag for subsequently encountered reqchars if
5869 it isn't already set and we have just passed a varying length item. */
5870
5871 END_REPEAT:
5872 previous = NULL;
5873 cd->req_varyopt |= reqvary;
5874 break;
5875
5876
5877 /* ===================================================================*/
5878 /* Start of nested parenthesized sub-expression, or comment or lookahead or
5879 lookbehind or option setting or condition or all the other extended
5880 parenthesis forms. */
5881
5882 case CHAR_LEFT_PARENTHESIS:
5883 newoptions = options;
5884 skipbytes = 0;
5885 bravalue = OP_CBRA;
5886 save_hwm = cd->hwm;
5887 reset_bracount = FALSE;
5888
5889 /* First deal with various "verbs" that can be introduced by '*'. */
5890
5891 ptr++;
5892 if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
5893 || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
5894 {
5895 int i, namelen;
5896 int arglen = 0;
5897 const char *vn = verbnames;
5898 const pcre_uchar *name = ptr + 1;
5899 const pcre_uchar *arg = NULL;
5900 previous = NULL;
5901 ptr++;
5902 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
5903 namelen = (int)(ptr - name);
5904
5905 /* It appears that Perl allows any characters whatsoever, other than
5906 a closing parenthesis, to appear in arguments, so we no longer insist on
5907 letters, digits, and underscores. */
5908
5909 if (*ptr == CHAR_COLON)
5910 {
5911 arg = ++ptr;
5912 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
5913 arglen = (int)(ptr - arg);
5914 if ((unsigned int)arglen > MAX_MARK)
5915 {
5916 *errorcodeptr = ERR75;
5917 goto FAILED;
5918 }
5919 }
5920
5921 if (*ptr != CHAR_RIGHT_PARENTHESIS)
5922 {
5923 *errorcodeptr = ERR60;
5924 goto FAILED;
5925 }
5926
5927 /* Scan the table of verb names */
5928
5929 for (i = 0; i < verbcount; i++)
5930 {
5931 if (namelen == verbs[i].len &&
5932 STRNCMP_UC_C8(name, vn, namelen) == 0)
5933 {
5934 int setverb;
5935
5936 /* Check for open captures before ACCEPT and convert it to
5937 ASSERT_ACCEPT if in an assertion. */
5938
5939 if (verbs[i].op == OP_ACCEPT)
5940 {
5941 open_capitem *oc;
5942 if (arglen != 0)
5943 {
5944 *errorcodeptr = ERR59;
5945 goto FAILED;
5946 }
5947 cd->had_accept = TRUE;
5948 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
5949 {
5950 *code++ = OP_CLOSE;
5951 PUT2INC(code, 0, oc->number);
5952 }
5953 setverb = *code++ =
5954 (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
5955
5956 /* Do not set firstchar after *ACCEPT */
5957 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5958 }
5959
5960 /* Handle other cases with/without an argument */
5961
5962 else if (arglen == 0)
5963 {
5964 if (verbs[i].op < 0) /* Argument is mandatory */
5965 {
5966 *errorcodeptr = ERR66;
5967 goto FAILED;
5968 }
5969 setverb = *code++ = verbs[i].op;
5970 }
5971
5972 else
5973 {
5974 if (verbs[i].op_arg < 0) /* Argument is forbidden */
5975 {
5976 *errorcodeptr = ERR59;
5977 goto FAILED;
5978 }
5979 setverb = *code++ = verbs[i].op_arg;
5980 *code++ = arglen;
5981 memcpy(code, arg, IN_UCHARS(arglen));
5982 code += arglen;
5983 *code++ = 0;
5984 }
5985
5986 switch (setverb)
5987 {
5988 case OP_THEN:
5989 case OP_THEN_ARG:
5990 cd->external_flags |= PCRE_HASTHEN;
5991 break;
5992
5993 case OP_PRUNE:
5994 case OP_PRUNE_ARG:
5995 case OP_SKIP:
5996 case OP_SKIP_ARG:
5997 cd->had_pruneorskip = TRUE;
5998 break;
5999 }
6000
6001 break; /* Found verb, exit loop */
6002 }
6003
6004 vn += verbs[i].len + 1;
6005 }
6006
6007 if (i < verbcount) continue; /* Successfully handled a verb */
6008 *errorcodeptr = ERR60; /* Verb not recognized */
6009 goto FAILED;
6010 }
6011
6012 /* Deal with the extended parentheses; all are introduced by '?', and the
6013 appearance of any of them means that this is not a capturing group. */
6014
6015 else if (*ptr == CHAR_QUESTION_MARK)
6016 {
6017 int i, set, unset, namelen;
6018 int *optset;
6019 const pcre_uchar *name;
6020 pcre_uchar *slot;
6021
6022 switch (*(++ptr))
6023 {
6024 case CHAR_NUMBER_SIGN: /* Comment; skip to ket */
6025 ptr++;
6026 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6027 if (*ptr == CHAR_NULL)
6028 {
6029 *errorcodeptr = ERR18;
6030 goto FAILED;
6031 }
6032 continue;
6033
6034
6035 /* ------------------------------------------------------------ */
6036 case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
6037 reset_bracount = TRUE;
6038 /* Fall through */
6039
6040 /* ------------------------------------------------------------ */
6041 case CHAR_COLON: /* Non-capturing bracket */
6042 bravalue = OP_BRA;
6043 ptr++;
6044 break;
6045
6046
6047 /* ------------------------------------------------------------ */
6048 case CHAR_LEFT_PARENTHESIS:
6049 bravalue = OP_COND; /* Conditional group */
6050 tempptr = ptr;
6051
6052 /* A condition can be an assertion, a number (referring to a numbered
6053 group), a name (referring to a named group), or 'R', referring to
6054 recursion. R<digits> and R&name are also permitted for recursion tests.
6055
6056 There are several syntaxes for testing a named group: (?(name)) is used
6057 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
6058
6059 There are two unfortunate ambiguities, caused by history. (a) 'R' can
6060 be the recursive thing or the name 'R' (and similarly for 'R' followed
6061 by digits), and (b) a number could be a name that consists of digits.
6062 In both cases, we look for a name first; if not found, we try the other
6063 cases.
6064
6065 For compatibility with auto-callouts, we allow a callout to be
6066 specified before a condition that is an assertion. First, check for the
6067 syntax of a callout; if found, adjust the temporary pointer that is
6068 used to check for an assertion condition. That's all that is needed! */
6069
6070 if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
6071 {
6072 for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
6073 if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
6074 tempptr += i + 1;
6075 }
6076
6077 /* For conditions that are assertions, check the syntax, and then exit
6078 the switch. This will take control down to where bracketed groups,
6079 including assertions, are processed. */
6080
6081 if (tempptr[1] == CHAR_QUESTION_MARK &&
6082 (tempptr[2] == CHAR_EQUALS_SIGN ||
6083 tempptr[2] == CHAR_EXCLAMATION_MARK ||
6084 tempptr[2] == CHAR_LESS_THAN_SIGN))
6085 break;
6086
6087 /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6088 need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6089
6090 code[1+LINK_SIZE] = OP_CREF;
6091 skipbytes = 1+IMM2_SIZE;
6092 refsign = -1;
6093
6094 /* Check for a test for recursion in a named group. */
6095
6096 if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
6097 {
6098 terminator = -1;
6099 ptr += 2;
6100 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
6101 }
6102
6103 /* Check for a test for a named group's having been set, using the Perl
6104 syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6105 syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). As names may
6106 consist entirely of digits, there is scope for ambiguity. */
6107
6108 else if (ptr[1] == CHAR_LESS_THAN_SIGN)
6109 {
6110 terminator = CHAR_GREATER_THAN_SIGN;
6111 ptr++;
6112 }
6113 else if (ptr[1] == CHAR_APOSTROPHE)
6114 {
6115 terminator = CHAR_APOSTROPHE;
6116 ptr++;
6117 }
6118 else
6119 {
6120 terminator = CHAR_NULL;
6121 if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
6122 }
6123
6124 /* When a name is one of a number of duplicates, a different opcode is
6125 used and it needs more memory. Unfortunately we cannot tell whether a
6126 name is a duplicate in the first pass, so we have to allow for more
6127 memory except when we know it is a relative numerical reference. */
6128
6129 if (refsign < 0 && lengthptr != NULL) *lengthptr += IMM2_SIZE;
6130
6131 /* We now expect to read a name (possibly all digits); anything else
6132 is an error. In the case of all digits, also get it as a number. */
6133
6134 if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)
6135 {
6136 ptr += 1; /* To get the right offset */
6137 *errorcodeptr = ERR28;
6138 goto FAILED;
6139 }
6140
6141 recno = 0;
6142 name = ++ptr;
6143 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
6144 {
6145 if (recno >= 0)
6146 recno = (IS_DIGIT(*ptr))? recno * 10 + (int)(*ptr - CHAR_0) : -1;
6147 ptr++;
6148 }
6149 namelen = (int)(ptr - name);
6150
6151 /* Check the terminator */
6152
6153 if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6154 *ptr++ != CHAR_RIGHT_PARENTHESIS)
6155 {
6156 ptr--; /* Error offset */
6157 *errorcodeptr = ERR26;
6158 goto FAILED;
6159 }
6160
6161 /* Do no further checking in the pre-compile phase. */
6162
6163 if (lengthptr != NULL) break;
6164
6165 /* In the real compile we do the work of looking for the actual
6166 reference. If the string started with "+" or "-" we require the rest to
6167 be digits, in which case recno will be set. */
6168
6169 if (refsign > 0)
6170 {
6171 if (recno <= 0)
6172 {
6173 *errorcodeptr = ERR58;
6174 goto FAILED;
6175 }
6176 recno = (refsign == CHAR_MINUS)?
6177 cd->bracount - recno + 1 : recno +cd->bracount;
6178 if (recno <= 0 || recno > cd->final_bracount)
6179 {
6180 *errorcodeptr = ERR15;
6181 goto FAILED;
6182 }
6183 PUT2(code, 2+LINK_SIZE, recno);
6184 break;
6185 }
6186
6187 /* Otherwise (did not start with "+" or "-"), start by looking for the
6188 name. */
6189
6190 slot = cd->name_table;
6191 for (i = 0; i < cd->names_found; i++)
6192 {
6193 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
6194 slot += cd->name_entry_size;
6195 }
6196
6197 /* Found the named subpattern. If the name is duplicated, add one to
6198 the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6199 appropriate data values. Otherwise, just insert the unique subpattern
6200 number. */
6201
6202 if (i < cd->names_found)
6203 {
6204 int offset = i++;
6205 int count = 1;
6206 recno = GET2(slot, 0); /* Number from first found */
6207 for (; i < cd->names_found; i++)
6208 {
6209 slot += cd->name_entry_size;
6210 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0) break;
6211 count++;
6212 }
6213 if (count > 1)
6214 {
6215 PUT2(code, 2+LINK_SIZE, offset);
6216 PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6217 skipbytes += IMM2_SIZE;
6218 code[1+LINK_SIZE]++;
6219 }
6220 else /* Not a duplicated name */
6221 {
6222 PUT2(code, 2+LINK_SIZE, recno);
6223 }
6224 }
6225
6226 /* If terminator == CHAR_NULL it means that the name followed directly
6227 after the opening parenthesis [e.g. (?(abc)...] and in this case there
6228 are some further alternatives to try. For the cases where terminator !=
6229 0 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
6230 now checked all the possibilities, so give an error. */
6231
6232 else if (terminator != CHAR_NULL)
6233 {
6234 *errorcodeptr = ERR15;
6235 goto FAILED;
6236 }
6237
6238 /* Check for (?(R) for recursion. Allow digits after R to specify a
6239 specific group number. */
6240
6241 else if (*name == CHAR_R)
6242 {
6243 recno = 0;
6244 for (i = 1; i < namelen; i++)
6245 {
6246 if (!IS_DIGIT(name[i]))
6247 {
6248 *errorcodeptr = ERR15;
6249 goto FAILED;
6250 }
6251 recno = recno * 10 + name[i] - CHAR_0;
6252 }
6253 if (recno == 0) recno = RREF_ANY;
6254 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
6255 PUT2(code, 2+LINK_SIZE, recno);
6256 }
6257
6258 /* Similarly, check for the (?(DEFINE) "condition", which is always
6259 false. */
6260
6261 else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
6262 {
6263 code[1+LINK_SIZE] = OP_DEF;
6264 skipbytes = 1;
6265 }
6266
6267 /* Check for the "name" actually being a subpattern number. We are
6268 in the second pass here, so final_bracount is set. */
6269
6270 else if (recno > 0 && recno <= cd->final_bracount)
6271 {
6272 PUT2(code, 2+LINK_SIZE, recno);
6273 }
6274
6275 /* Either an unidentified subpattern, or a reference to (?(0) */
6276
6277 else
6278 {
6279 *errorcodeptr = (recno == 0)? ERR35: ERR15;
6280 goto FAILED;
6281 }
6282 break;
6283
6284
6285 /* ------------------------------------------------------------ */
6286 case CHAR_EQUALS_SIGN: /* Positive lookahead */
6287 bravalue = OP_ASSERT;
6288 cd->assert_depth += 1;
6289 ptr++;
6290 break;
6291
6292
6293 /* ------------------------------------------------------------ */
6294 case CHAR_EXCLAMATION_MARK: /* Negative lookahead */
6295 ptr++;
6296 if (*ptr == CHAR_RIGHT_PARENTHESIS) /* Optimize (?!) */
6297 {
6298 *code++ = OP_FAIL;
6299 previous = NULL;
6300 continue;
6301 }
6302 bravalue = OP_ASSERT_NOT;
6303 cd->assert_depth += 1;
6304 break;
6305
6306
6307 /* ------------------------------------------------------------ */
6308 case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */
6309 switch (ptr[1])
6310 {
6311 case CHAR_EQUALS_SIGN: /* Positive lookbehind */
6312 bravalue = OP_ASSERTBACK;
6313 cd->assert_depth += 1;
6314 ptr += 2;
6315 break;
6316
6317 case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */
6318 bravalue = OP_ASSERTBACK_NOT;
6319 cd->assert_depth += 1;
6320 ptr += 2;
6321 break;
6322
6323 default: /* Could be name define, else bad */
6324 if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
6325 goto DEFINE_NAME;
6326 ptr++; /* Correct offset for error */
6327 *errorcodeptr = ERR24;
6328 goto FAILED;
6329 }
6330 break;
6331
6332
6333 /* ------------------------------------------------------------ */
6334 case CHAR_GREATER_THAN_SIGN: /* One-time brackets */
6335 bravalue = OP_ONCE;
6336 ptr++;
6337 break;
6338
6339
6340 /* ------------------------------------------------------------ */
6341 case CHAR_C: /* Callout - may be followed by digits; */
6342 previous_callout = code; /* Save for later completion */
6343 after_manual_callout = 1; /* Skip one item before completing */
6344 *code++ = OP_CALLOUT;
6345 {
6346 int n = 0;
6347 ptr++;
6348 while(IS_DIGIT(*ptr))
6349 n = n * 10 + *ptr++ - CHAR_0;
6350 if (*ptr != CHAR_RIGHT_PARENTHESIS)
6351 {
6352 *errorcodeptr = ERR39;
6353 goto FAILED;
6354 }
6355 if (n > 255)
6356 {
6357 *errorcodeptr = ERR38;
6358 goto FAILED;
6359 }
6360 *code++ = n;
6361 PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
6362 PUT(code, LINK_SIZE, 0); /* Default length */
6363 code += 2 * LINK_SIZE;
6364 }
6365 previous = NULL;
6366 continue;
6367
6368
6369 /* ------------------------------------------------------------ */
6370 case CHAR_P: /* Python-style named subpattern handling */
6371 if (*(++ptr) == CHAR_EQUALS_SIGN ||
6372 *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */
6373 {
6374 is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
6375 terminator = CHAR_RIGHT_PARENTHESIS;
6376 goto NAMED_REF_OR_RECURSE;
6377 }
6378 else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */
6379 {
6380 *errorcodeptr = ERR41;
6381 goto FAILED;
6382 }
6383 /* Fall through to handle (?P< as (?< is handled */
6384
6385
6386 /* ------------------------------------------------------------ */
6387 DEFINE_NAME: /* Come here from (?< handling */
6388 case CHAR_APOSTROPHE:
6389 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
6390 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6391 name = ++ptr;
6392
6393 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6394 namelen = (int)(ptr - name);
6395
6396 /* In the pre-compile phase, do a syntax check, remember the longest
6397 name, and then remember the group in a vector, expanding it if
6398 necessary. Duplicates for the same number are skipped; other duplicates
6399 are checked for validity. In the actual compile, there is nothing to
6400 do. */
6401
6402 if (lengthptr != NULL)
6403 {
6404 named_group *ng;
6405 pcre_uint32 number = cd->bracount + 1;
6406
6407 if (*ptr != (pcre_uchar)terminator)
6408 {
6409 *errorcodeptr = ERR42;
6410 goto FAILED;
6411 }
6412
6413 if (cd->names_found >= MAX_NAME_COUNT)
6414 {
6415 *errorcodeptr = ERR49;
6416 goto FAILED;
6417 }
6418
6419 if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
6420 {
6421 cd->name_entry_size = namelen + IMM2_SIZE + 1;
6422 if (namelen > MAX_NAME_SIZE)
6423 {
6424 *errorcodeptr = ERR48;
6425 goto FAILED;
6426 }
6427 }
6428
6429 /* Scan the list to check for duplicates. For duplicate names, if the
6430 number is the same, break the loop, which causes the name to be
6431 discarded; otherwise, if DUPNAMES is not set, give an error.
6432 If it is set, allow the name with a different number, but continue
6433 scanning in case this is a duplicate with the same number. For
6434 non-duplicate names, give an error if the number is duplicated. */
6435
6436 ng = cd->named_groups;
6437 for (i = 0; i < cd->names_found; i++, ng++)
6438 {
6439 if (namelen == ng->length &&
6440 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
6441 {
6442 if (ng->number == number) break;
6443 if ((options & PCRE_DUPNAMES) == 0)
6444 {
6445 *errorcodeptr = ERR43;
6446 goto FAILED;
6447 }
6448 cd->dupnames = TRUE; /* Duplicate names exist */
6449 }
6450 else if (ng->number == number)
6451 {
6452 *errorcodeptr = ERR65;
6453 goto FAILED;
6454 }
6455 }
6456
6457 if (i >= cd->names_found) /* Not a duplicate with same number */
6458 {
6459 /* Increase the list size if necessary */
6460
6461 if (cd->names_found >= cd->named_group_list_size)
6462 {
6463 int newsize = cd->named_group_list_size * 2;
6464 named_group *newspace = (PUBL(malloc))
6465 (newsize * sizeof(named_group));
6466
6467 if (newspace == NULL)
6468 {
6469 *errorcodeptr = ERR21;
6470 goto FAILED;
6471 }
6472
6473 memcpy(newspace, cd->named_groups,
6474 cd->named_group_list_size * sizeof(named_group));
6475 if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
6476 (PUBL(free))((void *)cd->named_groups);
6477 cd->named_groups = newspace;
6478 cd->named_group_list_size = newsize;
6479 }
6480
6481 cd->named_groups[cd->names_found].name = name;
6482 cd->named_groups[cd->names_found].length = namelen;
6483 cd->named_groups[cd->names_found].number = number;
6484 cd->names_found++;
6485 }
6486 }
6487
6488 ptr++; /* Move past > or ' in both passes. */
6489 goto NUMBERED_GROUP;
6490
6491
6492 /* ------------------------------------------------------------ */
6493 case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */
6494 terminator = CHAR_RIGHT_PARENTHESIS;
6495 is_recurse = TRUE;
6496 /* Fall through */
6497
6498 /* We come here from the Python syntax above that handles both
6499 references (?P=name) and recursion (?P>name), as well as falling
6500 through from the Perl recursion syntax (?&name). We also come here from
6501 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
6502 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
6503
6504 NAMED_REF_OR_RECURSE:
6505 name = ++ptr;
6506 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6507 namelen = (int)(ptr - name);
6508
6509 /* In the pre-compile phase, do a syntax check. We used to just set
6510 a dummy reference number, because it was not used in the first pass.
6511 However, with the change of recursive back references to be atomic,
6512 we have to look for the number so that this state can be identified, as
6513 otherwise the incorrect length is computed. If it's not a backwards
6514 reference, the dummy number will do. */
6515
6516 if (lengthptr != NULL)
6517 {
6518 named_group *ng;
6519
6520 if (namelen == 0)
6521 {
6522 *errorcodeptr = ERR62;
6523 goto FAILED;
6524 }
6525 if (*ptr != (pcre_uchar)terminator)
6526 {
6527 *errorcodeptr = ERR42;
6528 goto FAILED;
6529 }
6530 if (namelen > MAX_NAME_SIZE)
6531 {
6532 *errorcodeptr = ERR48;
6533 goto FAILED;
6534 }
6535
6536 /* The name table does not exist in the first pass; instead we must
6537 scan the list of names encountered so far in order to get the
6538 number. If the name is not found, set the value to 0 for a forward
6539 reference. */
6540
6541 ng = cd->named_groups;
6542 for (i = 0; i < cd->names_found; i++, ng++)
6543 {
6544 if (namelen == ng->length &&
6545 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
6546 break;
6547 }
6548 recno = (i < cd->names_found)? ng->number : 0;
6549
6550 /* Count named back references. */
6551
6552 if (!is_recurse) cd->namedrefcount++;
6553 }
6554
6555 /* In the real compile, search the name table. We check the name
6556 first, and then check that we have reached the end of the name in the
6557 table. That way, if the name is longer than any in the table, the
6558 comparison will fail without reading beyond the table entry. */
6559
6560 else
6561 {
6562 slot = cd->name_table;
6563 for (i = 0; i < cd->names_found; i++)
6564 {
6565 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
6566 slot[IMM2_SIZE+namelen] == 0)
6567 break;
6568 slot += cd->name_entry_size;
6569 }
6570
6571 if (i < cd->names_found)
6572 {
6573 recno = GET2(slot, 0);
6574 }
6575 else
6576 {
6577 *errorcodeptr = ERR15;
6578 goto FAILED;
6579 }
6580 }
6581
6582 /* In both phases, for recursions, we can now go to the code that
6583 handles numerical recursion. */
6584
6585 if (is_recurse) goto HANDLE_RECURSION;
6586
6587 /* In the second pass we must see if the name is duplicated. If so, we
6588 generate a different opcode. */
6589
6590 if (lengthptr == NULL && cd->dupnames)
6591 {
6592 int count = 1;
6593 unsigned int index = i;
6594 pcre_uchar *cslot = slot + cd->name_entry_size;
6595
6596 for (i++; i < cd->names_found; i++)
6597 {
6598 if (STRCMP_UC_UC(slot + IMM2_SIZE, cslot + IMM2_SIZE) != 0) break;
6599 count++;
6600 cslot += cd->name_entry_size;
6601 }
6602
6603 if (count > 1)
6604 {
6605 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
6606 previous = code;
6607 *code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
6608 PUT2INC(code, 0, index);
6609 PUT2INC(code, 0, count);
6610
6611 /* Process each potentially referenced group. */
6612
6613 for (; slot < cslot; slot += cd->name_entry_size)
6614 {
6615 open_capitem *oc;
6616 recno = GET2(slot, 0);
6617 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
6618 if (recno > cd->top_backref) cd->top_backref = recno;
6619
6620 /* Check to see if this back reference is recursive, that is, it
6621 is inside the group that it references. A flag is set so that the
6622 group can be made atomic. */
6623
6624 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6625 {
6626 if (oc->number == recno)
6627 {
6628 oc->flag = TRUE;
6629 break;
6630 }
6631 }
6632 }
6633
6634 continue; /* End of back ref handling */
6635 }
6636 }
6637
6638 /* First pass, or a non-duplicated name. */
6639
6640 goto HANDLE_REFERENCE;
6641
6642
6643 /* ------------------------------------------------------------ */
6644 case CHAR_R: /* Recursion */
6645 ptr++; /* Same as (?0) */
6646 /* Fall through */
6647
6648
6649 /* ------------------------