/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1370 - (show annotations)
Wed Oct 9 10:18:26 2013 UTC (6 years, 3 months ago) by ph10
File MIME type: text/plain
File size: 293651 byte(s)
Add \o{} and tidy up \x{} handling. Minor update to RunTest.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2013 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
57 is also used by pcretest. PCRE_DEBUG is not defined when building a production
58 library. We do not need to select pcre16_printint.c specially, because the
59 COMPILE_PCREx macro will already be appropriately set. */
60
61 #ifdef PCRE_DEBUG
62 /* pcre_printint.c should not include any headers */
63 #define PCRE_INCLUDED
64 #include "pcre_printint.c"
65 #undef PCRE_INCLUDED
66 #endif
67
68
/* Macro for setting individual bits in class bitmaps. Bit b lives in byte
(b)/8 of the map a, at position (b)&7 within that byte. Both arguments and the
whole expansion are parenthesized so that the macro also works when a is a
pointer expression, as in SETBIT(map + 32, c), and when it is used inside a
larger expression. Note that b is evaluated twice, so it must not have side
effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)&7)))
72
73 /* Maximum length value to check against when making sure that the integer that
74 holds the compiled pattern length does not overflow. We make it a bit less than
75 INT_MAX to allow for adding in group terminating bytes, so that we don't have
76 to check them every time. */
77
78 #define OFLOW_MAX (INT_MAX - 20)
79
80 /* Definitions to allow mutual recursion */
81
82 static int
83 add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
84 const pcre_uint32 *, unsigned int);
85
86 static BOOL
87 compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
88 pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
89 compile_data *, int *);
90
91
92
93 /*************************************************
94 * Code parameters and static tables *
95 *************************************************/
96
97 /* This value specifies the size of stack workspace that is used during the
98 first pre-compile phase that determines how much memory is required. The regex
99 is partly compiled into this space, but the compiled parts are discarded as
100 soon as they can be, so that hopefully there will never be an overrun. The code
101 does, however, check for an overrun. The largest amount I've seen used is 218,
102 so this number is very generous.
103
104 The same workspace is used during the second, actual compile phase for
105 remembering forward references to groups so that they can be filled in at the
106 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
107 is 4 there is plenty of room for most patterns. However, the memory can get
108 filled up by repetitions of forward references, for example patterns like
109 /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
110 that the workspace is expanded using malloc() in this situation. The value
111 below is therefore a minimum, and we put a maximum on it for safety. The
112 minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
113 kicks in at the same number of forward references in all cases. */
114
115 #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
116 #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
117
118 /* This value determines the size of the initial vector that is used for
119 remembering named groups during the pre-compile. It is allocated on the stack,
120 but if it is too small, it is expanded using malloc(), in a similar way to the
121 workspace. The value is the number of slots in the list. */
122
123 #define NAMED_GROUP_LIST_SIZE 20
124
125 /* The overrun tests check for a slightly smaller size so that they detect the
126 overrun before it actually does run off the end of the data block. */
127
128 #define WORK_SIZE_SAFETY_MARGIN (100)
129
130 /* Private flags added to firstchar and reqchar. */
131
132 #define REQ_CASELESS (1 << 0) /* Indicates caselessness */
133 #define REQ_VARY (1 << 1) /* Reqchar followed non-literal item */
134 /* Negative values for the firstchar and reqchar flags */
135 #define REQ_UNSET (-2)
136 #define REQ_NONE (-1)
137
/* Repeated character flags. The constant is written with an upper-case L
suffix so that the long suffix cannot be misread as the digit 1; the value and
type are unchanged. */

#define UTF_LENGTH 0x10000000L /* The char contains its length. */
141
142 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
143 are simple data values; negative values are for special things like \d and so
144 on. Zero means further processing is needed (for things like \x), or the escape
145 is invalid. */
146
147 #ifndef EBCDIC
148
149 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
150 in UTF-8 mode. Entries are in character-code order, two per row, from '0' to 'z'; the comment on each row names the two characters it covers. */
151
152 static const short int escapes[] = {
153 0, 0, /* 0 1 */
154 0, 0, /* 2 3 */
155 0, 0, /* 4 5 */
156 0, 0, /* 6 7 */
157 0, 0, /* 8 9 */
158 CHAR_COLON, CHAR_SEMICOLON, /* : ; */
159 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, /* < = */
160 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK, /* > ? */
161 CHAR_COMMERCIAL_AT, -ESC_A, /* @ A */
162 -ESC_B, -ESC_C, /* B C */
163 -ESC_D, -ESC_E, /* D E */
164 0, -ESC_G, /* F G */
165 -ESC_H, 0, /* H I */
166 0, -ESC_K, /* J K */
167 0, 0, /* L M */
168 -ESC_N, 0, /* N O */
169 -ESC_P, -ESC_Q, /* P Q */
170 -ESC_R, -ESC_S, /* R S */
171 0, 0, /* T U */
172 -ESC_V, -ESC_W, /* V W */
173 -ESC_X, 0, /* X Y */
174 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, /* Z [ */
175 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, /* \ ] */
176 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, /* ^ _ */
177 CHAR_GRAVE_ACCENT, 7, /* ` a (data value 7) */
178 -ESC_b, 0, /* b c */
179 -ESC_d, ESC_e, /* d e */
180 ESC_f, 0, /* f g */
181 -ESC_h, 0, /* h i */
182 0, -ESC_k, /* j k */
183 0, 0, /* l m */
184 ESC_n, 0, /* n o */
185 -ESC_p, 0, /* p q */
186 ESC_r, -ESC_s, /* r s */
187 ESC_tee, 0, /* t u */
188 -ESC_v, -ESC_w, /* v w */
189 0, 0, /* x y */
190 -ESC_z /* z */
191 };
192
193 #else
194
195 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. The comment at the start of each row is the hexadecimal code of its first entry; there are eight entries per row. */
196
197 static const short int escapes[] = {
198 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
199 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
200 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
201 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
202 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
203 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
204 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
205 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
206 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
207 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
208 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
209 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
210 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
211 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
212 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
213 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
214 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
215 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
216 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
217 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
218 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
219 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
220 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
221 };
222 #endif
223
224
225 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
226 searched linearly. Put all the names into a single string, in order to reduce
227 the number of relocations when a shared library is dynamically linked. The
228 string is built from string macros so that it works in UTF-8 mode on EBCDIC
229 platforms. */
230
231 typedef struct verbitem { /* Describes one (*VERB); entries in verbs[] pair up with names in verbnames */
232 int len; /* Length of verb name */
233 int op; /* Op when no arg, or -1 if arg mandatory */
234 int op_arg; /* Op when arg present, or -1 if not allowed */
235 } verbitem;
236
237 static const char verbnames[] = /* All names in one string, in verbs[] order. NOTE(review): assumes each STRING_xxx0 macro (defined elsewhere) ends its name with a NUL - confirm */
238 "\0" /* Empty name is a shorthand for MARK */
239 STRING_MARK0
240 STRING_ACCEPT0
241 STRING_COMMIT0
242 STRING_F0
243 STRING_FAIL0
244 STRING_PRUNE0
245 STRING_SKIP0
246 STRING_THEN;
247
248 static const verbitem verbs[] = { /* { len, op, op_arg } - same order as verbnames */
249 { 0, -1, OP_MARK }, /* empty name: argument mandatory */
250 { 4, -1, OP_MARK }, /* MARK: argument mandatory */
251 { 6, OP_ACCEPT, -1 }, /* ACCEPT: no argument allowed */
252 { 6, OP_COMMIT, -1 }, /* COMMIT: no argument allowed */
253 { 1, OP_FAIL, -1 }, /* F: no argument allowed */
254 { 4, OP_FAIL, -1 }, /* FAIL: no argument allowed */
255 { 5, OP_PRUNE, OP_PRUNE_ARG }, /* PRUNE: argument optional */
256 { 4, OP_SKIP, OP_SKIP_ARG }, /* SKIP: argument optional */
257 { 4, OP_THEN, OP_THEN_ARG } /* THEN: argument optional */
258 };
259
260 static const int verbcount = sizeof(verbs)/sizeof(verbitem); /* Number of entries in verbs[] */
261
262
263 /* Tables of names of POSIX character classes and their lengths. The names are
264 now all in a single string, to reduce the number of relocations when a shared
265 library is dynamically loaded. The list of lengths is terminated by a zero
266 length entry. The first three must be alpha, lower, upper, as this is assumed
267 for handling case independence. */
268
269 static const char posix_names[] = /* 14 class names in one string, in the same order as posix_name_lengths[] */
270 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
271 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
272 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
273 STRING_word0 STRING_xdigit;
274
275 static const pcre_uint8 posix_name_lengths[] = { /* one length per name above, then a zero terminator */
276 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; /* twelve 5-letter names, then word (4), xdigit (6), 0 = end of list */
277
278 /* Table of class bit maps for each POSIX class. Each class is formed from a
279 base map, with an optional addition or removal of another map. Then, for some
280 classes, there is some additional tweaking: for [:blank:] the vertical space
281 characters are removed, and for [:alpha:] and [:alnum:] the underscore
282 character is removed. The triples in the table consist of the base map offset,
283 second map offset or -1 if no second map, and a non-negative value for map
284 addition or a negative value for map subtraction (if there are two maps). The
285 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
286 remove vertical space characters, 2 => remove underscore. */
287
288 static const int posix_class_maps[] = { /* triples: base map, second map or -1, tweak code (sign selects add/subtract) */
289 cbit_word, cbit_digit, -2, /* alpha - word minus digit, underscore removed */
290 cbit_lower, -1, 0, /* lower - base map only */
291 cbit_upper, -1, 0, /* upper - base map only */
292 cbit_word, -1, 2, /* alnum - word without underscore */
293 cbit_print, cbit_cntrl, 0, /* ascii - print plus cntrl */
294 cbit_space, -1, 1, /* blank - space without vertical space; a GNU extension */
295 cbit_cntrl, -1, 0, /* cntrl - base map only */
296 cbit_digit, -1, 0, /* digit - base map only */
297 cbit_graph, -1, 0, /* graph - base map only */
298 cbit_print, -1, 0, /* print - base map only */
299 cbit_punct, -1, 0, /* punct - base map only */
300 cbit_space, -1, 0, /* space - base map only */
301 cbit_word, -1, 0, /* word - a Perl extension */
302 cbit_xdigit,-1, 0 /* xdigit - base map only */
303 };
304
305 /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
306 substitutes must be in the order of the names, defined above, and there are
307 both positive and negative cases. NULL means no substitute. */
308
309 #ifdef SUPPORT_UCP
310 static const pcre_uchar string_PNd[] = { /* spells \P{Nd} */
311 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
312 CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
313 static const pcre_uchar string_pNd[] = { /* spells \p{Nd} */
314 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
315 CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
316 static const pcre_uchar string_PXsp[] = { /* spells \P{Xsp} */
317 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
318 CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
319 static const pcre_uchar string_pXsp[] = { /* spells \p{Xsp} */
320 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
321 CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
322 static const pcre_uchar string_PXwd[] = { /* spells \P{Xwd} */
323 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
324 CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
325 static const pcre_uchar string_pXwd[] = { /* spells \p{Xwd} */
326 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
327 CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
328
329 static const pcre_uchar *substitutes[] = { /* pairs ordered negated escape first, then the positive one */
330 string_PNd, /* \D */
331 string_pNd, /* \d */
332 string_PXsp, /* \S */ /* NOTE: Xsp is Perl space */
333 string_pXsp, /* \s */
334 string_PXwd, /* \W */
335 string_pXwd /* \w */
336 };
337
338 static const pcre_uchar string_pL[] = { /* spells \p{L} */
339 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
340 CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
341 static const pcre_uchar string_pLl[] = { /* spells \p{Ll} */
342 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
343 CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
344 static const pcre_uchar string_pLu[] = { /* spells \p{Lu} */
345 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
346 CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
347 static const pcre_uchar string_pXan[] = { /* spells \p{Xan} */
348 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
349 CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
350 static const pcre_uchar string_h[] = { /* spells \h */
351 CHAR_BACKSLASH, CHAR_h, '\0' };
352 static const pcre_uchar string_pXps[] = { /* spells \p{Xps} */
353 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
354 CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
355 static const pcre_uchar string_PL[] = { /* spells \P{L} */
356 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
357 CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
358 static const pcre_uchar string_PLl[] = { /* spells \P{Ll} */
359 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
360 CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
361 static const pcre_uchar string_PLu[] = { /* spells \P{Lu} */
362 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
363 CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
364 static const pcre_uchar string_PXan[] = { /* spells \P{Xan} */
365 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
366 CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
367 static const pcre_uchar string_H[] = { /* spells \H */
368 CHAR_BACKSLASH, CHAR_H, '\0' };
369 static const pcre_uchar string_PXps[] = { /* spells \P{Xps} */
370 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
371 CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
372
373 static const pcre_uchar *posix_substitutes[] = { /* 14 positive entries in POSIX name order, then the 14 negated ones */
374 string_pL, /* alpha */
375 string_pLl, /* lower */
376 string_pLu, /* upper */
377 string_pXan, /* alnum */
378 NULL, /* ascii */
379 string_h, /* blank */
380 NULL, /* cntrl */
381 string_pNd, /* digit */
382 NULL, /* graph */
383 NULL, /* print */
384 NULL, /* punct */
385 string_pXps, /* space */ /* NOTE: Xps is POSIX space */
386 string_pXwd, /* word */
387 NULL, /* xdigit */
388 /* Negated cases */
389 string_PL, /* ^alpha */
390 string_PLl, /* ^lower */
391 string_PLu, /* ^upper */
392 string_PXan, /* ^alnum */
393 NULL, /* ^ascii */
394 string_H, /* ^blank */
395 NULL, /* ^cntrl */
396 string_PNd, /* ^digit */
397 NULL, /* ^graph */
398 NULL, /* ^print */
399 NULL, /* ^punct */
400 string_PXps, /* ^space */ /* NOTE: Xps is POSIX space */
401 string_PXwd, /* ^word */
402 NULL /* ^xdigit */
403 };
404 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *)) /* Number of entries in posix_substitutes[] */
405 #endif
406
407 #define STRING(a) # a /* Turns the argument, exactly as written, into a string literal */
408 #define XSTRING(s) STRING(s) /* Macro-expands s first, then stringifies the result (used for numeric limits in error_texts) */
409
410 /* The texts of compile-time error messages. These are "char *" because they
411 are passed to the outside world. Do not ever re-use any error number, because
412 they are documented. Always add a new error instead. Messages marked DEAD below
413 are no longer used. This used to be a table of strings, but in order to reduce
414 the number of relocations needed when a shared library is loaded dynamically,
415 it is now one long string. We cannot use a table of offsets, because the
416 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
417 simply count through to the one we want - this isn't a performance issue
418 because these strings are used only when there is a compilation error.
419
420 Each substring ends with \0 to insert a null character. This includes the final
421 substring, so that the whole string ends with \0\0, which can be detected when
422 counting through. */
423
424 static const char error_texts[] =
425 "no error\0"
426 "\\ at end of pattern\0"
427 "\\c at end of pattern\0"
428 "unrecognized character follows \\\0"
429 "numbers out of order in {} quantifier\0"
430 /* 5 */
431 "number too big in {} quantifier\0"
432 "missing terminating ] for character class\0"
433 "invalid escape sequence in character class\0"
434 "range out of order in character class\0"
435 "nothing to repeat\0"
436 /* 10 */
437 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
438 "internal error: unexpected repeat\0"
439 "unrecognized character after (? or (?-\0"
440 "POSIX named classes are supported only within a class\0"
441 "missing )\0"
442 /* 15 */
443 "reference to non-existent subpattern\0"
444 "erroffset passed as NULL\0"
445 "unknown option bit(s) set\0"
446 "missing ) after comment\0"
447 "parentheses nested too deeply\0" /** DEAD **/
448 /* 20 */
449 "regular expression is too large\0"
450 "failed to get memory\0"
451 "unmatched parentheses\0"
452 "internal error: code overflow\0"
453 "unrecognized character after (?<\0"
454 /* 25 */
455 "lookbehind assertion is not fixed length\0"
456 "malformed number or name after (?(\0"
457 "conditional group contains more than two branches\0"
458 "assertion expected after (?(\0"
459 "(?R or (?[+-]digits must be followed by )\0"
460 /* 30 */
461 "unknown POSIX class name\0"
462 "POSIX collating elements are not supported\0"
463 "this version of PCRE is compiled without UTF support\0"
464 "spare error\0" /** DEAD **/
465 "character value in \\x{} or \\o{} is too large\0"
466 /* 35 */
467 "invalid condition (?(0)\0"
468 "\\C not allowed in lookbehind assertion\0"
469 "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
470 "number after (?C is > 255\0"
471 "closing ) for (?C expected\0"
472 /* 40 */
473 "recursive call could loop indefinitely\0"
474 "unrecognized character after (?P\0"
475 "syntax error in subpattern name (missing terminator)\0"
476 "two named subpatterns have the same name\0"
477 "invalid UTF-8 string\0"
478 /* 45 */
479 "support for \\P, \\p, and \\X has not been compiled\0"
480 "malformed \\P or \\p sequence\0"
481 "unknown property name after \\P or \\p\0"
482 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
483 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
484 /* 50 */
485 "repeated subpattern is too long\0" /** DEAD **/
486 "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
487 "internal error: overran compiling workspace\0"
488 "internal error: previously-checked referenced subpattern not found\0"
489 "DEFINE group contains more than one branch\0"
490 /* 55 */
491 "repeating a DEFINE group is not allowed\0" /** DEAD **/
492 "inconsistent NEWLINE options\0"
493 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
494 "a numbered reference must not be zero\0"
495 "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
496 /* 60 */
497 "(*VERB) not recognized or malformed\0"
498 "number is too big\0"
499 "subpattern name expected\0"
500 "digit expected after (?+\0"
501 "] is an invalid data character in JavaScript compatibility mode\0"
502 /* 65 */
503 "different names for subpatterns of the same number are not allowed\0"
504 "(*MARK) must have an argument\0"
505 "this version of PCRE is not compiled with Unicode property support\0"
506 "\\c must be followed by an ASCII character\0"
507 "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
508 /* 70 */
509 "internal error: unknown opcode in find_fixedlength()\0"
510 "\\N is not supported in a class\0"
511 "too many forward references\0"
512 "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
513 "invalid UTF-16 string\0"
514 /* 75 */
515 "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
516 "character value in \\u.... sequence is too large\0"
517 "invalid UTF-32 string\0"
518 "setting UTF is disabled by the application\0"
519 "non-hex character in \\x{} (closing brace missing?)\0"
520 /* 80 */
521 "non-octal character in \\o{} (closing brace missing?)\0"
522 "missing opening brace after \\o\0"
523 ;
524
525 /* Table to identify digits and hex digits. This is used when compiling
526 patterns. Note that the tables in chartables are dependent on the locale, and
527 may mark arbitrary characters as digits - but the PCRE compiling code expects
528 to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
529 a private table here. It costs 256 bytes, but it is a lot faster than doing
530 character value tests (at least in some simple cases I timed), and in some
531 applications one wants PCRE to compile efficiently as well as match
532 efficiently.
533
534 For convenience, we use the same bit definitions as in chartables:
535
536 0x04 decimal digit
537 0x08 hexadecimal digit
538
539 Then we can use ctype_digit and ctype_xdigit in the code. */
540
541 /* Using a simple comparison for decimal numbers rather than a memory read
542 is much faster, and the resulting code is simpler (the compiler turns it
543 into a subtraction and unsigned comparison). */
544
545 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
546
547 #ifndef EBCDIC
548
549 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
550 UTF-8 mode. */
551
552 static const pcre_uint8 digitab[] =
553 {
554 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
555 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
556 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
557 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
558 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
559 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
560 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
561 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
562 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
563 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
564 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
565 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
566 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
567 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
568 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
569 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
570 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
571 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
572 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
573 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
574 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
575 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
576 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
577 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
578 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
579 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
580 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
581 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
582 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
583 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
584 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
585 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
586
587 #else
588
589 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
590
591 static const pcre_uint8 digitab[] =
592 {
593 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
594 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
595 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
596 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
597 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
598 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
599 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
600 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
603 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
604 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
605 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
606 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
607 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
608 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
609 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
610 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
611 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
612 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
613 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
614 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
615 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
616 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
617 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
618 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
619 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
620 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
621 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
622 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
623 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
624 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
625
626 static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
627 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
628 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
629 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
630 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
631 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
632 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
633 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
634 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
635 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
636 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
637 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
638 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
639 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
640 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
641 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
642 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
643 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
644 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
645 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
646 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
647 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
648 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
649 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
650 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
651 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
652 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
653 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
654 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
655 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
656 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
657 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
658 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
659 #endif
660
661
662 /* This table is used to check whether auto-possessification is possible
663 between adjacent character-type opcodes. The left-hand (repeated) opcode is
664 used to select the row, and the right-hand opcode is used to select the column.
665 A value of 1 means that auto-possessification is OK. For example, the second
666 value in the first row means that \D+\d can be turned into \D++\d.
667
668 The Unicode property types (\P and \p) have to be present to fill out the table
669 because of what their opcode values are, but the table values should always be
670 zero because property types are handled separately in the code. The last four
671 columns apply to items that cannot be repeated, so there is no need to have
672 rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
673 *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
674
675 #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
676 #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
677
678 static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
679 /* \D \d \S \s \W \w . .+ \C \P \p \R \H \h \V \v \X \Z \z $ $M */
680 { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \D */
681 { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \d */
682 { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \S */
683 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \s */
684 { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \W */
685 { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \w */
686 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* . */
687 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* .+ */
688 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \C */
689 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \P */
690 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \p */
691 { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \R */
692 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \H */
693 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \h */
694 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \V */
695 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 }, /* \v */
696 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 } /* \X */
697 };
698
699
700 /* This table is used to check whether auto-possessification is possible
701 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
702 left-hand (repeated) opcode is used to select the row, and the right-hand
703 opcode is used to select the column. The values are as follows:
704
705 0 Always return FALSE (never auto-possessify)
706 1 Character groups are distinct (possessify if both are OP_PROP)
707 2 Check character categories in the same group (general or particular)
708 3 TRUE if the two opcodes are not the same (PROP vs NOTPROP)
709
710 4 Check left general category vs right particular category
711 5 Check right general category vs left particular category
712
713 6 Left alphanum vs right general category
714 7 Left space vs right general category
715 8 Left word vs right general category
716
717 9 Right alphanum vs left general category
718 10 Right space vs left general category
719 11 Right word vs left general category
720
721 12 Left alphanum vs right particular category
722 13 Left space vs right particular category
723 14 Left word vs right particular category
724
725 15 Right alphanum vs left particular category
726 16 Right space vs left particular category
727 17 Right word vs left particular category
728 */
729
/* Indexed as propposstab[left][right]: the row is the property type of the
left-hand (repeated) item and the column is the property type of the right-hand
item. Each entry is one of the action codes 0-17 listed in the comment above. */
730 static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
731 /* ANY LAMP GC PC SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
732 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_ANY */
733 { 0, 3, 0, 0, 0, 3, 1, 1, 0, 0, 0 }, /* PT_LAMP */
734 { 0, 0, 2, 4, 0, 9, 10, 10, 11, 0, 0 }, /* PT_GC */
735 { 0, 0, 5, 2, 0, 15, 16, 16, 17, 0, 0 }, /* PT_PC */
736 { 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, /* PT_SC */
737 { 0, 3, 6, 12, 0, 3, 1, 1, 0, 0, 0 }, /* PT_ALNUM */
738 { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_SPACE */
739 { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_PXSPACE */
740 { 0, 0, 8, 14, 0, 0, 1, 1, 3, 0, 0 }, /* PT_WORD */
741 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */
742 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 } /* PT_UCNC */
743 };
744
745 /* This table is used to check whether auto-possessification is possible
746 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
747 specifies a general category and the other specifies a particular category. The
748 row is selected by the general category and the column by the particular
749 category. The value is 1 if the particular category is not part of the general
750 category. */
751
/* Seven rows, one per general category (C, L, M, N, P, S, Z in that order);
thirty columns, one per particular category in the order shown in the column
heading. A zero marks a particular category that belongs to the row's general
category. */
752 static const pcre_uint8 catposstab[7][30] = {
753 /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
754 { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* C */
755 { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* L */
756 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* M */
757 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* N */
758 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 }, /* P */
759 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 }, /* S */
760 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 } /* Z */
761 };
762
763 /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
764 a general or particular category. The properties in each row are those
765 that apply to the character set in question. Duplication means that a little
766 unnecessary work is done when checking, but this keeps things much simpler
767 because they can all use the same code. For more details see the comment where
768 this table is used.
769
770 Note: SPACE and PXSPACE used to be different because Perl excluded VT from
771 "space", but from Perl 5.18 it's included, so both categories are treated the
772 same here. */
773
/* Three rows of four property values each. Row order: ALNUM, then the shared
SPACE/PXSPACE row, then WORD. The padding entries flagged as redundant give
every row the same width. */
774 static const pcre_uint8 posspropstab[3][4] = {
775 { ucp_L, ucp_N, ucp_N, ucp_Nl }, /* ALNUM, 3rd and 4th values redundant */
776 { ucp_Z, ucp_Z, ucp_C, ucp_Cc }, /* SPACE and PXSPACE, 2nd value redundant */
777 { ucp_L, ucp_N, ucp_P, ucp_Po } /* WORD */
778 };
779
780
781
782 /*************************************************
783 * Find an error text *
784 *************************************************/
785
786 /* The error texts are now all in one long string, to save on relocations. As
787 some of the text is of unknown length, we can't use a table of offsets.
788 Instead, just count through the strings. This is not a performance issue
789 because it happens only when there has been a compilation error.
790
791 Argument: the error number
792 Returns: pointer to the error string
793 */
794
795 static const char *
796 find_error_text(int n)
797 {
798 const char *s = error_texts;
799 for (; n > 0; n--)
800 {
801 while (*s++ != CHAR_NULL) {};
802 if (*s == CHAR_NULL) return "Error text not found (please report)";
803 }
804 return s;
805 }
806
807
808
809 /*************************************************
810 * Expand the workspace *
811 *************************************************/
812
813 /* This function is called during the second compiling phase, if the number of
814 forward references fills the existing workspace, which is originally a block on
815 the stack. A larger block is obtained from malloc() unless the ultimate limit
816 has been reached or the increase will be rather small.
817
818 Argument: pointer to the compile data block
819 Returns: 0 if all went well, else an error number
820 */
821
822 static int
823 expand_workspace(compile_data *cd)
824 {
825 pcre_uchar *newspace;
826 int newsize = cd->workspace_size * 2;
827
828 if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
829 if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
830 newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
831 return ERR72;
832
833 newspace = (PUBL(malloc))(IN_UCHARS(newsize));
834 if (newspace == NULL) return ERR21;
835 memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
836 cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
837 if (cd->workspace_size > COMPILE_WORK_SIZE)
838 (PUBL(free))((void *)cd->start_workspace);
839 cd->start_workspace = newspace;
840 cd->workspace_size = newsize;
841 return 0;
842 }
843
844
845
846 /*************************************************
847 * Check for counted repeat *
848 *************************************************/
849
850 /* This function is called when a '{' is encountered in a place where it might
851 start a quantifier. It looks ahead to see if it really is a quantifier or not.
852 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
853 where the ddds are digits.
854
855 Arguments:
856 p pointer to the first char after '{'
857
858 Returns: TRUE or FALSE
859 */
860
861 static BOOL
862 is_counted_repeat(const pcre_uchar *p)
863 {
864 if (!IS_DIGIT(*p)) return FALSE;
865 p++;
866 while (IS_DIGIT(*p)) p++;
867 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
868
869 if (*p++ != CHAR_COMMA) return FALSE;
870 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
871
872 if (!IS_DIGIT(*p)) return FALSE;
873 p++;
874 while (IS_DIGIT(*p)) p++;
875
876 return (*p == CHAR_RIGHT_CURLY_BRACKET);
877 }
878
879
880
881 /*************************************************
882 * Handle escapes *
883 *************************************************/
884
885 /* This function is called when a \ has been encountered. It returns 0 for an
886 escape that yields a data character (such as \x41), which is placed in chptr,
887 or a positive ESC_xxx code for a special escape sequence. A backreference to
888 group n is returned as negative n. When UTF-8 is enabled, a value greater than
889 255 may be returned in chptr. On entry, ptr is pointing at the \. On exit, it
890 is on the final character of the escape sequence.
891
892 Arguments:
893 ptrptr points to the pattern position pointer
894 chptr points to a returned data character
895 errorcodeptr points to the errorcode variable
896 bracount number of previous extracting brackets
897 options the options bits
898 isclass TRUE if inside a character class
899
900 Returns: zero => a data character
901 positive => a special escape sequence
902 negative => a back reference
903 on error, errorcodeptr is set
904 */
905
906 static int
907 check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
908 int bracount, int options, BOOL isclass)
909 {
910 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
911 BOOL utf = (options & PCRE_UTF8) != 0;
912 const pcre_uchar *ptr = *ptrptr + 1;
913 pcre_uint32 c;
914 int escape = 0;
915 int i;
916
917 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
918 ptr--; /* Set pointer back to the last byte */
919
920 /* If backslash is at the end of the pattern, it's an error. */
921
922 if (c == CHAR_NULL) *errorcodeptr = ERR1;
923
924 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
925 in a table. A non-zero result is something that can be returned immediately.
926 Otherwise further processing may be required. */
/* A positive table entry is a data character and replaces c; a negative entry
is negated to give the escape code that is returned. */
927
928 #ifndef EBCDIC /* ASCII/UTF-8 coding */
929 /* Not alphanumeric */
930 else if (c < CHAR_0 || c > CHAR_z) {}
931 else if ((i = escapes[c - CHAR_0]) != 0)
932 { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
933
934 #else /* EBCDIC coding */
935 /* Not alphanumeric */
936 else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
937 else if ((i = escapes[c - 0x48]) != 0) { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
938 #endif
939
940 /* Escapes that need further processing, or are illegal. */
/* This branch is reached only when the table lookup above stored zero in i.
The octal (\0 and fall-through digits) and \xhh loops below use i as a digit
counter without resetting it, so they depend on that zero. */
941
942 else
943 {
944 const pcre_uchar *oldptr;
945 BOOL braced, negated, overflow;
946 int s;
947
948 switch (c)
949 {
950 /* A number of Perl escapes are not handled by PCRE. We give an explicit
951 error. */
952
953 case CHAR_l:
954 case CHAR_L:
955 *errorcodeptr = ERR37;
956 break;
957
958 case CHAR_u:
959 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
960 {
961 /* In JavaScript, \u must be followed by four hexadecimal digits.
962 Otherwise it is a lowercase u letter. */
963 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
964 && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
965 && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
966 && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
967 {
968 c = 0;
969 for (i = 0; i < 4; ++i)
970 {
971 register pcre_uint32 cc = *(++ptr);
972 #ifndef EBCDIC /* ASCII/UTF-8 coding */
973 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
974 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
975 #else /* EBCDIC coding */
976 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
977 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
978 #endif
979 }
980
/* Reject values too large for the code unit width and mode, and UTF
surrogates. */
981 #if defined COMPILE_PCRE8
982 if (c > (utf ? 0x10ffffU : 0xffU))
983 #elif defined COMPILE_PCRE16
984 if (c > (utf ? 0x10ffffU : 0xffffU))
985 #elif defined COMPILE_PCRE32
986 if (utf && c > 0x10ffffU)
987 #endif
988 {
989 *errorcodeptr = ERR76;
990 }
991 else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
992 }
993 }
994 else
995 *errorcodeptr = ERR37;
996 break;
997
998 case CHAR_U:
999 /* In JavaScript, \U is an uppercase U letter. */
1000 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
1001 break;
1002
1003 /* In a character class, \g is just a literal "g". Outside a character
1004 class, \g must be followed by one of a number of specific things:
1005
1006 (1) A number, either plain or braced. If positive, it is an absolute
1007 backreference. If negative, it is a relative backreference. This is a Perl
1008 5.10 feature.
1009
1010 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1011 is part of Perl's movement towards a unified syntax for back references. As
1012 this is synonymous with \k{name}, we fudge it up by pretending it really
1013 was \k.
1014
1015 (3) For Oniguruma compatibility we also support \g followed by a name or a
1016 number either in angle brackets or in single quotes. However, these are
1017 (possibly recursive) subroutine calls, _not_ backreferences. Just return
1018 the ESC_g code (cf \k). */
1019
1020 case CHAR_g:
1021 if (isclass) break;
1022 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
1023 {
1024 escape = ESC_g;
1025 break;
1026 }
1027
1028 /* Handle the Perl-compatible cases */
1029
1030 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1031 {
1032 const pcre_uchar *p;
/* Scan the braced text: if anything other than digits and minus signs
appears before the closing brace, this is the \g{name} form. */
1033 for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
1034 if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
1035 if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
1036 {
1037 escape = ESC_k;
1038 break;
1039 }
1040 braced = TRUE;
1041 ptr++;
1042 }
1043 else braced = FALSE;
1044
1045 if (ptr[1] == CHAR_MINUS)
1046 {
1047 negated = TRUE;
1048 ptr++;
1049 }
1050 else negated = FALSE;
1051
1052 /* The integer range is limited by the machine's int representation. */
1053 s = 0;
1054 overflow = FALSE;
1055 while (IS_DIGIT(ptr[1]))
1056 {
1057 if (s > INT_MAX / 10 - 1) /* Integer overflow */
1058 {
1059 overflow = TRUE;
1060 break;
1061 }
1062 s = s * 10 + (int)(*(++ptr) - CHAR_0);
1063 }
1064 if (overflow) /* Integer overflow */
1065 {
1066 while (IS_DIGIT(ptr[1]))
1067 ptr++;
1068 *errorcodeptr = ERR61;
1069 break;
1070 }
1071
1072 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
1073 {
1074 *errorcodeptr = ERR57;
1075 break;
1076 }
1077
1078 if (s == 0)
1079 {
1080 *errorcodeptr = ERR58;
1081 break;
1082 }
1083
1084 if (negated)
1085 {
1086 if (s > bracount)
1087 {
1088 *errorcodeptr = ERR15;
1089 break;
1090 }
/* Turn the relative reference into an absolute group number, counting back
from the most recent group. */
1091 s = bracount - (s - 1);
1092 }
1093
1094 escape = -s;
1095 break;
1096
1097 /* The handling of escape sequences consisting of a string of digits
1098 starting with one that is not zero is not straightforward. Perl has changed
1099 over the years. Nowadays \g{} for backreferences and \o{} for octal are
1100 recommended to avoid the ambiguities in the old syntax.
1101
1102 Outside a character class, the digits are read as a decimal number. If the
1103 number is less than 8 (used to be 10), or if there are that many previous
1104 extracting left brackets, then it is a back reference. Otherwise, up to
1105 three octal digits are read to form an escaped byte. Thus \123 is likely to
1106 be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1107 the octal value is greater than 377, the least significant 8 bits are
1108 taken. \8 and \9 are treated as the literal characters 8 and 9.
1109
1110 Inside a character class, \ followed by a digit is always either a literal
1111 8 or 9 or an octal number. */
1112
1113 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1114 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1115
1116 if (!isclass)
1117 {
1118 oldptr = ptr;
1119 /* The integer range is limited by the machine's int representation. */
1120 s = (int)(c -CHAR_0);
1121 overflow = FALSE;
1122 while (IS_DIGIT(ptr[1]))
1123 {
1124 if (s > INT_MAX / 10 - 1) /* Integer overflow */
1125 {
1126 overflow = TRUE;
1127 break;
1128 }
1129 s = s * 10 + (int)(*(++ptr) - CHAR_0);
1130 }
1131 if (overflow) /* Integer overflow */
1132 {
1133 while (IS_DIGIT(ptr[1]))
1134 ptr++;
1135 *errorcodeptr = ERR61;
1136 break;
1137 }
1138 if (s < 8 || s <= bracount) /* Check for back reference */
1139 {
1140 escape = -s;
1141 break;
1142 }
1143 ptr = oldptr; /* Put the pointer back and fall through */
1144 }
1145
1146 /* Handle a digit following \ when the number is not a back reference. If
1147 the first digit is 8 or 9, Perl used to generate a binary zero byte and
1148 then treat the digit as a following literal. At least by Perl 5.18 this
1149 changed so as not to insert the binary zero. */
1150
1151 if ((c = *ptr) >= CHAR_8) break;
1152
1153 /* Fall through with a digit less than 8 */
1154
1155 /* \0 always starts an octal number, but we may drop through to here with a
1156 larger first octal digit. The original code used just to take the least
1157 significant 8 bits of octal numbers (I think this is what early Perls used
1158 to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1159 but no more than 3 octal digits. */
1160
1161 case CHAR_0:
1162 c -= CHAR_0;
/* i is zero on arrival here (left by the table lookup at the top of the
function), so at most two further octal digits are consumed. */
1163 while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1164 c = c * 8 + *(++ptr) - CHAR_0;
1165 #ifdef COMPILE_PCRE8
1166 if (!utf && c > 0xff) *errorcodeptr = ERR51;
1167 #endif
1168 break;
1169
1170 /* \o is a relatively new Perl feature, supporting a more general way of
1171 specifying character codes in octal. The only supported form is \o{ddd}. */
1172
1173 case CHAR_o:
1174 if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
1175 {
1176 ptr += 2;
1177 c = 0;
1178 overflow = FALSE;
1179 while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
1180 {
1181 register pcre_uint32 cc = *ptr++;
1182 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1183 #ifdef COMPILE_PCRE32
1184 if (c >= 0x10000000l) { overflow = TRUE; break; }
1185 #endif
1186 c = (c << 3) + cc - CHAR_0 ;
1187 #if defined COMPILE_PCRE8
1188 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1189 #elif defined COMPILE_PCRE16
1190 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1191 #elif defined COMPILE_PCRE32
1192 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1193 #endif
1194 }
1195 if (overflow)
1196 {
1197 while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1198 *errorcodeptr = ERR34;
1199 }
1200 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1201 {
1202 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1203 }
1204 else *errorcodeptr = ERR80;
1205 }
1206 break;
1207
1208 /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
1209 digits. Otherwise it is a lowercase x letter. */
1210
1211 case CHAR_x:
1212 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1213 {
1214 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1215 && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1216 {
1217 c = 0;
1218 for (i = 0; i < 2; ++i)
1219 {
1220 register pcre_uint32 cc = *(++ptr);
1221 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1222 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1223 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1224 #else /* EBCDIC coding */
1225 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1226 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1227 #endif
1228 }
1229 }
1230 } /* End JavaScript handling */
1231
1232 /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1233 greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1234 digits. If not, { used to be treated as a data character. However, Perl
1235 seems to read hex digits up to the first non-such, and ignore the rest, so
1236 that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1237 now gives an error. */
1238
1239 else
1240 {
1241 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1242 {
1243 ptr += 2;
1244 c = 0;
1245 overflow = FALSE;
1246 while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
1247 {
1248 register pcre_uint32 cc = *ptr++;
1249 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1250
1251 #ifdef COMPILE_PCRE32
1252 if (c >= 0x10000000l) { overflow = TRUE; break; }
1253 #endif
1254
1255 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1256 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1257 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1258 #else /* EBCDIC coding */
1259 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1260 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1261 #endif
1262
1263 #if defined COMPILE_PCRE8
1264 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1265 #elif defined COMPILE_PCRE16
1266 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1267 #elif defined COMPILE_PCRE32
1268 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1269 #endif
1270 }
1271
1272 if (overflow)
1273 {
1274 while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
1275 *errorcodeptr = ERR34;
1276 }
1277
1278 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1279 {
1280 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1281 }
1282
1283 /* If the sequence of hex digits does not end with '}', give an error.
1284 We used just to recognize this construct and fall through to the normal
1285 \x handling, but nowadays Perl gives an error, which seems much more
1286 sensible, so we do too. */
1287
1288 else *errorcodeptr = ERR79;
1289 } /* End of \x{} processing */
1290
1291 /* Read a single-byte hex-defined char (up to two hex digits after \x) */
1292
1293 else
1294 {
1295 c = 0;
/* As in the octal case, i is still zero from the initial table lookup, which
is what limits this loop to two digits. */
1296 while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1297 {
1298 pcre_uint32 cc; /* Some compilers don't like */
1299 cc = *(++ptr); /* ++ in initializers */
1300 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1301 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1302 c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1303 #else /* EBCDIC coding */
1304 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
1305 c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1306 #endif
1307 }
1308 } /* End of \xdd handling */
1309 } /* End of Perl-style \x handling */
1310 break;
1311
1312 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1313 An error is given if the byte following \c is not an ASCII character. This
1314 coding is ASCII-specific, but then the whole concept of \cx is
1315 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1316
1317 case CHAR_c:
1318 c = *(++ptr);
1319 if (c == CHAR_NULL)
1320 {
1321 *errorcodeptr = ERR2;
1322 break;
1323 }
1324 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1325 if (c > 127) /* Excludes all non-ASCII in either mode */
1326 {
1327 *errorcodeptr = ERR68;
1328 break;
1329 }
1330 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1331 c ^= 0x40;
1332 #else /* EBCDIC coding */
1333 if (c >= CHAR_a && c <= CHAR_z) c += 64;
1334 c ^= 0xC0;
1335 #endif
1336 break;
1337
1338 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1339 other alphanumeric following \ is an error if PCRE_EXTRA was set;
1340 otherwise, for Perl compatibility, it is a literal. This code looks a bit
1341 odd, but there used to be some cases other than the default, and there may
1342 be again in future, so I haven't "optimized" it. */
1343
1344 default:
1345 if ((options & PCRE_EXTRA) != 0) switch(c)
1346 {
1347 default:
1348 *errorcodeptr = ERR3;
1349 break;
1350 }
1351 break;
1352 }
1353 }
1354
1355 /* Perl supports \N{name} for character names, as well as plain \N for "not
1356 newline". PCRE does not support \N{name}. However, it does support
1357 quantification such as \N{2,3}. */
1358
1359 if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1360 !is_counted_repeat(ptr+2))
1361 *errorcodeptr = ERR37;
1362
1363 /* If PCRE_UCP is set, we change the values for \d etc. */
1364
1365 if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
1366 escape += (ESC_DU - ESC_D);
1367
1368 /* Set the pointer to the final character before returning. */
1369
1370 *ptrptr = ptr;
1371 *chptr = c;
1372 return escape;
1373 }
1374
1375
1376
1377 #ifdef SUPPORT_UCP
1378 /*************************************************
1379 * Handle \P and \p *
1380 *************************************************/
1381
1382 /* This function is called after \P or \p has been encountered, provided that
1383 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1384 pointing at the P or p. On exit, it is pointing at the final character of the
1385 escape sequence.
1386
1387 Argument:
1388 ptrptr points to the pattern position pointer
1389 negptr points to a boolean that is set TRUE for negation else FALSE
1390 ptypeptr points to an unsigned int that is set to the type value
1391 pdataptr points to an unsigned int that is set to the detailed property value
1392 errorcodeptr points to the error code variable
1393
1394 Returns: TRUE if the type value was found, or FALSE for an invalid type
1395 */
1396
1397 static BOOL
1398 get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
1399 unsigned int *pdataptr, int *errorcodeptr)
1400 {
1401 pcre_uchar c;
1402 int i, bot, top;
1403 const pcre_uchar *ptr = *ptrptr;
1404 pcre_uchar name[32];
1405
1406 c = *(++ptr);
1407 if (c == CHAR_NULL) goto ERROR_RETURN;
1408
1409 *negptr = FALSE;
1410
1411 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1412 negation. */
1413
1414 if (c == CHAR_LEFT_CURLY_BRACKET)
1415 {
1416 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1417 {
1418 *negptr = TRUE;
1419 ptr++;
1420 }
1421 for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1422 {
1423 c = *(++ptr);
1424 if (c == CHAR_NULL) goto ERROR_RETURN;
1425 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1426 name[i] = c;
1427 }
1428 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1429 name[i] = 0;
1430 }
1431
1432 /* Otherwise there is just one following character */
1433
1434 else
1435 {
1436 name[0] = c;
1437 name[1] = 0;
1438 }
1439
1440 *ptrptr = ptr;
1441
1442 /* Search for a recognized property name using binary chop */
1443
1444 bot = 0;
1445 top = PRIV(utt_size);
1446
1447 while (bot < top)
1448 {
1449 int r;
1450 i = (bot + top) >> 1;
1451 r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1452 if (r == 0)
1453 {
1454 *ptypeptr = PRIV(utt)[i].type;
1455 *pdataptr = PRIV(utt)[i].value;
1456 return TRUE;
1457 }
1458 if (r > 0) bot = i + 1; else top = i;
1459 }
1460
1461 *errorcodeptr = ERR47;
1462 *ptrptr = ptr;
1463 return FALSE;
1464
1465 ERROR_RETURN:
1466 *errorcodeptr = ERR46;
1467 *ptrptr = ptr;
1468 return FALSE;
1469 }
1470 #endif
1471
1472
1473
1474 /*************************************************
1475 * Read repeat counts *
1476 *************************************************/
1477
1478 /* Read an item of the form {n,m} and return the values. This is called only
1479 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1480 so the syntax is guaranteed to be correct, but we need to check the values.
1481
1482 Arguments:
1483 p pointer to first char after '{'
1484 minp pointer to int for min
1485 maxp pointer to int for max
1486 returned as -1 if no max
1487 errorcodeptr points to error code variable
1488
1489 Returns: pointer to '}' on success;
1490 current ptr on error, with errorcodeptr set non-zero
1491 */
1492
1493 static const pcre_uchar *
1494 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1495 {
1496 int min = 0;
1497 int max = -1;
1498
1499 /* Read the minimum value and do a paranoid check: a negative value indicates
1500 an integer overflow. */
1501
1502 while (IS_DIGIT(*p)) min = min * 10 + (int)(*p++ - CHAR_0);
1503 if (min < 0 || min > 65535)
1504 {
1505 *errorcodeptr = ERR5;
1506 return p;
1507 }
1508
1509 /* Read the maximum value if there is one, and again do a paranoid on its size.
1510 Also, max must not be less than min. */
1511
1512 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1513 {
1514 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1515 {
1516 max = 0;
1517 while(IS_DIGIT(*p)) max = max * 10 + (int)(*p++ - CHAR_0);
1518 if (max < 0 || max > 65535)
1519 {
1520 *errorcodeptr = ERR5;
1521 return p;
1522 }
1523 if (max < min)
1524 {
1525 *errorcodeptr = ERR4;
1526 return p;
1527 }
1528 }
1529 }
1530
1531 /* Fill in the required variables, and pass back the pointer to the terminating
1532 '}'. */
1533
1534 *minp = min;
1535 *maxp = max;
1536 return p;
1537 }
1538
1539
1540
1541 /*************************************************
1542 * Find first significant op code *
1543 *************************************************/
1544
1545 /* This is called by several functions that scan a compiled expression looking
1546 for a fixed first character, or an anchoring op code etc. It skips over things
1547 that do not influence this. For some calls, it makes sense to skip negative
1548 forward and all backward assertions, and also the \b assertion; for others it
1549 does not.
1550
1551 Arguments:
1552 code pointer to the start of the group
1553 skipassert TRUE if certain assertions are to be skipped
1554
1555 Returns: pointer to the first significant opcode
1556 */
1557
1558 static const pcre_uchar*
1559 first_significant_code(const pcre_uchar *code, BOOL skipassert)
1560 {
1561 for (;;)
1562 {
1563 switch ((int)*code)
1564 {
1565 case OP_ASSERT_NOT:
1566 case OP_ASSERTBACK:
1567 case OP_ASSERTBACK_NOT:
1568 if (!skipassert) return code;
1569 do code += GET(code, 1); while (*code == OP_ALT);
1570 code += PRIV(OP_lengths)[*code];
1571 break;
1572
1573 case OP_WORD_BOUNDARY:
1574 case OP_NOT_WORD_BOUNDARY:
1575 if (!skipassert) return code;
1576 /* Fall through */
1577
1578 case OP_CALLOUT:
1579 case OP_CREF:
1580 case OP_DNCREF:
1581 case OP_RREF:
1582 case OP_DNRREF:
1583 case OP_DEF:
1584 code += PRIV(OP_lengths)[*code];
1585 break;
1586
1587 default:
1588 return code;
1589 }
1590 }
1591 /* Control never reaches here */
1592 }
1593
1594
1595
1596 /*************************************************
1597 * Find the fixed length of a branch *
1598 *************************************************/
1599
1600 /* Scan a branch and compute the fixed length of subject that will match it,
1601 if the length is fixed. This is needed for dealing with backward assertions.
1602 In UTF8 mode, the result is in characters rather than bytes. The branch is
1603 temporarily terminated with OP_END when this function is called.
1604
1605 This function is called when a backward assertion is encountered, so that if it
1606 fails, the error message can point to the correct place in the pattern.
1607 However, we cannot do this when the assertion contains subroutine calls,
1608 because they can be forward references. We solve this by remembering this case
1609 and doing the check at the end; a flag specifies which mode we are running in.
1610
1611 Arguments:
1612 code points to the start of the pattern (the bracket)
1613 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
1614 atend TRUE if called when the pattern is complete
1615 cd the "compile data" structure
1616
1617 Returns: the fixed length,
1618 or -1 if there is no fixed length,
1619 or -2 if \C was encountered (in UTF-8 mode only)
1620 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1621 or -4 if an unknown opcode was encountered (internal error)
1622 */
1623
1624 static int
1625 find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
1626 {
1627 int length = -1;
1628
1629 register int branchlength = 0;
1630 register pcre_uchar *cc = code + 1 + LINK_SIZE;
1631
1632 /* Scan along the opcodes for this branch. If we get to the end of the
1633 branch, check the length against that of the other branches. */
1634
1635 for (;;)
1636 {
1637 int d;
1638 pcre_uchar *ce, *cs;
1639 register pcre_uchar op = *cc;
1640
1641 switch (op)
1642 {
1643 /* We only need to continue for OP_CBRA (normal capturing bracket) and
1644 OP_BRA (normal non-capturing bracket) because the other variants of these
1645 opcodes are all concerned with unlimited repeated groups, which of course
1646 are not of fixed length. */
1647
1648 case OP_CBRA:
1649 case OP_BRA:
1650 case OP_ONCE:
1651 case OP_ONCE_NC:
1652 case OP_COND:
1653 d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
1654 if (d < 0) return d;
1655 branchlength += d;
1656 do cc += GET(cc, 1); while (*cc == OP_ALT);
1657 cc += 1 + LINK_SIZE;
1658 break;
1659
1660 /* Reached end of a branch; if it's a ket it is the end of a nested call.
1661 If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1662 an ALT. If it is END it's the end of the outer call. All can be handled by
1663 the same code. Note that we must not include the OP_KETRxxx opcodes here,
1664 because they all imply an unlimited repeat. */
1665
1666 case OP_ALT:
1667 case OP_KET:
1668 case OP_END:
1669 case OP_ACCEPT:
1670 case OP_ASSERT_ACCEPT:
1671 if (length < 0) length = branchlength;
1672 else if (length != branchlength) return -1;
1673 if (*cc != OP_ALT) return length;
1674 cc += 1 + LINK_SIZE;
1675 branchlength = 0;
1676 break;
1677
1678 /* A true recursion implies not fixed length, but a subroutine call may
1679 be OK. If the subroutine is a forward reference, we can't deal with
1680 it until the end of the pattern, so return -3. */
1681
1682 case OP_RECURSE:
1683 if (!atend) return -3;
1684 cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1685 do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1686 if (cc > cs && cc < ce) return -1; /* Recursion */
1687 d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
1688 if (d < 0) return d;
1689 branchlength += d;
1690 cc += 1 + LINK_SIZE;
1691 break;
1692
1693 /* Skip over assertive subpatterns */
1694
1695 case OP_ASSERT:
1696 case OP_ASSERT_NOT:
1697 case OP_ASSERTBACK:
1698 case OP_ASSERTBACK_NOT:
1699 do cc += GET(cc, 1); while (*cc == OP_ALT);
1700 cc += PRIV(OP_lengths)[*cc];
1701 break;
1702
1703 /* Skip over things that don't match chars */
1704
1705 case OP_MARK:
1706 case OP_PRUNE_ARG:
1707 case OP_SKIP_ARG:
1708 case OP_THEN_ARG:
1709 cc += cc[1] + PRIV(OP_lengths)[*cc];
1710 break;
1711
1712 case OP_CALLOUT:
1713 case OP_CIRC:
1714 case OP_CIRCM:
1715 case OP_CLOSE:
1716 case OP_COMMIT:
1717 case OP_CREF:
1718 case OP_DEF:
1719 case OP_DNCREF:
1720 case OP_DNRREF:
1721 case OP_DOLL:
1722 case OP_DOLLM:
1723 case OP_EOD:
1724 case OP_EODN:
1725 case OP_FAIL:
1726 case OP_NOT_WORD_BOUNDARY:
1727 case OP_PRUNE:
1728 case OP_REVERSE:
1729 case OP_RREF:
1730 case OP_SET_SOM:
1731 case OP_SKIP:
1732 case OP_SOD:
1733 case OP_SOM:
1734 case OP_THEN:
1735 case OP_WORD_BOUNDARY:
1736 cc += PRIV(OP_lengths)[*cc];
1737 break;
1738
1739 /* Handle literal characters */
1740
1741 case OP_CHAR:
1742 case OP_CHARI:
1743 case OP_NOT:
1744 case OP_NOTI:
1745 branchlength++;
1746 cc += 2;
1747 #ifdef SUPPORT_UTF
1748 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1749 #endif
1750 break;
1751
1752 /* Handle exact repetitions. The count is already in characters, but we
1753 need to skip over a multibyte character in UTF8 mode. */
1754
1755 case OP_EXACT:
1756 case OP_EXACTI:
1757 case OP_NOTEXACT:
1758 case OP_NOTEXACTI:
1759 branchlength += (int)GET2(cc,1);
1760 cc += 2 + IMM2_SIZE;
1761 #ifdef SUPPORT_UTF
1762 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1763 #endif
1764 break;
1765
1766 case OP_TYPEEXACT:
1767 branchlength += GET2(cc,1);
1768 if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1769 cc += 2;
1770 cc += 1 + IMM2_SIZE + 1;
1771 break;
1772
1773 /* Handle single-char matchers */
1774
1775 case OP_PROP:
1776 case OP_NOTPROP:
1777 cc += 2;
1778 /* Fall through */
1779
1780 case OP_HSPACE:
1781 case OP_VSPACE:
1782 case OP_NOT_HSPACE:
1783 case OP_NOT_VSPACE:
1784 case OP_NOT_DIGIT:
1785 case OP_DIGIT:
1786 case OP_NOT_WHITESPACE:
1787 case OP_WHITESPACE:
1788 case OP_NOT_WORDCHAR:
1789 case OP_WORDCHAR:
1790 case OP_ANY:
1791 case OP_ALLANY:
1792 branchlength++;
1793 cc++;
1794 break;
1795
1796 /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1797 otherwise \C is coded as OP_ALLANY. */
1798
1799 case OP_ANYBYTE:
1800 return -2;
1801
1802 /* Check a class for variable quantification */
1803
1804 case OP_CLASS:
1805 case OP_NCLASS:
1806 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1807 case OP_XCLASS:
1808 /* The original code caused an unsigned overflow in 64 bit systems,
1809 so now we use a conditional statement. */
1810 if (op == OP_XCLASS)
1811 cc += GET(cc, 1);
1812 else
1813 cc += PRIV(OP_lengths)[OP_CLASS];
1814 #else
1815 cc += PRIV(OP_lengths)[OP_CLASS];
1816 #endif
1817
1818 switch (*cc)
1819 {
1820 case OP_CRPLUS:
1821 case OP_CRMINPLUS:
1822 case OP_CRSTAR:
1823 case OP_CRMINSTAR:
1824 case OP_CRQUERY:
1825 case OP_CRMINQUERY:
1826 return -1;
1827
1828 case OP_CRRANGE:
1829 case OP_CRMINRANGE:
1830 if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1831 branchlength += (int)GET2(cc,1);
1832 cc += 1 + 2 * IMM2_SIZE;
1833 break;
1834
1835 default:
1836 branchlength++;
1837 }
1838 break;
1839
1840 /* Anything else is variable length */
1841
1842 case OP_ANYNL:
1843 case OP_BRAMINZERO:
1844 case OP_BRAPOS:
1845 case OP_BRAPOSZERO:
1846 case OP_BRAZERO:
1847 case OP_CBRAPOS:
1848 case OP_EXTUNI:
1849 case OP_KETRMAX:
1850 case OP_KETRMIN:
1851 case OP_KETRPOS:
1852 case OP_MINPLUS:
1853 case OP_MINPLUSI:
1854 case OP_MINQUERY:
1855 case OP_MINQUERYI:
1856 case OP_MINSTAR:
1857 case OP_MINSTARI:
1858 case OP_MINUPTO:
1859 case OP_MINUPTOI:
1860 case OP_NOTMINPLUS:
1861 case OP_NOTMINPLUSI:
1862 case OP_NOTMINQUERY:
1863 case OP_NOTMINQUERYI:
1864 case OP_NOTMINSTAR:
1865 case OP_NOTMINSTARI:
1866 case OP_NOTMINUPTO:
1867 case OP_NOTMINUPTOI:
1868 case OP_NOTPLUS:
1869 case OP_NOTPLUSI:
1870 case OP_NOTPOSPLUS:
1871 case OP_NOTPOSPLUSI:
1872 case OP_NOTPOSQUERY:
1873 case OP_NOTPOSQUERYI:
1874 case OP_NOTPOSSTAR:
1875 case OP_NOTPOSSTARI:
1876 case OP_NOTPOSUPTO:
1877 case OP_NOTPOSUPTOI:
1878 case OP_NOTQUERY:
1879 case OP_NOTQUERYI:
1880 case OP_NOTSTAR:
1881 case OP_NOTSTARI:
1882 case OP_NOTUPTO:
1883 case OP_NOTUPTOI:
1884 case OP_PLUS:
1885 case OP_PLUSI:
1886 case OP_POSPLUS:
1887 case OP_POSPLUSI:
1888 case OP_POSQUERY:
1889 case OP_POSQUERYI:
1890 case OP_POSSTAR:
1891 case OP_POSSTARI:
1892 case OP_POSUPTO:
1893 case OP_POSUPTOI:
1894 case OP_QUERY:
1895 case OP_QUERYI:
1896 case OP_REF:
1897 case OP_REFI:
1898 case OP_DNREF:
1899 case OP_DNREFI:
1900 case OP_SBRA:
1901 case OP_SBRAPOS:
1902 case OP_SCBRA:
1903 case OP_SCBRAPOS:
1904 case OP_SCOND:
1905 case OP_SKIPZERO:
1906 case OP_STAR:
1907 case OP_STARI:
1908 case OP_TYPEMINPLUS:
1909 case OP_TYPEMINQUERY:
1910 case OP_TYPEMINSTAR:
1911 case OP_TYPEMINUPTO:
1912 case OP_TYPEPLUS:
1913 case OP_TYPEPOSPLUS:
1914 case OP_TYPEPOSQUERY:
1915 case OP_TYPEPOSSTAR:
1916 case OP_TYPEPOSUPTO:
1917 case OP_TYPEQUERY:
1918 case OP_TYPESTAR:
1919 case OP_TYPEUPTO:
1920 case OP_UPTO:
1921 case OP_UPTOI:
1922 return -1;
1923
1924 /* Catch unrecognized opcodes so that when new ones are added they
1925 are not forgotten, as has happened in the past. */
1926
1927 default:
1928 return -4;
1929 }
1930 }
1931 /* Control never gets here */
1932 }
1933
1934
1935
1936 /*************************************************
1937 * Scan compiled regex for specific bracket *
1938 *************************************************/
1939
1940 /* This little function scans through a compiled pattern until it finds a
1941 capturing bracket with the given number, or, if the number is negative, an
1942 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1943 so that it can be called from pcre_study() when finding the minimum matching
1944 length.
1945
1946 Arguments:
1947 code points to start of expression
1948 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
1949 number the required bracket number or negative to find a lookbehind
1950
1951 Returns: pointer to the opcode for the bracket, or NULL if not found
1952 */
1953
1954 const pcre_uchar *
1955 PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
1956 {
1957 for (;;)
1958 {
1959 register pcre_uchar c = *code;
1960
1961 if (c == OP_END) return NULL;
1962
1963 /* XCLASS is used for classes that cannot be represented just by a bit
1964 map. This includes negated single high-valued characters. The length in
1965 the table is zero; the actual length is stored in the compiled code. */
1966
1967 if (c == OP_XCLASS) code += GET(code, 1);
1968
1969 /* Handle recursion */
1970
1971 else if (c == OP_REVERSE)
1972 {
1973 if (number < 0) return (pcre_uchar *)code;
1974 code += PRIV(OP_lengths)[c];
1975 }
1976
1977 /* Handle capturing bracket */
1978
1979 else if (c == OP_CBRA || c == OP_SCBRA ||
1980 c == OP_CBRAPOS || c == OP_SCBRAPOS)
1981 {
1982 int n = (int)GET2(code, 1+LINK_SIZE);
1983 if (n == number) return (pcre_uchar *)code;
1984 code += PRIV(OP_lengths)[c];
1985 }
1986
1987 /* Otherwise, we can get the item's length from the table, except that for
1988 repeated character types, we have to test for \p and \P, which have an extra
1989 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1990 must add in its length. */
1991
1992 else
1993 {
1994 switch(c)
1995 {
1996 case OP_TYPESTAR:
1997 case OP_TYPEMINSTAR:
1998 case OP_TYPEPLUS:
1999 case OP_TYPEMINPLUS:
2000 case OP_TYPEQUERY:
2001 case OP_TYPEMINQUERY:
2002 case OP_TYPEPOSSTAR:
2003 case OP_TYPEPOSPLUS:
2004 case OP_TYPEPOSQUERY:
2005 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2006 break;
2007
2008 case OP_TYPEUPTO:
2009 case OP_TYPEMINUPTO:
2010 case OP_TYPEEXACT:
2011 case OP_TYPEPOSUPTO:
2012 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2013 code += 2;
2014 break;
2015
2016 case OP_MARK:
2017 case OP_PRUNE_ARG:
2018 case OP_SKIP_ARG:
2019 case OP_THEN_ARG:
2020 code += code[1];
2021 break;
2022 }
2023
2024 /* Add in the fixed length from the table */
2025
2026 code += PRIV(OP_lengths)[c];
2027
2028 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2029 a multi-byte character. The length in the table is a minimum, so we have to
2030 arrange to skip the extra bytes. */
2031
2032 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2033 if (utf) switch(c)
2034 {
2035 case OP_CHAR:
2036 case OP_CHARI:
2037 case OP_EXACT:
2038 case OP_EXACTI:
2039 case OP_UPTO:
2040 case OP_UPTOI:
2041 case OP_MINUPTO:
2042 case OP_MINUPTOI:
2043 case OP_POSUPTO:
2044 case OP_POSUPTOI:
2045 case OP_STAR:
2046 case OP_STARI:
2047 case OP_MINSTAR:
2048 case OP_MINSTARI:
2049 case OP_POSSTAR:
2050 case OP_POSSTARI:
2051 case OP_PLUS:
2052 case OP_PLUSI:
2053 case OP_MINPLUS:
2054 case OP_MINPLUSI:
2055 case OP_POSPLUS:
2056 case OP_POSPLUSI:
2057 case OP_QUERY:
2058 case OP_QUERYI:
2059 case OP_MINQUERY:
2060 case OP_MINQUERYI:
2061 case OP_POSQUERY:
2062 case OP_POSQUERYI:
2063 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2064 break;
2065 }
2066 #else
2067 (void)(utf); /* Keep compiler happy by referencing function argument */
2068 #endif
2069 }
2070 }
2071 }
2072
2073
2074
2075 /*************************************************
2076 * Scan compiled regex for recursion reference *
2077 *************************************************/
2078
2079 /* This little function scans through a compiled pattern until it finds an
2080 instance of OP_RECURSE.
2081
2082 Arguments:
2083 code points to start of expression
2084 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2085
2086 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
2087 */
2088
2089 static const pcre_uchar *
2090 find_recurse(const pcre_uchar *code, BOOL utf)
2091 {
2092 for (;;)
2093 {
2094 register pcre_uchar c = *code;
2095 if (c == OP_END) return NULL;
2096 if (c == OP_RECURSE) return code;
2097
2098 /* XCLASS is used for classes that cannot be represented just by a bit
2099 map. This includes negated single high-valued characters. The length in
2100 the table is zero; the actual length is stored in the compiled code. */
2101
2102 if (c == OP_XCLASS) code += GET(code, 1);
2103
2104 /* Otherwise, we can get the item's length from the table, except that for
2105 repeated character types, we have to test for \p and \P, which have an extra
2106 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2107 must add in its length. */
2108
2109 else
2110 {
2111 switch(c)
2112 {
2113 case OP_TYPESTAR:
2114 case OP_TYPEMINSTAR:
2115 case OP_TYPEPLUS:
2116 case OP_TYPEMINPLUS:
2117 case OP_TYPEQUERY:
2118 case OP_TYPEMINQUERY:
2119 case OP_TYPEPOSSTAR:
2120 case OP_TYPEPOSPLUS:
2121 case OP_TYPEPOSQUERY:
2122 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2123 break;
2124
2125 case OP_TYPEPOSUPTO:
2126 case OP_TYPEUPTO:
2127 case OP_TYPEMINUPTO:
2128 case OP_TYPEEXACT:
2129 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2130 code += 2;
2131 break;
2132
2133 case OP_MARK:
2134 case OP_PRUNE_ARG:
2135 case OP_SKIP_ARG:
2136 case OP_THEN_ARG:
2137 code += code[1];
2138 break;
2139 }
2140
2141 /* Add in the fixed length from the table */
2142
2143 code += PRIV(OP_lengths)[c];
2144
2145 /* In UTF-8 mode, opcodes that are followed by a character may be followed
2146 by a multi-byte character. The length in the table is a minimum, so we have
2147 to arrange to skip the extra bytes. */
2148
2149 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2150 if (utf) switch(c)
2151 {
2152 case OP_CHAR:
2153 case OP_CHARI:
2154 case OP_NOT:
2155 case OP_NOTI:
2156 case OP_EXACT:
2157 case OP_EXACTI:
2158 case OP_NOTEXACT:
2159 case OP_NOTEXACTI:
2160 case OP_UPTO:
2161 case OP_UPTOI:
2162 case OP_NOTUPTO:
2163 case OP_NOTUPTOI:
2164 case OP_MINUPTO:
2165 case OP_MINUPTOI:
2166 case OP_NOTMINUPTO:
2167 case OP_NOTMINUPTOI:
2168 case OP_POSUPTO:
2169 case OP_POSUPTOI:
2170 case OP_NOTPOSUPTO:
2171 case OP_NOTPOSUPTOI:
2172 case OP_STAR:
2173 case OP_STARI:
2174 case OP_NOTSTAR:
2175 case OP_NOTSTARI:
2176 case OP_MINSTAR:
2177 case OP_MINSTARI:
2178 case OP_NOTMINSTAR:
2179 case OP_NOTMINSTARI:
2180 case OP_POSSTAR:
2181 case OP_POSSTARI:
2182 case OP_NOTPOSSTAR:
2183 case OP_NOTPOSSTARI:
2184 case OP_PLUS:
2185 case OP_PLUSI:
2186 case OP_NOTPLUS:
2187 case OP_NOTPLUSI:
2188 case OP_MINPLUS:
2189 case OP_MINPLUSI:
2190 case OP_NOTMINPLUS:
2191 case OP_NOTMINPLUSI:
2192 case OP_POSPLUS:
2193 case OP_POSPLUSI:
2194 case OP_NOTPOSPLUS:
2195 case OP_NOTPOSPLUSI:
2196 case OP_QUERY:
2197 case OP_QUERYI:
2198 case OP_NOTQUERY:
2199 case OP_NOTQUERYI:
2200 case OP_MINQUERY:
2201 case OP_MINQUERYI:
2202 case OP_NOTMINQUERY:
2203 case OP_NOTMINQUERYI:
2204 case OP_POSQUERY:
2205 case OP_POSQUERYI:
2206 case OP_NOTPOSQUERY:
2207 case OP_NOTPOSQUERYI:
2208 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2209 break;
2210 }
2211 #else
2212 (void)(utf); /* Keep compiler happy by referencing function argument */
2213 #endif
2214 }
2215 }
2216 }
2217
2218
2219
2220 /*************************************************
2221 * Scan compiled branch for non-emptiness *
2222 *************************************************/
2223
2224 /* This function scans through a branch of a compiled pattern to see whether it
2225 can match the empty string or not. It is called from could_be_empty()
2226 below and from compile_branch() when checking for an unlimited repeat of a
2227 group that can match nothing. Note that first_significant_code() skips over
2228 backward and negative forward assertions when its final argument is TRUE. If we
2229 hit an unclosed bracket, we return "empty" - this means we've struck an inner
2230 bracket whose current branch will already have been scanned.
2231
2232 Arguments:
2233 code points to start of search
2234 endcode points to where to stop
2235 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2236 cd contains pointers to tables etc.
2237 recurses chain of recurse_check to catch mutual recursion
2238
2239 Returns: TRUE if what is matched could be empty
2240 */
2241
2242 typedef struct recurse_check {
2243 struct recurse_check *prev;
2244 const pcre_uchar *group;
2245 } recurse_check;
2246
2247 static BOOL
2248 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2249 BOOL utf, compile_data *cd, recurse_check *recurses)
2250 {
2251 register pcre_uchar c;
2252 recurse_check this_recurse;
2253
2254 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2255 code < endcode;
2256 code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2257 {
2258 const pcre_uchar *ccode;
2259
2260 c = *code;
2261
2262 /* Skip over forward assertions; the other assertions are skipped by
2263 first_significant_code() with a TRUE final argument. */
2264
2265 if (c == OP_ASSERT)
2266 {
2267 do code += GET(code, 1); while (*code == OP_ALT);
2268 c = *code;
2269 continue;
2270 }
2271
2272 /* For a recursion/subroutine call, if its end has been reached, which
2273 implies a backward reference subroutine call, we can scan it. If it's a
2274 forward reference subroutine call, we can't. To detect forward reference
2275 we have to scan up the list that is kept in the workspace. This function is
2276 called only when doing the real compile, not during the pre-compile that
2277 measures the size of the compiled pattern. */
2278
2279 if (c == OP_RECURSE)
2280 {
2281 const pcre_uchar *scode = cd->start_code + GET(code, 1);
2282 BOOL empty_branch;
2283
2284 /* Test for forward reference or uncompleted reference. This is disabled
2285 when called to scan a completed pattern by setting cd->start_workspace to
2286 NULL. */
2287
2288 if (cd->start_workspace != NULL)
2289 {
2290 const pcre_uchar *tcode;
2291 for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2292 if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2293 if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
2294 }
2295
2296 /* If we are scanning a completed pattern, there are no forward references
2297 and all groups are complete. We need to detect whether this is a recursive
2298 call, as otherwise there will be an infinite loop. If it is a recursion,
2299 just skip over it. Simple recursions are easily detected. For mutual
2300 recursions we keep a chain on the stack. */
2301
2302 else
2303 {
2304 recurse_check *r = recurses;
2305 const pcre_uchar *endgroup = scode;
2306
2307 do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2308 if (code >= scode && code <= endgroup) continue; /* Simple recursion */
2309
2310 for (r = recurses; r != NULL; r = r->prev)
2311 if (r->group == scode) break;
2312 if (r != NULL) continue; /* Mutual recursion */
2313 }
2314
2315 /* Completed reference; scan the referenced group, remembering it on the
2316 stack chain to detect mutual recursions. */
2317
2318 empty_branch = FALSE;
2319 this_recurse.prev = recurses;
2320 this_recurse.group = scode;
2321
2322 do
2323 {
2324 if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
2325 {
2326 empty_branch = TRUE;
2327 break;
2328 }
2329 scode += GET(scode, 1);
2330 }
2331 while (*scode == OP_ALT);
2332
2333 if (!empty_branch) return FALSE; /* All branches are non-empty */
2334 continue;
2335 }
2336
2337 /* Groups with zero repeats can of course be empty; skip them. */
2338
2339 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2340 c == OP_BRAPOSZERO)
2341 {
2342 code += PRIV(OP_lengths)[c];
2343 do code += GET(code, 1); while (*code == OP_ALT);
2344 c = *code;
2345 continue;
2346 }
2347
2348 /* A nested group that is already marked as "could be empty" can just be
2349 skipped. */
2350
2351 if (c == OP_SBRA || c == OP_SBRAPOS ||
2352 c == OP_SCBRA || c == OP_SCBRAPOS)
2353 {
2354 do code += GET(code, 1); while (*code == OP_ALT);
2355 c = *code;
2356 continue;
2357 }
2358
2359 /* For other groups, scan the branches. */
2360
2361 if (c == OP_BRA || c == OP_BRAPOS ||
2362 c == OP_CBRA || c == OP_CBRAPOS ||
2363 c == OP_ONCE || c == OP_ONCE_NC ||
2364 c == OP_COND)
2365 {
2366 BOOL empty_branch;
2367 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
2368
2369 /* If a conditional group has only one branch, there is a second, implied,
2370 empty branch, so just skip over the conditional, because it could be empty.
2371 Otherwise, scan the individual branches of the group. */
2372
2373 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2374 code += GET(code, 1);
2375 else
2376 {
2377 empty_branch = FALSE;
2378 do
2379 {
2380 if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd, NULL))
2381 empty_branch = TRUE;
2382 code += GET(code, 1);
2383 }
2384 while (*code == OP_ALT);
2385 if (!empty_branch) return FALSE; /* All branches are non-empty */
2386 }
2387
2388 c = *code;
2389 continue;
2390 }
2391
2392 /* Handle the other opcodes */
2393
2394 switch (c)
2395 {
2396 /* Check for quantifiers after a class. XCLASS is used for classes that
2397 cannot be represented just by a bit map. This includes negated single
2398 high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2399 actual length is stored in the compiled code, so we must update "code"
2400 here. */
2401
2402 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2403 case OP_XCLASS:
2404 ccode = code += GET(code, 1);
2405 goto CHECK_CLASS_REPEAT;
2406 #endif
2407
2408 case OP_CLASS:
2409 case OP_NCLASS:
2410 ccode = code + PRIV(OP_lengths)[OP_CLASS];
2411
2412 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2413 CHECK_CLASS_REPEAT:
2414 #endif
2415
2416 switch (*ccode)
2417 {
2418 case OP_CRSTAR: /* These could be empty; continue */
2419 case OP_CRMINSTAR:
2420 case OP_CRQUERY:
2421 case OP_CRMINQUERY:
2422 break;
2423
2424 default: /* Non-repeat => class must match */
2425 case OP_CRPLUS: /* These repeats aren't empty */
2426 case OP_CRMINPLUS:
2427 return FALSE;
2428
2429 case OP_CRRANGE:
2430 case OP_CRMINRANGE:
2431 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2432 break;
2433 }
2434 break;
2435
2436 /* Opcodes that must match a character */
2437
2438 case OP_ANY:
2439 case OP_ALLANY:
2440 case OP_ANYBYTE:
2441
2442 case OP_PROP:
2443 case OP_NOTPROP:
2444 case OP_ANYNL:
2445
2446 case OP_NOT_HSPACE:
2447 case OP_HSPACE:
2448 case OP_NOT_VSPACE:
2449 case OP_VSPACE:
2450 case OP_EXTUNI:
2451
2452 case OP_NOT_DIGIT:
2453 case OP_DIGIT:
2454 case OP_NOT_WHITESPACE:
2455 case OP_WHITESPACE:
2456 case OP_NOT_WORDCHAR:
2457 case OP_WORDCHAR:
2458
2459 case OP_CHAR:
2460 case OP_CHARI:
2461 case OP_NOT:
2462 case OP_NOTI:
2463
2464 case OP_PLUS:
2465 case OP_PLUSI:
2466 case OP_MINPLUS:
2467 case OP_MINPLUSI:
2468
2469 case OP_NOTPLUS:
2470 case OP_NOTPLUSI:
2471 case OP_NOTMINPLUS:
2472 case OP_NOTMINPLUSI:
2473
2474 case OP_POSPLUS:
2475 case OP_POSPLUSI:
2476 case OP_NOTPOSPLUS:
2477 case OP_NOTPOSPLUSI:
2478
2479 case OP_EXACT:
2480 case OP_EXACTI:
2481 case OP_NOTEXACT:
2482 case OP_NOTEXACTI:
2483
2484 case OP_TYPEPLUS:
2485 case OP_TYPEMINPLUS:
2486 case OP_TYPEPOSPLUS:
2487 case OP_TYPEEXACT:
2488
2489 return FALSE;
2490
2491 /* These are going to continue, as they may be empty, but we have to
2492 fudge the length for the \p and \P cases. */
2493
2494 case OP_TYPESTAR:
2495 case OP_TYPEMINSTAR:
2496 case OP_TYPEPOSSTAR:
2497 case OP_TYPEQUERY:
2498 case OP_TYPEMINQUERY:
2499 case OP_TYPEPOSQUERY:
2500 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2501 break;
2502
2503 /* Same for these */
2504
2505 case OP_TYPEUPTO:
2506 case OP_TYPEMINUPTO:
2507 case OP_TYPEPOSUPTO:
2508 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2509 code += 2;
2510 break;
2511
2512 /* End of branch */
2513
2514 case OP_KET:
2515 case OP_KETRMAX:
2516 case OP_KETRMIN:
2517 case OP_KETRPOS:
2518 case OP_ALT:
2519 return TRUE;
2520
2521 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2522 MINUPTO, and POSUPTO and their caseless and negative versions may be
2523 followed by a multibyte character. */
2524
2525 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2526 case OP_STAR:
2527 case OP_STARI:
2528 case OP_NOTSTAR:
2529 case OP_NOTSTARI:
2530
2531 case OP_MINSTAR:
2532 case OP_MINSTARI:
2533 case OP_NOTMINSTAR:
2534 case OP_NOTMINSTARI:
2535
2536 case OP_POSSTAR:
2537 case OP_POSSTARI:
2538 case OP_NOTPOSSTAR:
2539 case OP_NOTPOSSTARI:
2540
2541 case OP_QUERY:
2542 case OP_QUERYI:
2543 case OP_NOTQUERY:
2544 case OP_NOTQUERYI:
2545
2546 case OP_MINQUERY:
2547 case OP_MINQUERYI:
2548 case OP_NOTMINQUERY:
2549 case OP_NOTMINQUERYI:
2550
2551 case OP_POSQUERY:
2552 case OP_POSQUERYI:
2553 case OP_NOTPOSQUERY:
2554 case OP_NOTPOSQUERYI:
2555
2556 if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2557 break;
2558
2559 case OP_UPTO:
2560 case OP_UPTOI:
2561 case OP_NOTUPTO:
2562 case OP_NOTUPTOI:
2563
2564 case OP_MINUPTO:
2565 case OP_MINUPTOI:
2566 case OP_NOTMINUPTO:
2567 case OP_NOTMINUPTOI:
2568
2569 case OP_POSUPTO:
2570 case OP_POSUPTOI:
2571 case OP_NOTPOSUPTO:
2572 case OP_NOTPOSUPTOI:
2573
2574 if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2575 break;
2576 #endif
2577
2578 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2579 string. */
2580
2581 case OP_MARK:
2582 case OP_PRUNE_ARG:
2583 case OP_SKIP_ARG:
2584 case OP_THEN_ARG:
2585 code += code[1];
2586 break;
2587
2588 /* None of the remaining opcodes are required to match a character. */
2589
2590 default:
2591 break;
2592 }
2593 }
2594
2595 return TRUE;
2596 }
2597
2598
2599
2600 /*************************************************
2601 * Scan compiled regex for non-emptiness *
2602 *************************************************/
2603
2604 /* This function is called to check for left recursive calls. We want to check
2605 the current branch of the current pattern to see if it could match the empty
2606 string. If it could, we must look outwards for branches at other levels,
2607 stopping when we pass beyond the bracket which is the subject of the recursion.
2608 This function is called only during the real compile, not during the
2609 pre-compile.
2610
2611 Arguments:
2612 code points to start of the recursion
2613 endcode points to where to stop (current RECURSE item)
2614 bcptr points to the chain of current (unclosed) branch starts
2615 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2616 cd pointers to tables etc
2617
2618 Returns: TRUE if what is matched could be empty
2619 */
2620
2621 static BOOL
2622 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2623 branch_chain *bcptr, BOOL utf, compile_data *cd)
2624 {
2625 while (bcptr != NULL && bcptr->current_branch >= code)
2626 {
2627 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL))
2628 return FALSE;
2629 bcptr = bcptr->outer;
2630 }
2631 return TRUE;
2632 }
2633
2634
2635
2636 /*************************************************
2637 * Base opcode of repeated opcodes *
2638 *************************************************/
2639
2640 /* Returns the base opcode for repeated single character type opcodes. If the
2641 opcode is not a repeated character type, it returns with the original value.
2642
2643 Arguments: c opcode
2644 Returns: base opcode for the type
2645 */
2646
2647 static pcre_uchar
2648 get_repeat_base(pcre_uchar c)
2649 {
2650 return (c > OP_TYPEPOSUPTO)? c :
2651 (c >= OP_TYPESTAR)? OP_TYPESTAR :
2652 (c >= OP_NOTSTARI)? OP_NOTSTARI :
2653 (c >= OP_NOTSTAR)? OP_NOTSTAR :
2654 (c >= OP_STARI)? OP_STARI :
2655 OP_STAR;
2656 }
2657
2658
2659
2660 #ifdef SUPPORT_UCP
2661 /*************************************************
2662 * Check a character and a property *
2663 *************************************************/
2664
2665 /* This function is called by check_auto_possessive() when a property item
2666 is adjacent to a fixed character.
2667
2668 Arguments:
2669 c the character
2670 ptype the property type
2671 pdata the data for the type
2672 negated TRUE if it's a negated property (\P or \p{^)
2673
2674 Returns: TRUE if auto-possessifying is OK
2675 */
2676
2677 static BOOL
2678 check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
2679 BOOL negated)
2680 {
2681 const pcre_uint32 *p;
2682 const ucd_record *prop = GET_UCD(c);
2683
2684 switch(ptype)
2685 {
2686 case PT_LAMP:
2687 return (prop->chartype == ucp_Lu ||
2688 prop->chartype == ucp_Ll ||
2689 prop->chartype == ucp_Lt) == negated;
2690
2691 case PT_GC:
2692 return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2693
2694 case PT_PC:
2695 return (pdata == prop->chartype) == negated;
2696
2697 case PT_SC:
2698 return (pdata == prop->script) == negated;
2699
2700 /* These are specials */
2701
2702 case PT_ALNUM:
2703 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2704 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2705
2706 /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
2707 means that Perl space and POSIX space are now identical. PCRE was changed
2708 at release 8.34. */
2709
2710 case PT_SPACE: /* Perl space */
2711 case PT_PXSPACE: /* POSIX space */
2712 return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2713 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2714 c == CHAR_FF || c == CHAR_CR)
2715 == negated;
2716
2717 case PT_WORD:
2718 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2719 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2720 c == CHAR_UNDERSCORE) == negated;
2721
2722 case PT_CLIST:
2723 p = PRIV(ucd_caseless_sets) + prop->caseset;
2724 for (;;)
2725 {
2726 if (c < *p) return !negated;
2727 if (c == *p++) return negated;
2728 }
2729 break; /* Control never reaches here */
2730 }
2731
2732 return FALSE;
2733 }
2734 #endif /* SUPPORT_UCP */
2735
2736
2737
2738 /*************************************************
2739 * Fill the character property list *
2740 *************************************************/
2741
2742 /* Checks whether the code points to an opcode that can take part in auto-
2743 possessification, and if so, fills a list with its properties.
2744
2745 Arguments:
2746 code points to start of expression
2747 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2748 fcc points to case-flipping table
2749 list points to output list
2750 list[0] will be filled with the opcode
2751 list[1] will be non-zero if this opcode
2752 can match an empty character string
2753 list[2..7] depends on the opcode
2754
2755 Returns: points to the start of the next opcode if *code is accepted
2756 NULL if *code is not accepted
2757 */
2758
2759 static const pcre_uchar *
2760 get_chr_property_list(const pcre_uchar *code, BOOL utf,
2761 const pcre_uint8 *fcc, pcre_uint32 *list)
2762 {
2763 pcre_uchar c = *code;
2764 const pcre_uchar *end;
2765 const pcre_uint32 *clist_src;
2766 pcre_uint32 *clist_dest;
2767 pcre_uint32 chr;
2768 pcre_uchar base;
2769
2770 list[0] = c;
2771 list[1] = FALSE;
2772 code++;
2773
2774 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
2775 {
2776 base = get_repeat_base(c);
2777 c -= (base - OP_STAR);
2778
2779 if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
2780 code += IMM2_SIZE;
2781
2782 list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
2783
2784 switch(base)
2785 {
2786 case OP_STAR:
2787 list[0] = OP_CHAR;
2788 break;
2789
2790 case OP_STARI:
2791 list[0] = OP_CHARI;
2792 break;
2793
2794 case OP_NOTSTAR:
2795 list[0] = OP_NOT;
2796 break;
2797
2798 case OP_NOTSTARI:
2799 list[0] = OP_NOTI;
2800 break;
2801
2802 case OP_TYPESTAR:
2803 list[0] = *code;
2804 code++;
2805 break;
2806 }
2807 c = list[0];
2808 }
2809
2810 switch(c)
2811 {
2812 case OP_NOT_DIGIT:
2813 case OP_DIGIT:
2814 case OP_NOT_WHITESPACE:
2815 case OP_WHITESPACE:
2816 case OP_NOT_WORDCHAR:
2817 case OP_WORDCHAR:
2818 case OP_ANY:
2819 case OP_ALLANY:
2820 case OP_ANYNL:
2821 case OP_NOT_HSPACE:
2822 case OP_HSPACE:
2823 case OP_NOT_VSPACE:
2824 case OP_VSPACE:
2825 case OP_EXTUNI:
2826 case OP_EODN:
2827 case OP_EOD:
2828 case OP_DOLL:
2829 case OP_DOLLM:
2830 return code;
2831
2832 case OP_CHAR:
2833 case OP_NOT:
2834 GETCHARINCTEST(chr, code);
2835 list[2] = chr;
2836 list[3] = NOTACHAR;
2837 return code;
2838
2839 case OP_CHARI:
2840 case OP_NOTI:
2841 list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
2842 GETCHARINCTEST(chr, code);
2843 list[2] = chr;
2844
2845 #ifdef SUPPORT_UCP
2846 if (chr < 128 || (chr < 256 && !utf))
2847 list[3] = fcc[chr];
2848 else
2849 list[3] = UCD_OTHERCASE(chr);
2850 #elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
2851 list[3] = (chr < 256) ? fcc[chr] : chr;
2852 #else
2853 list[3] = fcc[chr];
2854 #endif
2855
2856 /* The othercase might be the same value. */
2857
2858 if (chr == list[3])
2859 list[3] = NOTACHAR;
2860 else
2861 list[4] = NOTACHAR;
2862 return code;
2863
2864 #ifdef SUPPORT_UCP
2865 case OP_PROP:
2866 case OP_NOTPROP:
2867 if (code[0] != PT_CLIST)
2868 {
2869 list[2] = code[0];
2870 list[3] = code[1];
2871 return code + 2;
2872 }
2873
2874 /* Convert only if we have anough space. */
2875
2876 clist_src = PRIV(ucd_caseless_sets) + code[1];
2877 clist_dest = list + 2;
2878 code += 2;
2879
2880 do {
2881 /* Early return if there is not enough space. */
2882 if (clist_dest >= list + 8)
2883 {
2884 list[2] = code[0];
2885 list[3] = code[1];
2886 return code;
2887 }
2888 *clist_dest++ = *clist_src;
2889 }
2890 while(*clist_src++ != NOTACHAR);
2891
2892 /* Enough space to store all characters. */
2893
2894 list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
2895 return code;
2896 #endif
2897
2898 case OP_NCLASS:
2899 case OP_CLASS:
2900 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2901 case OP_XCLASS:
2902
2903 if (c == OP_XCLASS)
2904 end = code + GET(code, 0);
2905 else
2906 #endif
2907 end = code + 32 / sizeof(pcre_uchar);
2908
2909 switch(*end)
2910 {
2911 case OP_CRSTAR:
2912 case OP_CRMINSTAR:
2913 case OP_CRQUERY:
2914 case OP_CRMINQUERY:
2915 list[1] = TRUE;
2916 end++;
2917 break;
2918
2919 case OP_CRRANGE:
2920 case OP_CRMINRANGE:
2921 list[1] = (GET2(end, 1) == 0);
2922 end += 1 + 2 * IMM2_SIZE;
2923 break;
2924 }
2925 list[2] = end - code;
2926 return end;
2927 }
2928 return NULL; /* Opcode not accepted */
2929 }
2930
2931
2932
2933 /*************************************************
2934 * Scan further character sets for match *
2935 *************************************************/
2936
2937 /* Checks whether the base and the current opcode have a common character, in
2938 which case the base cannot be possessified.
2939
2940 Arguments:
2941 code points to the byte code
2942 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2943 cd static compile data
2944 base_list the data list of the base opcode
2945
2946 Returns: TRUE if the auto-possessification is possible
2947 */
2948
2949 static BOOL
2950 compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
2951 const pcre_uint32* base_list)
2952 {
2953 pcre_uchar c;
2954 pcre_uint32 list[8];
2955 const pcre_uint32* chr_ptr;
2956 const pcre_uint32* ochr_ptr;
2957 const pcre_uint32* list_ptr;
2958 pcre_uint32 chr;
2959
2960 for(;;)
2961 {
2962 c = *code;
2963
2964 /* Skip over callouts */
2965
2966 if (c == OP_CALLOUT)
2967 {
2968 code += PRIV(OP_lengths)[c];
2969 continue;
2970 }
2971
2972 if (c == OP_ALT)
2973 {
2974 do code += GET(code, 1); while (*code == OP_ALT);
2975 c = *code;
2976 }
2977
2978 switch(c)
2979 {
2980 case OP_END:
2981 /* TRUE only in greedy case. The non-greedy case could be replaced by an
2982 OP_EXACT, but it is probably not worth it. (And note that OP_EXACT uses
2983 more memory, which we cannot get at this stage.) */
2984
2985 return base_list[1] != 0;
2986
2987 case OP_KET:
2988 /* If the bracket is capturing, and referenced by an OP_RECURSE, the
2989 non-greedy case cannot be converted to a possessive form. We do not test
2990 the bracket type at the moment, but we might do it in the future to improve
2991 this condition. (But note that recursive calls are always atomic.) */
2992
2993 if (base_list[1] == 0) return FALSE;
2994 code += PRIV(OP_lengths)[c];
2995 continue;
2996 }
2997
2998 /* Check for a supported opcode, and load its properties. */
2999
3000 code = get_chr_property_list(code, utf, cd->fcc, list);
3001 if (code == NULL) return FALSE; /* Unsupported */
3002
3003 /* If either opcode is a small character list, set pointers for comparing
3004 characters from that list with another list, or with a property. */
3005
3006 if (base_list[0] == OP_CHAR)
3007 {
3008 chr_ptr = base_list + 2;
3009 list_ptr = list;
3010 }
3011 else if (list[0] == OP_CHAR)
3012 {
3013 chr_ptr = list + 2;
3014 list_ptr = base_list;
3015 }
3016
3017 /* Some property combinations also acceptable. Unicode property opcodes are
3018 processed specially; the rest can be handled with a lookup table. */
3019
3020 else
3021 {
3022 pcre_uint32 leftop, rightop;
3023
3024 if (list[1] != 0) return FALSE; /* Must match at least one character */
3025 leftop = base_list[0];
3026 rightop = list[0];
3027
3028 #ifdef SUPPORT_UCP
3029 if (leftop == OP_PROP || leftop == OP_NOTPROP)
3030 {
3031 if (rightop == OP_EOD) return TRUE;
3032 if (rightop == OP_PROP || rightop == OP_NOTPROP)
3033 {
3034 int n;
3035 const pcre_uint8 *p;
3036 BOOL same = leftop == rightop;
3037 BOOL lisprop = leftop == OP_PROP;
3038 BOOL risprop = rightop == OP_PROP;
3039 BOOL bothprop = lisprop && risprop;
3040
3041 /* There's a table that specifies how each combination is to be
3042 processed:
3043 0 Always return FALSE (never auto-possessify)
3044 1 Character groups are distinct (possessify if both are OP_PROP)
3045 2 Check character categories in the same group (general or particular)
3046 3 Return TRUE if the two opcodes are not the same
3047 ... see comments below
3048 */
3049
3050 n = propposstab[base_list[2]][list[2]];
3051 switch(n)
3052 {
3053 case 0: return FALSE;
3054 case 1: return bothprop;
3055 case 2: return (base_list[3] == list[3]) != same;
3056 case 3: return !same;
3057
3058 case 4: /* Left general category, right particular category */
3059 return risprop && catposstab[base_list[3]][list[3]] == same;
3060
3061 case 5: /* Right general category, left particular category */
3062 return lisprop && catposstab[list[3]][base_list[3]] == same;
3063
3064 /* This code is logically tricky. Think hard before fiddling with it.
3065 The posspropstab table has four entries per row. Each row relates to
3066 one of PCRE's special properties such as ALNUM or SPACE or WORD.
3067 Only WORD actually needs all four entries, but using repeats for the
3068 others means they can all use the same code below.
3069
3070 The first two entries in each row are Unicode general categories, and
3071 apply always, because all the characters they include are part of the
3072 PCRE character set. The third and fourth entries are a general and a
3073 particular category, respectively, that include one or more relevant
3074 characters. One or the other is used, depending on whether the check
3075 is for a general or a particular category. However, in both cases the
3076 category contains more characters than the specials that are defined
3077 for the property being tested against. Therefore, it cannot be used
3078 in a NOTPROP case.
3079
3080 Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3081 Underscore is covered by ucp_P or ucp_Po. */
3082
3083 case 6: /* Left alphanum vs right general category */
3084 case 7: /* Left space vs right general category */
3085 case 8: /* Left word vs right general category */
3086 p = posspropstab[n-6];
3087 return risprop && lisprop ==
3088 (list[3] != p[0] &&
3089 list[3] != p[1] &&
3090 (list[3] != p[2] || !lisprop));
3091
3092 case 9: /* Right alphanum vs left general category */
3093 case 10: /* Right space vs left general category */
3094 case 11: /* Right word vs left general category */
3095 p = posspropstab[n-9];
3096 return lisprop && risprop ==
3097 (base_list[3] != p[0] &&
3098 base_list[3] != p[1] &&
3099 (base_list[3] != p[2] || !risprop));
3100
3101 case 12: /* Left alphanum vs right particular category */
3102 case 13: /* Left space vs right particular category */
3103 case 14: /* Left word vs right particular category */
3104 p = posspropstab[n-12];
3105 return risprop && lisprop ==
3106 (catposstab[p[0]][list[3]] &&
3107 catposstab[p[1]][list[3]] &&
3108 (list[3] != p[3] || !lisprop));
3109
3110 case 15: /* Right alphanum vs left particular category */
3111 case 16: /* Right space vs left particular category */
3112 case 17: /* Right word vs left particular category */
3113 p = posspropstab[n-15];
3114 return lisprop && risprop ==
3115 (catposstab[p[0]][base_list[3]] &&
3116 catposstab[p[1]][base_list[3]] &&
3117 (base_list[3] != p[3] || !risprop));
3118 }
3119 }
3120 return FALSE;
3121 }
3122
3123 else
3124 #endif /* SUPPORT_UCP */
3125
3126 return leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3127 rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3128 autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3129 }
3130
3131 /* Control reaches here only if one of the items is a small character list.
3132 All characters are checked against the other side. */
3133
3134 do
3135 {
3136 chr = *chr_ptr;
3137
3138 switch(list_ptr[0])
3139 {
3140 case OP_CHAR:
3141 ochr_ptr = list_ptr + 2;
3142 do
3143 {
3144 if (chr == *ochr_ptr) return FALSE;
3145 ochr_ptr++;
3146 }
3147 while(*ochr_ptr != NOTACHAR);
3148 break;
3149
3150 case OP_NOT:
3151 ochr_ptr = list_ptr + 2;
3152 do
3153 {
3154 if (chr == *ochr_ptr)
3155 break;
3156 ochr_ptr++;
3157 }
3158 while(*ochr_ptr != NOTACHAR);
3159 if (*ochr_ptr == NOTACHAR) return FALSE; /* Not found */
3160 break;
3161
3162 /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
3163 set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3164
3165 case OP_DIGIT:
3166 if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
3167 break;
3168
3169 case OP_NOT_DIGIT:
3170 if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
3171 break;
3172
3173 case OP_WHITESPACE:
3174 if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
3175 break;
3176
3177 case OP_NOT_WHITESPACE:
3178 if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
3179 break;
3180
3181 case OP_WORDCHAR:
3182 if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
3183 break;
3184
3185 case OP_NOT_WORDCHAR:
3186 if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
3187 break;
3188
3189 case OP_HSPACE:
3190 switch(chr)
3191 {
3192 HSPACE_CASES: return FALSE;
3193 default: break;
3194 }
3195 break;
3196
3197 case OP_NOT_HSPACE:
3198 switch(chr)
3199 {
3200 HSPACE_CASES: break;
3201 default: return FALSE;
3202 }
3203 break;
3204
3205 case OP_ANYNL:
3206 case OP_VSPACE:
3207 switch(chr)
3208 {
3209 VSPACE_CASES: return FALSE;
3210 default: break;
3211 }
3212 break;
3213
3214 case OP_NOT_VSPACE:
3215 switch(chr)
3216 {
3217 VSPACE_CASES: break;
3218 default: return FALSE;
3219 }
3220 break;
3221
3222 case OP_DOLL:
3223 case OP_EODN:
3224 switch (chr)
3225 {
3226 case CHAR_CR:
3227 case CHAR_LF:
3228 case CHAR_VT:
3229 case CHAR_FF:
3230 case CHAR_NEL:
3231 #ifndef EBCDIC
3232 case 0x2028:
3233 case 0x2029:
3234 #endif /* Not EBCDIC */
3235 return FALSE;
3236 }
3237 break;
3238
3239 case OP_EOD: /* Can always possessify before \z */
3240 break;
3241
3242 case OP_PROP:
3243 case OP_NOTPROP:
3244 if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3245 list_ptr[0] == OP_NOTPROP))
3246 return FALSE;
3247 break;
3248
3249 /* The class comparisons work only when the class is the second item
3250 of the pair, because there are at present no possessive forms of the
3251 class opcodes. Note also that the "code" variable that is used below
3252 points after the second item, and that the pointer for the first item
3253 is not available, so even if there were possessive forms of the class
3254 opcodes, the correct comparison could not be done. */
3255
3256 case OP_NCLASS:
3257 if (chr > 255) return FALSE;
3258 /* Fall through */
3259
3260 case OP_CLASS:
3261 if (list_ptr != list) return FALSE; /* Class is first opcode */
3262 if (chr > 255) break;
3263 if ((((pcre_uint8 *)(code - list_ptr[2] + 1))[chr >> 3] & (1 << (chr & 7))) != 0)
3264 return FALSE;
3265 break;
3266
3267 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3268 case OP_XCLASS:
3269 if (list_ptr != list) return FALSE; /* Class is first opcode */
3270 if (PRIV(xclass)(chr, code - list_ptr[2] + 1 + LINK_SIZE, utf))
3271 return FALSE;
3272 break;
3273 #endif
3274
3275 default:
3276 return FALSE;
3277 }
3278
3279 chr_ptr++;
3280 }
3281 while(*chr_ptr != NOTACHAR);
3282
3283 /* At least one character must be matched from this opcode. */
3284
3285 if (list[1] == 0) return TRUE;
3286 }
3287
3288 return FALSE;
3289 }
3290
3291
3292
3293 /*************************************************
3294 * Scan compiled regex for auto-possession *
3295 *************************************************/
3296
3297 /* Replaces single character iterations with their possessive alternatives
3298 if appropriate. This function modifies the compiled opcode!
3299
3300 Arguments:
3301 code points to start of the byte code
3302 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3303 cd static compile data
3304
3305 Returns: nothing
3306 */
3307
3308 static void
3309 auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
3310 {
3311 register pcre_uchar c;
3312 const pcre_uchar *end;
3313 pcre_uint32 list[8];
3314
3315 for (;;)
3316 {
3317 c = *code;
3318
3319 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
3320 {
3321 c -= get_repeat_base(c) - OP_STAR;
3322 end = (c <= OP_MINUPTO) ?
3323 get_chr_property_list(code, utf, cd->fcc, list) : NULL;
3324 list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3325
3326 if (end != NULL && compare_opcodes(end, utf, cd, list))
3327 {
3328 switch(c)
3329 {
3330 case OP_STAR:
3331 *code += OP_POSSTAR - OP_STAR;
3332 break;
3333
3334 case OP_MINSTAR:
3335 *code += OP_POSSTAR - OP_MINSTAR;
3336 break;
3337
3338 case OP_PLUS:
3339 *code += OP_POSPLUS - OP_PLUS;
3340 break;
3341
3342 case OP_MINPLUS:
3343 *code += OP_POSPLUS - OP_MINPLUS;
3344 break;
3345
3346 case OP_QUERY:
3347 *code += OP_POSQUERY - OP_QUERY;
3348 break;
3349
3350 case OP_MINQUERY:
3351 *code += OP_POSQUERY - OP_MINQUERY;
3352 break;
3353
3354 case OP_UPTO:
3355 *code += OP_POSUPTO - OP_UPTO;
3356 break;
3357
3358 case OP_MINUPTO:
3359 *code += OP_MINUPTO - OP_UPTO;
3360 break;
3361 }
3362 }
3363 c = *code;
3364 }
3365
3366 switch(c)
3367 {
3368 case OP_END:
3369 return;
3370
3371 case OP_TYPESTAR:
3372 case OP_TYPEMINSTAR:
3373 case OP_TYPEPLUS:
3374 case OP_TYPEMINPLUS:
3375 case OP_TYPEQUERY:
3376 case OP_TYPEMINQUERY:
3377 case OP_TYPEPOSSTAR:
3378 case OP_TYPEPOSPLUS:
3379 case OP_TYPEPOSQUERY:
3380 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
3381 break;
3382
3383 case OP_TYPEUPTO:
3384 case OP_TYPEMINUPTO:
3385 case OP_TYPEEXACT:
3386 case OP_TYPEPOSUPTO:
3387 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
3388 code += 2;
3389 break;
3390
3391 case OP_XCLASS:
3392 code += GET(code, 1);
3393 break;
3394
3395 case OP_MARK:
3396 case OP_PRUNE_ARG:
3397 case OP_SKIP_ARG:
3398 case OP_THEN_ARG:
3399 code += code[1];
3400 break;
3401 }
3402
3403 /* Add in the fixed length from the table */
3404
3405 code += PRIV(OP_lengths)[c];
3406
3407 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
3408 a multi-byte character. The length in the table is a minimum, so we have to
3409 arrange to skip the extra bytes. */
3410
3411 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
3412 if (utf) switch(c)
3413 {
3414 case OP_CHAR:
3415 case OP_CHARI:
3416 case OP_NOT:
3417 case OP_NOTI:
3418 case OP_STAR:
3419 case OP_MINSTAR:
3420 case OP_PLUS:
3421 case OP_MINPLUS:
3422 case OP_QUERY:
3423 case OP_MINQUERY:
3424 case OP_UPTO:
3425 case OP_MINUPTO:
3426 case OP_EXACT:
3427 case OP_POSSTAR:
3428 case OP_POSPLUS:
3429 case OP_POSQUERY:
3430 case OP_POSUPTO:
3431 case OP_STARI:
3432 case OP_MINSTARI:
3433 case OP_PLUSI:
3434 case OP_MINPLUSI:
3435 case OP_QUERYI:
3436 case OP_MINQUERYI:
3437 case OP_UPTOI:
3438 case OP_MINUPTOI:
3439 case OP_EXACTI:
3440 case OP_POSSTARI:
3441 case OP_POSPLUSI:
3442 case OP_POSQUERYI:
3443 case OP_POSUPTOI:
3444 case OP_NOTSTAR:
3445 case OP_NOTMINSTAR:
3446 case OP_NOTPLUS:
3447 case OP_NOTMINPLUS:
3448 case OP_NOTQUERY:
3449 case OP_NOTMINQUERY:
3450 case OP_NOTUPTO:
3451 case OP_NOTMINUPTO:
3452 case OP_NOTEXACT:
3453 case OP_NOTPOSSTAR:
3454 case OP_NOTPOSPLUS:
3455 case OP_NOTPOSQUERY:
3456 case OP_NOTPOSUPTO:
3457 case OP_NOTSTARI:
3458 case OP_NOTMINSTARI:
3459 case OP_NOTPLUSI:
3460 case OP_NOTMINPLUSI:
3461 case OP_NOTQUERYI:
3462 case OP_NOTMINQUERYI:
3463 case OP_NOTUPTOI:
3464 case OP_NOTMINUPTOI:
3465 case OP_NOTEXACTI:
3466 case OP_NOTPOSSTARI:
3467 case OP_NOTPOSPLUSI:
3468 case OP_NOTPOSQUERYI:
3469 case OP_NOTPOSUPTOI:
3470 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
3471 break;
3472 }
3473 #else
3474 (void)(utf); /* Keep compiler happy by referencing function argument */
3475 #endif
3476 }
3477 }
3478
3479
3480
3481 /*************************************************
3482 * Check for POSIX class syntax *
3483 *************************************************/
3484
3485 /* This function is called when the sequence "[:" or "[." or "[=" is
3486 encountered in a character class. It checks whether this is followed by a
3487 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
3488 reach an unescaped ']' without the special preceding character, return FALSE.
3489
3490 Originally, this function only recognized a sequence of letters between the
3491 terminators, but it seems that Perl recognizes any sequence of characters,
3492 though of course unknown POSIX names are subsequently rejected. Perl gives an
3493 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
3494 didn't consider this to be a POSIX class. Likewise for [:1234:].
3495
3496 The problem in trying to be exactly like Perl is in the handling of escapes. We
3497 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
3498 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
3499 below handles the special case of \], but does not try to do any other escape
3500 processing. This makes it different from Perl for cases such as [:l\ower:]
3501 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
3502 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
3503 I think.
3504
3505 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
3506 It seems that the appearance of a nested POSIX class supersedes an apparent
3507 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
3508 a digit.
3509
3510 In Perl, unescaped square brackets may also appear as part of class names. For
3511 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
3512 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
3513 seem right at all. PCRE does not allow closing square brackets in POSIX class
3514 names.
3515
3516 Arguments:
3517 ptr pointer to the initial [
3518 endptr where to return the end pointer
3519
3520 Returns: TRUE or FALSE
3521 */
3522
3523 static BOOL
3524 check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
3525 {
3526 pcre_uchar terminator; /* Don't combine these lines; the Solaris cc */
3527 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
3528 for (++ptr; *ptr != CHAR_NULL; ptr++)
3529 {
3530 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3531 ptr++;
3532 else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
3533 else
3534 {
3535 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3536 {
3537 *endptr = ptr;
3538 return TRUE;
3539 }
3540 if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
3541 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3542 ptr[1] == CHAR_EQUALS_SIGN) &&
3543 check_posix_syntax(ptr, endptr))
3544 return FALSE;
3545 }
3546 }
3547 return FALSE;
3548 }
3549
3550
3551
3552
3553 /*************************************************
3554 * Check POSIX class name *
3555 *************************************************/
3556
3557 /* This function is called to check the name given in a POSIX-style class entry
3558 such as [:alnum:].
3559
3560 Arguments:
3561 ptr points to the first letter
3562 len the length of the name
3563
3564 Returns: a value representing the name, or -1 if unknown
3565 */
3566
3567 static int
3568 check_posix_name(const pcre_uchar *ptr, int len)
3569 {
3570 const char *pn = posix_names;
3571 register int yield = 0;
3572 while (posix_name_lengths[yield] != 0)
3573 {
3574 if (len == posix_name_lengths[yield] &&
3575 STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
3576 pn += posix_name_lengths[yield] + 1;
3577 yield++;
3578 }
3579 return -1;
3580 }
3581
3582
3583 /*************************************************
3584 * Adjust OP_RECURSE items in repeated group *
3585 *************************************************/
3586
3587 /* OP_RECURSE items contain an offset from the start of the regex to the group
3588 that is referenced. This means that groups can be replicated for fixed
3589 repetition simply by copying (because the recursion is allowed to refer to
3590 earlier groups that are outside the current group). However, when a group is
3591 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
3592 inserted before it, after it has been compiled. This means that any OP_RECURSE
3593 items within it that refer to the group itself or any contained groups have to
3594 have their offsets adjusted. That one of the jobs of this function. Before it
3595 is called, the partially compiled regex must be temporarily terminated with
3596 OP_END.
3597
3598 This function has been extended with the possibility of forward references for
3599 recursions and subroutine calls. It must also check the list of such references
3600 for the group we are dealing with. If it finds that one of the recursions in
3601 the current group is on this list, it adjusts the offset in the list, not the
3602 value in the reference (which is a group number).
3603
3604 Arguments:
3605 group points to the start of the group
3606 adjust the amount by which the group is to be moved
3607 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3608 cd contains pointers to tables etc.
3609 save_hwm the hwm forward reference pointer at the start of the group
3610
3611 Returns: nothing
3612 */
3613
3614 static void
3615 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
3616 pcre_uchar *save_hwm)
3617 {
3618 pcre_uchar *ptr = group;
3619
3620 while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
3621 {
3622 int offset;
3623 pcre_uchar *hc;
3624
3625 /* See if this recursion is on the forward reference list. If so, adjust the
3626 reference. */
3627
3628 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
3629 {
3630 offset = (int)GET(hc, 0);
3631 if (cd->start_code + offset == ptr + 1)
3632 {
3633 PUT(hc, 0, offset + adjust);
3634 break;
3635 }
3636 }
3637
3638 /* Otherwise, adjust the recursion offset if it's after the start of this
3639 group. */
3640
3641 if (hc >= cd->hwm)
3642 {
3643 offset = (int)GET(ptr, 1);
3644 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
3645 }
3646
3647 ptr += 1 + LINK_SIZE;
3648 }
3649 }
3650
3651
3652
3653 /*************************************************
3654 * Insert an automatic callout point *
3655 *************************************************/
3656
3657 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
3658 callout points before each pattern item.
3659
3660 Arguments:
3661 code current code pointer
3662 ptr current pattern pointer
3663 cd pointers to tables etc
3664
3665 Returns: new code pointer
3666 */
3667
3668 static pcre_uchar *
3669 auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
3670 {
3671 *code++ = OP_CALLOUT;
3672 *code++ = 255;
3673 PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
3674 PUT(code, LINK_SIZE, 0); /* Default length */
3675 return code + 2 * LINK_SIZE;
3676 }
3677
3678
3679
3680 /*************************************************
3681 * Complete a callout item *
3682 *************************************************/
3683
3684 /* A callout item contains the length of the next item in the pattern, which
3685 we can't fill in till after we have reached the relevant point. This is used
3686 for both automatic and manual callouts.
3687
3688 Arguments:
3689 previous_callout points to previous callout item
3690 ptr current pattern pointer
3691 cd pointers to tables etc
3692
3693 Returns: nothing
3694 */
3695
3696 static void
3697 complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
3698 {
3699 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
3700 PUT(previous_callout, 2 + LINK_SIZE, length);
3701 }
3702
3703
3704
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It searches up the characters, looking for ranges of
characters in the "other" case. Each call returns the next one, updating the
start address. A character with multiple other cases is returned on its own
with a special return value.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        -1 when no more
               0 when a range is returned
              >0 the CASESET offset for char with multiple other cases
                in this case, ocptr contains the original
*/

static int
get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
  pcre_uint32 *odptr)
{
pcre_uint32 c, othercase, next;
unsigned int co;

/* Find the first character that has an other case. If it has multiple other
cases, return its case offset value. */

for (c = *cptr; c <= d; c++)
  {
  if ((co = UCD_CASESET(c)) != 0)
    {
    *ocptr = c++;   /* Character that has the set */
    *cptr = c;      /* Rest of input range */
    return (int)co;
    }
  if ((othercase = UCD_OTHERCASE(c)) != c) break;
  }

if (c > d) return -1;  /* Reached end of range */

/* Found a character that has a single other case. Search for the end of the
range, which is either the end of the input range, or a character whose other
case does not continue the sequence, or a character that has multiple other
cases. The last kind must not be absorbed into this range, because only its
single "other case" would then be added; stopping here leaves it to be
returned on its own, with its caseless set, by the next call. */

*ocptr = othercase;
next = othercase + 1;

for (++c; c <= d; c++)
  {
  if (UCD_CASESET(c) != 0 || UCD_OTHERCASE(c) != next) break;
  next++;
  }

*odptr = next - 1;     /* End of othercase range */
*cptr = c;             /* Rest of input range */
return 0;
}
#endif  /* SUPPORT_UCP */
3765
3766
3767
3768 /*************************************************
3769 * Add a character or range to a class *
3770 *************************************************/
3771
3772 /* This function packages up the logic of adding a character or range of
3773 characters to a class. The character values in the arguments will be within the
3774 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
3775 mutually recursive with the function immediately below.
3776
3777 Arguments:
3778 classbits the bit map for characters < 256
3779 uchardptr points to the pointer for extra data
3780 options the options word
3781 cd contains pointers to tables etc.
3782 start start of range character
3783 end end of range character
3784
3785 Returns: the number of < 256 characters added
3786 the pointer to extra data is updated
3787 */
3788
3789 static int
3790 add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
3791 compile_data *cd, pcre_uint32 start, pcre_uint32 end)
3792 {
3793 pcre_uint32 c;
3794 int n8 = 0;
3795
3796 /* If caseless matching is required, scan the range and process alternate
3797 cases. In Unicode, there are 8-bit characters that have alternate cases that
3798 are greater than 255 and vice-versa. Sometimes we can just extend the original
3799 range. */
3800
3801 if ((options & PCRE_CASELESS) != 0)
3802 {
3803 #ifdef SUPPORT_UCP
3804 if ((options & PCRE_UTF8) != 0)
3805 {
3806 int rc;
3807 pcre_uint32 oc, od;
3808
3809 options &= ~PCRE_CASELESS; /* Remove for recursive calls */
3810 c = start;
3811
3812 while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
3813 {
3814 /* Handle a single character that has more than one other case. */
3815
3816 if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
3817 PRIV(ucd_caseless_sets) + rc, oc);
3818
3819 /* Do nothing if the other case range is within the original range. */
3820
3821 else if (oc >= start && od <= end) continue;
3822
3823 /* Extend the original range if there is overlap, noting that if oc < c, we
3824 can't have od > end because a subrange is always shorter than the basic
3825 range. Otherwise, use a recursive call to add the additional range. */
3826
3827 else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
3828 else if (od > end && oc <= end + 1) end = od; /* Extend upwards */
3829 else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
3830 }
3831 }
3832 else
3833 #endif /* SUPPORT_UCP */
3834
3835 /* Not UTF-mode, or no UCP */
3836
3837 for (c = start; c <= end && c < 256; c++)
3838 {
3839 SETBIT(classbits, cd->fcc[c]);
3840 n8++;
3841 }
3842 }
3843
3844 /* Now handle the original range. Adjust the final value according to the bit
3845 length - this means that the same lists of (e.g.) horizontal spaces can be used
3846 in all cases. */
3847
3848 #if defined COMPILE_PCRE8
3849 #ifdef SUPPORT_UTF
3850 if ((options & PCRE_UTF8) == 0)
3851 #endif
3852 if (end > 0xff) end = 0xff;
3853
3854 #elif defined COMPILE_PCRE16
3855 #ifdef SUPPORT_UTF
3856 if ((options & PCRE_UTF16) == 0)
3857 #endif
3858 if (end > 0xffff) end = 0xffff;
3859
3860 #endif /* COMPILE_PCRE[8|16] */
3861
3862 /* If all characters are less than 256, use the bit map. Otherwise use extra
3863 data. */
3864
3865 if (end < 0x100)
3866 {
3867 for (c = start; c <= end; c++)
3868 {
3869 n8++;
3870 SETBIT(classbits, c);
3871 }
3872 }
3873
3874 else
3875 {
3876 pcre_uchar *uchardata = *uchardptr;
3877
3878 #ifdef SUPPORT_UTF
3879 if ((options & PCRE_UTF8) != 0) /* All UTFs use the same flag bit */
3880 {
3881 if (start < end)
3882 {
3883 *uchardata++ = XCL_RANGE;
3884 uchardata += PRIV(ord2utf)(start, uchardata);
3885 uchardata += PRIV(ord2utf)(end, uchardata);
3886 }
3887 else if (start == end)
3888 {
3889 *uchardata++ = XCL_SINGLE;
3890 uchardata += PRIV(ord2utf)(start, uchardata);
3891 }
3892 }
3893 else
3894 #endif /* SUPPORT_UTF */
3895
3896 /* Without UTF support, character values are constrained by the bit length,
3897 and can only be > 256 for 16-bit and 32-bit libraries. */
3898
3899 #ifdef COMPILE_PCRE8
3900 {}
3901 #else
3902 if (start < end)
3903 {
3904 *uchardata++ = XCL_RANGE;
3905 *uchardata++ = start;
3906 *uchardata++ = end;
3907 }
3908 else if (start == end)
3909 {
3910 *uchardata++ = XCL_SINGLE;
3911 *uchardata++ = start;
3912 }
3913 #endif
3914
3915 *uchardptr = uchardata; /* Updata extra data pointer */
3916 }
3917
3918 return n8; /* Number of 8-bit characters */
3919 }
3920
3921
3922
3923
3924 /*************************************************
3925 * Add a list of characters to a class *
3926 *************************************************/
3927
3928 /* This function is used for adding a list of case-equivalent characters to a
3929 class, and also for adding a list of horizontal or vertical whitespace. If the
3930 list is in order (which it should be), ranges of characters are detected and
3931 handled appropriately. This function is mutually recursive with the function
3932 above.
3933
3934 Arguments:
3935 classbits the bit map for characters < 256
3936 uchardptr points to the pointer for extra data
3937 options the options word
3938 cd contains pointers to tables etc.
3939 p points to row of 32-bit values, terminated by NOTACHAR
3940 except character to omit; this is used when adding lists of
3941 case-equivalent characters to avoid including the one we
3942 already know about
3943
3944 Returns: the number of < 256 characters added
3945 the pointer to extra data is updated
3946 */
3947
3948 static int
3949 add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
3950 compile_data *cd, const pcre_uint32 *p, unsigned int except)
3951 {
3952 int n8 = 0;
3953 while (p[0] < NOTACHAR)
3954 {
3955 int n = 0;
3956 if (p[0] != except)
3957 {
3958 while(p[n+1] == p[0] + n + 1) n++;
3959 n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
3960 }
3961 p += n + 1;
3962 }
3963 return n8;
3964 }
3965
3966
3967
3968 /*************************************************
3969 * Add characters not in a list to a class *
3970 *************************************************/
3971
3972 /* This function is used for adding the complement of a list of horizontal or
3973 vertical whitespace to a class. The list must be in order.
3974
3975 Arguments:
3976 classbits the bit map for characters < 256
3977 uchardptr points to the pointer for extra data
3978 options the options word
3979 cd contains pointers to tables etc.
3980 p points to row of 32-bit values, terminated by NOTACHAR
3981
3982 Returns: the number of < 256 characters added
3983 the pointer to extra data is updated
3984 */
3985
3986 static int
3987 add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
3988 int options, compile_data *cd, const pcre_uint32 *p)
3989 {
3990 BOOL utf = (options & PCRE_UTF8) != 0;
3991 int n8 = 0;
3992 if (p[0] > 0)
3993 n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
3994 while (p[0] < NOTACHAR)
3995 {
3996 while (p[1] == p[0] + 1) p++;
3997 n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
3998 (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
3999 p++;
4000 }
4001 return n8;
4002 }
4003
4004
4005
4006 /*************************************************
4007 * Compile one branch *
4008 *************************************************/
4009
4010 /* Scan the pattern, compiling it into a vector. If the options are
4011 changed during the branch, the pointer is used to change the external options
4012 bits. This function is used during the pre-compile phase when we are trying
4013 to find out the amount of memory needed, as well as during the real compile
4014 phase. The value of lengthptr distinguishes the two phases.
4015
4016 Arguments:
4017 optionsptr pointer to the option bits
4018 codeptr points to the pointer to the current code point
4019 ptrptr points to the current pattern pointer
4020 errorcodeptr points to error code variable
4021 firstcharptr place to put the first required character
4022 firstcharflagsptr place to put the first character flags, or a negative number
4023 reqcharptr place to put the last required character
4024 reqcharflagsptr place to put the last required character flags, or a negative number
4025 bcptr points to current branch chain
4026 cond_depth conditional nesting depth
4027 cd contains pointers to tables etc.
4028 lengthptr NULL during the real compile phase
4029 points to length accumulator during pre-compile phase
4030
4031 Returns: TRUE on success
4032 FALSE, with *errorcodeptr set non-zero on error
4033 */
4034
4035 static BOOL
4036 compile_branch(int *optionsptr, pcre_uchar **codeptr,
4037 const pcre_uchar **ptrptr, int *errorcodeptr,
4038 pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
4039 pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
4040 branch_chain *bcptr, int cond_depth,
4041 compile_data *cd, int *lengthptr)
4042 {
4043 int repeat_type, op_type;
4044 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
4045 int bravalue = 0;
4046 int greedy_default, greedy_non_default;
4047 pcre_uint32 firstchar, reqchar;
4048 pcre_int32 firstcharflags, reqcharflags;
4049 pcre_uint32 zeroreqchar, zerofirstchar;
4050 pcre_int32 zeroreqcharflags, zerofirstcharflags;
4051 pcre_int32 req_caseopt, reqvary, tempreqvary;
4052 int options = *optionsptr; /* May change dynamically */
4053 int after_manual_callout = 0;
4054 int length_prevgroup = 0;
4055 register pcre_uint32 c;
4056 int escape;
4057 register pcre_uchar *code = *codeptr;
4058 pcre_uchar *last_code = code;
4059 pcre_uchar *orig_code = code;
4060 pcre_uchar *tempcode;
4061 BOOL inescq = FALSE;
4062 BOOL groupsetfirstchar = FALSE;
4063 const pcre_uchar *ptr = *ptrptr;
4064 const pcre_uchar *tempptr;
4065 const pcre_uchar *nestptr = NULL;
4066 pcre_uchar *previous = NULL;
4067 pcre_uchar *previous_callout = NULL;
4068 pcre_uchar *save_hwm = NULL;
4069 pcre_uint8 classbits[32];
4070
4071 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
4072 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
4073 dynamically as we process the pattern. */
4074
4075 #ifdef SUPPORT_UTF
4076 /* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
4077 BOOL utf = (options & PCRE_UTF8) != 0;
4078 #ifndef COMPILE_PCRE32
4079 pcre_uchar utf_chars[6];
4080 #endif
4081 #else
4082 BOOL utf = FALSE;
4083 #endif
4084
4085 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
4086 class_uchardata always so that it can be passed to add_to_class() always,
4087 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
4088 alternative calls for the different cases. */
4089
4090 pcre_uchar *class_uchardata;
4091 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4092 BOOL xclass;
4093 pcre_uchar *class_uchardata_base;
4094 #endif
4095
4096 #ifdef PCRE_DEBUG
4097 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
4098 #endif
4099
4100 /* Set up the default and non-default settings for greediness */
4101
4102 greedy_default = ((options & PCRE_UNGREEDY) != 0);
4103 greedy_non_default = greedy_default ^ 1;
4104
4105 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
4106 matching encountered yet". It gets changed to REQ_NONE if we hit something that
4107 matches a non-fixed char first char; reqchar just remains unset if we never
4108 find one.
4109
4110 When we hit a repeat whose minimum is zero, we may have to adjust these values
4111 to take the zero repeat into account. This is implemented by setting them to
4112 zerofirstbyte and zeroreqchar when such a repeat is encountered. The individual
4113 item types that can be repeated set these backoff variables appropriately. */
4114
4115 firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
4116 firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
4117
4118 /* The variable req_caseopt contains either the REQ_CASELESS value
4119 or zero, according to the current setting of the caseless flag. The
4120 REQ_CASELESS leaves the lower 28 bit empty. It is added into the
4121 firstchar or reqchar variables to record the case status of the
4122 value. This is used only for ASCII characters. */
4123
4124 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
4125
4126 /* Switch on next character until the end of the branch */
4127
4128 for (;; ptr++)
4129 {
4130 BOOL negate_class;
4131 BOOL should_flip_negation;
4132 BOOL possessive_quantifier;
4133 BOOL is_quantifier;
4134 BOOL is_recurse;
4135 BOOL reset_bracount;
4136 int class_has_8bitchar;
4137 int class_one_char;
4138 int newoptions;
4139 int recno;
4140 int refsign;
4141 int skipbytes;
4142 pcre_uint32 subreqchar, subfirstchar;
4143 pcre_int32 subreqcharflags, subfirstcharflags;
4144 int terminator;
4145 unsigned int mclength;
4146 unsigned int tempbracount;
4147 pcre_uint32 ec;
4148 pcre_uchar mcbuffer[8];
4149
4150 /* Get next character in the pattern */
4151
4152 c = *ptr;
4153
4154 /* If we are at the end of a nested substitution, revert to the outer level
4155 string. Nesting only happens one level deep. */
4156
4157 if (c == CHAR_NULL && nestptr != NULL)
4158 {
4159 ptr = nestptr;
4160 nestptr = NULL;
4161 c = *ptr;
4162 }
4163
4164 /* If we are in the pre-compile phase, accumulate the length used for the
4165 previous cycle of this loop. */
4166
4167 if (lengthptr != NULL)
4168 {
4169 #ifdef PCRE_DEBUG
4170 if (code > cd->hwm) cd->hwm = code; /* High water info */
4171 #endif
4172 if (code > cd->start_workspace + cd->workspace_size -
4173 WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
4174 {
4175 *errorcodeptr = ERR52;
4176 goto FAILED;
4177 }
4178
4179 /* There is at least one situation where code goes backwards: this is the
4180 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
4181 the class is simply eliminated. However, it is created first, so we have to
4182 allow memory for it. Therefore, don't ever reduce the length at this point.
4183 */
4184
4185 if (code < last_code) code = last_code;
4186
4187 /* Paranoid check for integer overflow */
4188
4189 if (OFLOW_MAX - *lengthptr < code - last_code)
4190 {
4191 *errorcodeptr = ERR20;
4192 goto FAILED;
4193 }
4194
4195 *lengthptr += (int)(code - last_code);
4196 DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
4197 (int)(code - last_code), c, c));
4198
4199 /* If "previous" is set and it is not at the start of the work space, move
4200 it back to there, in order to avoid filling up the work space. Otherwise,
4201 if "previous" is NULL, reset the current code pointer to the start. */
4202
4203 if (previous != NULL)
4204 {
4205 if (previous > orig_code)
4206 {
4207 memmove(orig_code, previous, IN_UCHARS(code - previous));
4208 code -= previous - orig_code;
4209 previous = orig_code;
4210 }
4211 }
4212 else code = orig_code;
4213
4214 /* Remember where this code item starts so we can pick up the length
4215 next time round. */
4216
4217 last_code = code;
4218 }
4219
4220 /* In the real compile phase, just check the workspace used by the forward
4221 reference list. */
4222
4223 else if (cd->hwm > cd->start_workspace + cd->workspace_size -
4224 WORK_SIZE_SAFETY_MARGIN)
4225 {
4226 *errorcodeptr = ERR52;
4227 goto FAILED;
4228 }
4229
4230 /* If in \Q...\E, check for the end; if not, we have a literal */
4231
4232 if (inescq && c != CHAR_NULL)
4233 {
4234 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4235 {
4236 inescq = FALSE;
4237 ptr++;
4238 continue;
4239 }
4240 else
4241 {
4242 if (previous_callout != NULL)
4243 {
4244 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
4245 complete_callout(previous_callout, ptr, cd);
4246 previous_callout = NULL;
4247 }
4248 if ((options & PCRE_AUTO_CALLOUT) != 0)
4249 {
4250 previous_callout = code;
4251 code = auto_callout(code, ptr, cd);
4252 }
4253 goto NORMAL_CHAR;
4254 }
4255 }
4256
4257 /* Fill in length of a previous callout, except when the next thing is
4258 a quantifier. */
4259
4260 is_quantifier =
4261 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4262 (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4263
4264 if (!is_quantifier && previous_callout != NULL &&
4265 after_manual_callout-- <= 0)
4266 {
4267 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
4268 complete_callout(previous_callout, ptr, cd);
4269 previous_callout = NULL;
4270 }
4271
4272 /* In extended mode, skip white space and comments. */
4273
4274 if ((options & PCRE_EXTENDED) != 0)
4275 {
4276 if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;
4277 if (c == CHAR_NUMBER_SIGN)
4278 {
4279 ptr++;
4280 while (*ptr != CHAR_NULL)
4281 {
4282 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
4283 ptr++;
4284 #ifdef SUPPORT_UTF
4285 if (utf) FORWARDCHAR(ptr);
4286 #endif
4287 }
4288 if (*ptr != CHAR_NULL) continue;
4289
4290 /* Else fall through to handle end of string */
4291 c = 0;
4292 }
4293 }
4294
4295 /* No auto callout for quantifiers. */
4296
4297 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
4298 {
4299 previous_callout = code;
4300 code = auto_callout(code, ptr, cd);
4301 }
4302
4303 switch(c)
4304 {
4305 /* ===================================================================*/
4306 case 0: /* The branch terminates at string end */
4307 case CHAR_VERTICAL_LINE: /* or | or ) */
4308 case CHAR_RIGHT_PARENTHESIS:
4309 *firstcharptr = firstchar;
4310 *firstcharflagsptr = firstcharflags;
4311 *reqcharptr = reqchar;
4312 *reqcharflagsptr = reqcharflags;
4313 *codeptr = code;
4314 *ptrptr = ptr;
4315 if (lengthptr != NULL)
4316 {
4317 if (OFLOW_MAX - *lengthptr < code - last_code)
4318 {
4319 *errorcodeptr = ERR20;
4320 goto FAILED;
4321 }
4322 *lengthptr += (int)(code - last_code); /* To include callout length */
4323 DPRINTF((">> end branch\n"));
4324 }
4325 return TRUE;
4326
4327
4328 /* ===================================================================*/
4329 /* Handle single-character metacharacters. In multiline mode, ^ disables
4330 the setting of any following char as a first character. */
4331
4332 case CHAR_CIRCUMFLEX_ACCENT:
4333 previous = NULL;
4334 if ((options & PCRE_MULTILINE) != 0)
4335 {
4336 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4337 *code++ = OP_CIRCM;
4338 }
4339 else *code++ = OP_CIRC;
4340 break;
4341
4342 case CHAR_DOLLAR_SIGN:
4343 previous = NULL;
4344 *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
4345 break;
4346
4347 /* There can never be a first char if '.' is first, whatever happens about
4348 repeats. The value of reqchar doesn't change either. */
4349
4350 case CHAR_DOT:
4351 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4352 zerofirstchar = firstchar;
4353 zerofirstcharflags = firstcharflags;
4354 zeroreqchar = reqchar;
4355 zeroreqcharflags = reqcharflags;
4356 previous = code;
4357 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
4358 break;
4359
4360
4361 /* ===================================================================*/
4362 /* Character classes. If the included characters are all < 256, we build a
4363 32-byte bitmap of the permitted characters, except in the special case
4364 where there is only one such character. For negated classes, we build the
4365 map as usual, then invert it at the end. However, we use a different opcode
4366 so that data characters > 255 can be handled correctly.
4367
4368 If the class contains characters outside the 0-255 range, a different
4369 opcode is compiled. It may optionally have a bit map for characters < 256,
4370 but those above are are explicitly listed afterwards. A flag byte tells
4371 whether the bitmap is present, and whether this is a negated class or not.
4372
4373 In JavaScript compatibility mode, an isolated ']' causes an error. In
4374 default (Perl) mode, it is treated as a data character. */
4375
4376 case CHAR_RIGHT_SQUARE_BRACKET:
4377 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4378 {
4379 *errorcodeptr = ERR64;
4380 goto FAILED;
4381 }
4382 goto NORMAL_CHAR;
4383
4384 case CHAR_LEFT_SQUARE_BRACKET:
4385 previous = code;
4386
4387 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
4388 they are encountered at the top level, so we'll do that too. */
4389
4390 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4391 ptr[1] == CHAR_EQUALS_SIGN) &&
4392 check_posix_syntax(ptr, &tempptr))
4393 {
4394 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
4395 goto FAILED;
4396 }
4397
4398 /* If the first character is '^', set the negation flag and skip it. Also,
4399 if the first few characters (either before or after ^) are \Q\E or \E we
4400 skip them too. This makes for compatibility with Perl. */
4401
4402 negate_class = FALSE;
4403 for (;;)
4404 {
4405 c = *(++ptr);
4406 if (c == CHAR_BACKSLASH)
4407 {
4408 if (ptr[1] == CHAR_E)
4409 ptr++;
4410 else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4411 ptr += 3;
4412 else
4413 break;
4414 }
4415 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4416 negate_class = TRUE;
4417 else break;
4418 }
4419
4420 /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
4421 an initial ']' is taken as a data character -- the code below handles
4422 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
4423 [^] must match any character, so generate OP_ALLANY. */
4424
4425 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4426 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4427 {
4428 *code++ = negate_class? OP_ALLANY : OP_FAIL;
4429 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4430 zerofirstchar = firstchar;
4431 zerofirstcharflags = firstcharflags;
4432 break;
4433 }
4434
4435 /* If a class contains a negative special such as \S, we need to flip the
4436 negation flag at the end, so that support for characters > 255 works
4437 correctly (they are all included in the class). */
4438
4439 should_flip_negation = FALSE;
4440
4441 /* For optimization purposes, we track some properties of the class:
4442 class_has_8bitchar will be non-zero if the class contains at least one <
4443 256 character; class_one_char will be 1 if the class contains just one
4444 character. */
4445
4446 class_has_8bitchar = 0;
4447 class_one_char = 0;
4448
4449 /* Initialize the 32-char bit map to all zeros. We build the map in a
4450 temporary bit of memory, in case the class contains fewer than two
4451 8-bit characters because in that case the compiled code doesn't use the bit
4452 map. */
4453
4454 memset(classbits, 0, 32 * sizeof(pcre_uint8));
4455
4456 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4457 xclass = FALSE;
4458 class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
4459 class_uchardata_base = class_uchardata; /* Save the start */
4460 #endif
4461
4462 /* Process characters until ] is reached. By writing this as a "do" it
4463 means that an initial ] is taken as a data character. At the start of the
4464 loop, c contains the first byte of the character. */
4465
4466 if (c != CHAR_NULL) do
4467 {
4468 const pcre_uchar *oldptr;
4469
4470 #ifdef SUPPORT_UTF
4471 if (utf && HAS_EXTRALEN(c))
4472 { /* Braces are required because the */
4473 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
4474 }
4475 #endif
4476
4477 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4478 /* In the pre-compile phase, accumulate the length of any extra
4479 data and reset the pointer. This is so that very large classes that
4480 contain a zillion > 255 characters no longer overwrite the work space
4481 (which is on the stack). We have to remember that there was XCLASS data,
4482 however. */
4483
4484 if (lengthptr != NULL && class_uchardata > class_uchardata_base)
4485 {
4486 xclass = TRUE;
4487 *lengthptr += class_uchardata - class_uchardata_base;
4488 class_uchardata = class_uchardata_base;
4489 }
4490 #endif
4491
4492 /* Inside \Q...\E everything is literal except \E */
4493
4494 if (inescq)
4495 {
4496 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
4497 {
4498 inescq = FALSE; /* Reset literal state */
4499 ptr++; /* Skip the 'E' */
4500 continue; /* Carry on with next */
4501 }
4502 goto CHECK_RANGE; /* Could be range if \E follows */
4503 }
4504
4505 /* Handle POSIX class names. Perl allows a negation extension of the
4506 form [:^name:]. A square bracket that doesn't match the syntax is
4507 treated as a literal. We also recognize the POSIX constructions
4508 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
4509 5.6 and 5.8 do. */
4510
4511 if (c == CHAR_LEFT_SQUARE_BRACKET &&
4512 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4513 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
4514 {
4515 BOOL local_negate = FALSE;
4516 int posix_class, taboffset, tabopt;
4517 register const pcre_uint8 *cbits = cd->cbits;
4518 pcre_uint8 pbits[32];
4519
4520 if (ptr[1] != CHAR_COLON)
4521 {
4522 *errorcodeptr = ERR31;
4523 goto FAILED;
4524 }
4525
4526 ptr += 2;
4527 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
4528 {
4529 local_negate = TRUE;
4530 should_flip_negation = TRUE; /* Note negative special */
4531 ptr++;
4532 }
4533
4534 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
4535 if (posix_class < 0)
4536 {
4537 *errorcodeptr = ERR30;
4538 goto FAILED;
4539 }
4540
4541 /* If matching is caseless, upper and lower are converted to
4542 alpha. This relies on the fact that the class table starts with
4543 alpha, lower, upper as the first 3 entries. */
4544
4545 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
4546 posix_class = 0;
4547
4548 /* When PCRE_UCP is set, some of the POSIX classes are converted to
4549 different escape sequences that use Unicode properties. */
4550
4551 #ifdef SUPPORT_UCP
4552 if ((options & PCRE_UCP) != 0)
4553 {
4554 int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
4555 if (posix_substitutes[pc] != NULL)
4556 {
4557 nestptr = tempptr + 1;
4558 ptr = posix_substitutes[pc] - 1;
4559 continue;
4560 }
4561 }
4562 #endif
4563 /* In the non-UCP case, we build the bit map for the POSIX class in a
4564 chunk of local store because we may be adding and subtracting from it,
4565 and we don't want to subtract bits that may be in the main map already.
4566 At the end we or the result into the bit map that is being built. */
4567
4568 posix_class *= 3;
4569
4570 /* Copy in the first table (always present) */
4571
4572 memcpy(pbits, cbits + posix_class_maps[posix_class],
4573 32 * sizeof(pcre_uint8));
4574
4575 /* If there is a second table, add or remove it as required. */
4576
4577 taboffset = posix_class_maps[posix_class + 1];
4578 tabopt = posix_class_maps[posix_class + 2];
4579
4580 if (taboffset >= 0)
4581 {
4582 if (tabopt >= 0)
4583 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
4584 else
4585 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
4586 }
4587
4588 /* Now see if we need to remove any special characters. An option
4589 value of 1 removes vertical space and 2 removes underscore. */
4590
4591 if (tabopt < 0) tabopt = -tabopt;
4592 if (tabopt == 1) pbits[1] &= ~0x3c;
4593 else if (tabopt == 2) pbits[11] &= 0x7f;
4594
4595 /* Add the POSIX table or its complement into the main table that is
4596 being built and we are done. */
4597
4598 if (local_negate)
4599 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
4600 else
4601 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
4602
4603 ptr = tempptr + 1;
4604 /* Every class contains at least one < 256 character. */
4605 class_has_8bitchar = 1;
4606 /* Every class contains at least two characters. */
4607 class_one_char = 2;
4608 continue; /* End of POSIX syntax handling */
4609 }
4610
4611 /* Backslash may introduce a single character, or it may introduce one
4612 of the specials, which just set a flag. The sequence \b is a special
4613 case. Inside a class (and only there) it is treated as backspace. We
4614 assume that other escapes have more than one character in them, so
4615 speculatively set both class_has_8bitchar and class_one_char bigger
4616 than one. Unrecognized escapes fall through and are either treated
4617 as literal characters (by default), or are faulted if
4618 PCRE_EXTRA is set. */
4619
4620 if (c == CHAR_BACKSLASH)
4621 {
4622 escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
4623 TRUE);
4624 if (*errorcodeptr != 0) goto FAILED;
4625 if (escape == 0) c = ec;
4626 else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
4627 else if (escape == ESC_N) /* \N is not supported in a class */
4628 {
4629 *errorcodeptr = ERR71;
4630 goto FAILED;
4631 }
4632 else if (escape == ESC_Q) /* Handle start of quoted string */
4633 {
4634 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4635 {
4636 ptr += 2; /* avoid empty string */
4637 }
4638 else inescq = TRUE;
4639 continue;
4640 }
4641 else if (escape == ESC_E) continue; /* Ignore orphan \E */
4642
4643 else
4644 {
4645 register const pcre_uint8 *cbits = cd->cbits;
4646 /* Every class contains at least two < 256 characters. */
4647 class_has_8bitchar++;
4648 /* Every class contains at least two characters. */
4649 class_one_char += 2;
4650
4651 switch (escape)
4652 {
4653 #ifdef SUPPORT_UCP
4654 case ESC_du: /* These are the values given for \d etc */
4655 case ESC_DU: /* when PCRE_UCP is set. We replace the */
4656 case ESC_wu: /* escape sequence with an appropriate \p */
4657 case ESC_WU: /* or \P to test Unicode properties instead */
4658 case ESC_su: /* of the default ASCII testing. */
4659 case ESC_SU:
4660 nestptr = ptr;
4661 ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */
4662 class_has_8bitchar--; /* Undo! */
4663 continue;
4664 #endif
4665 case ESC_d:
4666 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
4667 continue;
4668
4669 case ESC_D:
4670 should_flip_negation = TRUE;
4671 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
4672 continue;
4673
4674 case ESC_w:
4675 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
4676 continue;
4677
4678 case ESC_W:
4679 should_flip_negation = TRUE;
4680 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
4681 continue;
4682
4683 /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
4684 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
4685 previously set by something earlier in the character class.
4686 Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
4687 we could just adjust the appropriate bit. From PCRE 8.34 we no
4688 longer treat \s and \S specially. */
4689
4690 case ESC_s:
4691 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
4692 continue;
4693
4694 case ESC_S:
4695 should_flip_negation = TRUE;
4696 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
4697 continue;
4698
4699 /* The rest apply in both UCP and non-UCP cases. */
4700
4701 case ESC_h:
4702 (void)add_list_to_class(classbits, &class_uchardata, options, cd,
4703 PRIV(hspace_list), NOTACHAR);
4704 continue;
4705
4706 case ESC_H:
4707 (void)add_not_list_to_class(classbits, &class_uchardata, options,
4708 cd, PRIV(hspace_list));
4709 continue;
4710
4711 case ESC_v:
4712 (void)add_list_to_class(classbits, &class_uchardata, options, cd,
4713 PRIV(vspace_list), NOTACHAR);
4714 continue;
4715
4716 case ESC_V:
4717 (void)add_not_list_to_class(classbits, &class_uchardata, options,
4718 cd, PRIV(vspace_list));
4719 continue;
4720
4721 #ifdef SUPPORT_UCP
4722 case ESC_p:
4723 case ESC_P:
4724 {
4725 BOOL negated;
4726 unsigned int ptype = 0, pdata = 0;
4727 if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
4728 goto FAILED;
4729 *class_uchardata++ = ((escape == ESC_p) != negated)?
4730 XCL_PROP : XCL_NOTPROP;
4731 *class_uchardata++ = ptype;
4732 *class_uchardata++ = pdata;
4733 class_has_8bitchar--; /* Undo! */
4734 continue;
4735 }
4736 #endif
4737 /* Unrecognized escapes are faulted if PCRE is running in its
4738 strict mode. By default, for compatibility with Perl, they are
4739 treated as literals. */
4740
4741 default:
4742 if ((options & PCRE_EXTRA) != 0)
4743 {
4744 *errorcodeptr = ERR7;
4745 goto FAILED;
4746 }
4747 class_has_8bitchar--; /* Undo the speculative increase. */
4748 class_one_char -= 2; /* Undo the speculative increase. */
4749 c = *ptr; /* Get the final character and fall through */
4750 break;
4751 }
4752 }
4753
4754 /* Fall through if the escape just defined a single character (c >= 0).
4755 This may be greater than 256. */
4756
4757 escape = 0;
4758
4759 } /* End of backslash handling */
4760
4761 /* A character may be followed by '-' to form a range. However, Perl does
4762 not permit ']' to be the end of the range. A '-' character at the end is
4763 treated as a literal. Perl ignores orphaned \E sequences entirely. The
4764 code for handling \Q and \E is messy. */
4765
4766 CHECK_RANGE:
4767 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4768 {
4769 inescq = FALSE;
4770 ptr += 2;
4771 }
4772 oldptr = ptr;
4773
4774 /* Remember if \r or \n were explicitly used */
4775
4776 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4777
4778 /* Check for range */
4779
4780 if (!inescq && ptr[1] == CHAR_MINUS)
4781 {
4782 pcre_uint32 d;
4783 ptr += 2;
4784 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
4785
4786 /* If we hit \Q (not followed by \E) at this point, go into escaped
4787 mode. */
4788
4789 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
4790 {
4791 ptr += 2;
4792 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4793 { ptr += 2; continue; }
4794 inescq = TRUE;
4795 break;
4796 }
4797
4798 /* Minus (hyphen) at the end of a class is treated as a literal, so put
4799 back the pointer and jump to handle the character that preceded it. */
4800
4801 if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
4802 {
4803 ptr = oldptr;
4804 goto CLASS_SINGLE_CHARACTER;
4805 }
4806
4807 /* Otherwise, we have a potential range; pick up the next character */
4808
4809 #ifdef SUPPORT_UTF
4810 if (utf)
4811 { /* Braces are required because the */
4812 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
4813 }
4814 else
4815 #endif
4816 d = *ptr; /* Not UTF-8 mode */
4817
4818 /* The second part of a range can be a single-character escape, but
4819 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
4820 in such circumstances. */
4821
4822 if (!inescq && d == CHAR_BACKSLASH)
4823 {
4824 int descape;
4825 descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
4826 if (*errorcodeptr != 0) goto FAILED;
4827
4828 /* \b is backspace; any other special means the '-' was literal. */
4829
4830 if (descape != 0)
4831 {
4832 if (descape == ESC_b) d = CHAR_BS; else
4833 {
4834 ptr = oldptr;
4835 goto CLASS_SINGLE_CHARACTER; /* A few lines below */
4836 }
4837 }
4838 }
4839
4840 /* Check that the two values are in the correct order. Optimize
4841 one-character ranges. */
4842
4843 if (d < c)
4844 {
4845 *errorcodeptr = ERR8;
4846 goto FAILED;
4847 }
4848 if (d == c) goto CLASS_SINGLE_CHARACTER; /* A few lines below */
4849
4850 /* We have found a character range, so single character optimizations
4851 cannot be done anymore. Any value greater than 1 indicates that there
4852 is more than one character. */
4853
4854 class_one_char = 2;
4855
4856 /* Remember an explicit \r or \n, and add the range to the class. */
4857
4858 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4859
4860 class_has_8bitchar +=
4861 add_to_class(classbits, &class_uchardata, options, cd, c, d);
4862
4863 continue; /* Go get the next char in the class */
4864 }
4865
4866 /* Handle a single character - we can get here for a normal non-escape
4867 char, or after \ that introduces a single character or for an apparent
4868 range that isn't. Only the value 1 matters for class_one_char, so don't
4869 increase it if it is already 2 or more ... just in case there's a class
4870 with a zillion characters in it. */
4871
4872 CLASS_SINGLE_CHARACTER:
4873 if (class_one_char < 2) class_one_char++;
4874
4875 /* If class_one_char is 1, we have the first single character in the
4876 class, and there have been no prior ranges, or XCLASS items generated by
4877 escapes. If this is the final character in the class, we can optimize by
4878 turning the item into a 1-character OP_CHAR[I] if it's positive, or
4879 OP_NOT[I] if it's negative. In the positive case, it can cause firstchar
4880 to be set. Otherwise, there can be no first char if this item is first,
4881 whatever repeat count may follow. In the case of reqchar, save the
4882 previous value for reinstating. */
4883
4884 if (class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
4885 {
4886 ptr++;
4887 zeroreqchar = reqchar;
4888 zeroreqcharflags = reqcharflags;
4889
4890 if (negate_class)
4891 {
4892 #ifdef SUPPORT_UCP
4893 int d;
4894 #endif
4895 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4896 zerofirstchar = firstchar;
4897 zerofirstcharflags = firstcharflags;
4898
4899 /* For caseless UTF-8 mode when UCP support is available, check
4900 whether this character has more than one other case. If so, generate
4901 a special OP_NOTPROP item instead of OP_NOTI. */
4902
4903 #ifdef SUPPORT_UCP
4904 if (utf && (options & PCRE_CASELESS) != 0 &&
4905 (d = UCD_CASESET(c)) != 0)
4906 {
4907 *code++ = OP_NOTPROP;
4908 *code++ = PT_CLIST;
4909 *code++ = d;
4910 }
4911 else
4912 #endif
4913 /* Char has only one other case, or UCP not available */
4914
4915 {
4916 *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4917 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
4918 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4919 code += PRIV(ord2utf)(c, code);
4920 else
4921 #endif
4922 *code++ = c;
4923 }
4924
4925 /* We are finished with this character class */
4926
4927 goto END_CLASS;
4928 }
4929
4930 /* For a single, positive character, get the value into mcbuffer, and
4931 then we can handle this with the normal one-character code. */
4932
4933 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
4934 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4935 mclength = PRIV(ord2utf)(c, mcbuffer);
4936 else
4937 #endif
4938 {
4939 mcbuffer[0] = c;
4940 mclength = 1;
4941 }
4942 goto ONE_CHAR;
4943 } /* End of 1-char optimization */
4944
4945 /* There is more than one character in the class, or an XCLASS item
4946 has been generated. Add this character to the class. */
4947
4948 class_has_8bitchar +=
4949 add_to_class(classbits, &class_uchardata, options, cd, c, c);
4950 }
4951
4952 /* Loop until ']' reached. This "while" is the end of the "do" far above.
4953 If we are at the end of an internal nested string, revert to the outer
4954 string. */
4955
4956 while (((c = *(++ptr)) != CHAR_NULL ||
4957 (nestptr != NULL &&
4958 (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
4959 (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
4960
4961 /* Check for missing terminating ']' */
4962
4963 if (c == CHAR_NULL)
4964 {
4965 *errorcodeptr = ERR6;
4966 goto FAILED;
4967 }
4968
4969 /* We will need an XCLASS if data has been placed in class_uchardata. In
4970 the second phase this is a sufficient test. However, in the pre-compile
4971 phase, class_uchardata gets emptied to prevent workspace overflow, so
4972 only if the very last character in the class needs XCLASS will it contain
4973 anything at this point. For this reason, xclass gets set TRUE above when
4974 class_uchardata is emptied, and that's why this code is the way it is here
4975 instead of just doing a test on class_uchardata below. */
4976
4977 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4978 if (class_uchardata > class_uchardata_base) xclass = TRUE;
4979 #endif
4980
4981 /* If this is the first thing in the branch, there can be no first char
4982 setting, whatever the repeat count. Any reqchar setting must remain
4983 unchanged after any kind of repeat. */
4984
4985 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4986 zerofirstchar = firstchar;
4987 zerofirstcharflags = firstcharflags;
4988 zeroreqchar = reqchar;
4989 zeroreqcharflags = reqcharflags;
4990
4991 /* If there are characters with values > 255, we have to compile an
4992 extended class, with its own opcode, unless there was a negated special
4993 such as \S in the class, and PCRE_UCP is not set, because in that case all
4994 characters > 255 are in the class, so any that were explicitly given as
4995 well can be ignored. If (when there are explicit characters > 255 that must
4996 be listed) there are no characters < 256, we can omit the bitmap in the
4997 actual compiled code. */
4998
4999 #ifdef SUPPORT_UTF
5000 if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
5001 #elif !defined COMPILE_PCRE8
5002 if (xclass && !should_flip_negation)
5003 #endif
5004 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5005 {
5006 *class_uchardata++ = XCL_END; /* Marks the end of extra data */
5007 *code++ = OP_XCLASS;
5008 code += LINK_SIZE;
5009 *code = negate_class? XCL_NOT:0;
5010
5011 /* If the map is required, move up the extra data to make room for it;
5012 otherwise just move the code pointer to the end of the extra data. */
5013
5014 if (class_has_8bitchar > 0)
5015 {
5016 *code++ |= XCL_MAP;
5017 memmove(code + (32 / sizeof(pcre_uchar)), code,
5018 IN_UCHARS(class_uchardata - code));
5019 memcpy(code, classbits, 32);
5020 code = class_uchardata + (32 / sizeof(pcre_uchar));
5021 }
5022 else code = class_uchardata;
5023
5024 /* Now fill in the complete length of the item */
5025
5026 PUT(previous, 1, (int)(code - previous));
5027 break; /* End of class handling */
5028 }
5029 #endif
5030
5031 /* If there are no characters > 255, or they are all to be included or
5032 excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5033 whole class was negated and whether there were negative specials such as \S
5034 (non-UCP) in the class. Then copy the 32-byte map into the code vector,
5035 negating it if necessary. */
5036
5037 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5038 if (lengthptr == NULL) /* Save time in the pre-compile phase */
5039 {
5040 if (negate_class)
5041 for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5042 memcpy(code, classbits, 32);
5043 }
5044 code += 32 / sizeof(pcre_uchar);
5045
5046 END_CLASS:
5047 break;
5048
5049
5050 /* ===================================================================*/
5051 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
5052 has been tested above. */
5053
5054 case CHAR_LEFT_CURLY_BRACKET:
5055 if (!is_quantifier) goto NORMAL_CHAR;
5056 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
5057 if (*errorcodeptr != 0) goto FAILED;
5058 goto REPEAT;
5059
5060 case CHAR_ASTERISK:
5061 repeat_min = 0;
5062 repeat_max = -1;
5063 goto REPEAT;
5064
5065 case CHAR_PLUS:
5066 repeat_min = 1;
5067 repeat_max = -1;
5068 goto REPEAT;
5069
5070 case CHAR_QUESTION_MARK:
5071 repeat_min = 0;
5072 repeat_max = 1;
5073
5074 REPEAT:
5075 if (previous == NULL)
5076 {
5077 *errorcodeptr = ERR9;
5078 goto FAILED;
5079 }
5080
5081 if (repeat_min == 0)
5082 {
5083 firstchar = zerofirstchar; /* Adjust for zero repeat */
5084 firstcharflags = zerofirstcharflags;
5085 reqchar = zeroreqchar; /* Ditto */
5086 reqcharflags = zeroreqcharflags;
5087 }
5088
5089 /* Remember whether this is a variable length repeat */
5090
5091 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
5092
5093 op_type = 0; /* Default single-char op codes */
5094 possessive_quantifier = FALSE; /* Default not possessive quantifier */
5095
5096 /* Save start of previous item, in case we have to move it up in order to
5097 insert something before it. */
5098
5099 tempcode = previous;
5100
5101 /* If the next character is '+', we have a possessive quantifier. This
5102 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
5103 If the next character is '?' this is a minimizing repeat, by default,
5104 but if PCRE_UNGREEDY is set, it works the other way round. We change the
5105 repeat type to the non-default. */
5106
5107 if (ptr[1] == CHAR_PLUS)
5108 {
5109 repeat_type = 0; /* Force greedy */
5110 possessive_quantifier = TRUE;
5111 ptr++;
5112 }
5113 else if (ptr[1] == CHAR_QUESTION_MARK)
5114 {
5115 repeat_type = greedy_non_default;
5116 ptr++;
5117 }
5118 else repeat_type = greedy_default;
5119
5120 /* If previous was a recursion call, wrap it in atomic brackets so that
5121 previous becomes the atomic group. All recursions were so wrapped in the
5122 past, but it no longer happens for non-repeated recursions. In fact, the
5123 repeated ones could be re-implemented independently so as not to need this,
5124 but for the moment we rely on the code for repeating groups. */
5125
5126 if (*previous == OP_RECURSE)
5127 {
5128 memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
5129 *previous = OP_ONCE;
5130 PUT(previous, 1, 2 + 2*LINK_SIZE);
5131 previous[2 + 2*LINK_SIZE] = OP_KET;
5132 PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
5133 code += 2 + 2 * LINK_SIZE;
5134 length_prevgroup = 3 + 3*LINK_SIZE;
5135
5136 /* When actually compiling, we need to check whether this was a forward
5137 reference, and if so, adjust the offset. */
5138
5139 if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
5140 {
5141 int offset = GET(cd->hwm, -LINK_SIZE);
5142 if (offset == previous + 1 - cd->start_code)
5143 PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
5144 }
5145 }
5146
5147 /* Now handle repetition for the different types of item. */
5148
5149 /* If previous was a character or negated character match, abolish the item
5150 and generate a repeat item instead. If a char item has a minimum of more
5151 than one, ensure that it is set in reqchar - it might not be if a sequence
5152 such as x{3} is the first thing in a branch because the x will have gone
5153 into firstchar instead. */
5154
5155 if (*previous == OP_CHAR || *previous == OP_CHARI
5156 || *previous == OP_NOT || *previous == OP_NOTI)
5157 {
5158 switch (*previous)
5159 {
5160 default: /* Make compiler happy. */
5161 case OP_CHAR: op_type = OP_STAR - OP_STAR; break;
5162 case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
5163 case OP_NOT: op_type = OP_NOTSTAR - OP_STAR; break;
5164 case OP_NOTI: op_type = OP_NOTSTARI - OP_STAR; break;
5165 }
5166
5167 /* Deal with UTF characters that take up more than one character. It's
5168 easier to write this out separately than try to macrify it. Use c to
5169 hold the length of the character in bytes, plus UTF_LENGTH to flag that
5170 it's a length rather than a small character. */
5171
5172 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5173 if (utf && NOT_FIRSTCHAR(code[-1]))
5174 {
5175 pcre_uchar *lastchar = code - 1;
5176 BACKCHAR(lastchar);
5177 c = (int)(code - lastchar); /* Length of UTF-8 character */
5178 memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
5179 c |= UTF_LENGTH; /* Flag c as a length */
5180 }
5181 else
5182 #endif /* SUPPORT_UTF */
5183
5184 /* Handle the case of a single character - either with no UTF support, or
5185 with UTF disabled, or for a single character UTF character. */
5186 {
5187 c = code[-1];
5188 if (*previous <= OP_CHARI && repeat_min > 1)
5189 {
5190 reqchar = c;
5191 reqcharflags = req_caseopt | cd->req_varyopt;
5192 }
5193 }
5194
5195 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
5196 }
5197
5198 /* If previous was a character type match (\d or similar), abolish it and
5199 create a suitable repeat item. The code is shared with single-character
5200 repeats by setting op_type to add a suitable offset into repeat_type. Note
5201 that the Unicode property types will be present only when SUPPORT_UCP is
5202 defined, but we don't wrap the little bits of code here because it just
5203 makes it horribly messy. */
5204
5205 else if (*previous < OP_EODN)
5206 {
5207 pcre_uchar *oldcode;
5208 int prop_type, prop_value;
5209 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
5210 c = *previous;
5211
5212 OUTPUT_SINGLE_REPEAT:
5213 if (*previous == OP_PROP || *previous == OP_NOTPROP)
5214 {
5215 prop_type = previous[1];
5216 prop_value = previous[2];
5217 }
5218 else prop_type = prop_value = -1;
5219
5220 oldcode = code;
5221 code = previous; /* Usually overwrite previous item */
5222
5223 /* If the maximum is zero then the minimum must also be zero; Perl allows
5224 this case, so we do too - by simply omitting the item altogether. */
5225
5226 if (repeat_max == 0) goto END_REPEAT;
5227
5228 /* Combine the op_type with the repeat_type */
5229
5230 repeat_type += op_type;
5231
5232 /* A minimum of zero is handled either as the special case * or ?, or as
5233 an UPTO, with the maximum given. */
5234
5235 if (repeat_min == 0)
5236 {
5237 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
5238 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
5239 else
5240 {
5241 *code++ = OP_UPTO + repeat_type;
5242 PUT2INC(code, 0, repeat_max);
5243 }
5244 }
5245
5246 /* A repeat minimum of 1 is optimized into some special cases. If the
5247 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
5248 left in place and, if the maximum is greater than 1, we use OP_UPTO with
5249 one less than the maximum. */
5250
5251 else if (repeat_min == 1)
5252 {
5253 if (repeat_max == -1)
5254 *code++ = OP_PLUS + repeat_type;
5255 else
5256 {
5257 code = oldcode; /* leave previous item in place */
5258 if (repeat_max == 1) goto END_REPEAT;
5259 *code++ = OP_UPTO + repeat_type;
5260 PUT2INC(code, 0, repeat_max - 1);
5261 }
5262 }
5263
5264 /* The case {n,n} is just an EXACT, while the general case {n,m} is
5265 handled as an EXACT followed by an UPTO. */
5266
5267 else
5268 {
5269 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
5270 PUT2INC(code, 0, repeat_min);
5271
5272 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
5273 we have to insert the character for the previous code. For a repeated
5274 Unicode property match, there are two extra bytes that define the
5275 required property. In UTF-8 mode, long characters have their length in
5276 c, with the UTF_LENGTH bit as a flag. */
5277
5278 if (repeat_max < 0)
5279 {
5280 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5281 if (utf && (c & UTF_LENGTH) != 0)
5282 {
5283 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5284 code += c & 7;
5285 }
5286 else
5287 #endif
5288 {
5289 *code++ = c;
5290 if (prop_type >= 0)
5291 {
5292 *code++ = prop_type;
5293 *code++ = prop_value;
5294 }
5295 }
5296 *code++ = OP_STAR + repeat_type;
5297 }
5298
5299 /* Else insert an UPTO if the max is greater than the min, again
5300 preceded by the character, for the previously inserted code. If the
5301 UPTO is just for 1 instance, we can use QUERY instead. */
5302
5303 else if (repeat_max != repeat_min)
5304 {
5305 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5306 if (utf && (c & UTF_LENGTH) != 0)
5307 {
5308 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5309 code += c & 7;
5310 }
5311 else
5312 #endif
5313 *code++ = c;
5314 if (prop_type >= 0)
5315 {
5316 *code++ = prop_type;
5317 *code++ = prop_value;
5318 }
5319 repeat_max -= repeat_min;
5320
5321 if (repeat_max == 1)
5322 {
5323 *code++ = OP_QUERY + repeat_type;
5324 }
5325 else
5326 {
5327 *code++ = OP_UPTO + repeat_type;
5328 PUT2INC(code, 0, repeat_max);
5329 }
5330 }
5331 }
5332
5333 /* The character or character type itself comes last in all cases. */
5334
5335 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5336 if (utf && (c & UTF_LENGTH) != 0)
5337 {
5338 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5339 code += c & 7;
5340 }
5341 else
5342 #endif
5343 *code++ = c;
5344
5345 /* For a repeated Unicode property match, there are two extra bytes that
5346 define the required property. */
5347
5348 #ifdef SUPPORT_UCP
5349 if (prop_type >= 0)
5350 {
5351 *code++ = prop_type;
5352 *code++ = prop_value;
5353 }
5354 #endif
5355 }
5356
5357 /* If previous was a character class or a back reference, we put the repeat
5358 stuff after it, but just skip the item if the repeat was {0,0}. */
5359
5360 else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
5361 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5362 *previous == OP_XCLASS ||
5363 #endif
5364 *previous == OP_REF || *previous == OP_REFI ||
5365 *previous == OP_DNREF || *previous == OP_DNREFI)
5366 {
5367 if (repeat_max == 0)
5368 {
5369 code = previous;
5370 goto END_REPEAT;
5371 }
5372
5373 if (repeat_min == 0 && repeat_max == -1)
5374 *code++ = OP_CRSTAR + repeat_type;
5375 else if (repeat_min == 1 && repeat_max == -1)
5376 *code++ = OP_CRPLUS + repeat_type;
5377 else if (repeat_min == 0 && repeat_max == 1)
5378 *code++ = OP_CRQUERY + repeat_type;
5379 else
5380 {
5381 *code++ = OP_CRRANGE + repeat_type;
5382 PUT2INC(code, 0, repeat_min);
5383 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
5384 PUT2INC(code, 0, repeat_max);
5385 }
5386 }
5387
5388 /* If previous was a bracket group, we may have to replicate it in certain
5389 cases. Note that at this point we can encounter only the "basic" bracket
5390 opcodes such as BRA and CBRA, as this is the place where they get converted
5391 into the more special varieties such as BRAPOS and SBRA. A test for >=
5392 OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5393 ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow
5394 repetition of assertions, but now it does, for Perl compatibility. */
5395
5396 else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5397 {
5398 register int i;
5399 int len = (int)(code - previous);
5400 pcre_uchar *bralink = NULL;
5401 pcre_uchar *brazeroptr = NULL;
5402
5403 /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
5404 we just ignore the repeat. */
5405
5406 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
5407 goto END_REPEAT;
5408
5409 /* There is no sense in actually repeating assertions. The only potential
5410 use of repetition is in cases when the assertion is optional. Therefore,
5411 if the minimum is greater than zero, just ignore the repeat. If the
5412 maximum is not zero or one, set it to 1. */
5413
5414 if (*previous < OP_ONCE) /* Assertion */
5415 {
5416 if (repeat_min > 0) goto END_REPEAT;
5417 if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
5418 }
5419
5420 /* The case of a zero minimum is special because of the need to stick
5421 OP_BRAZERO in front of it, and because the group appears once in the
5422 data, whereas in other cases it appears the minimum number of times. For
5423 this reason, it is simplest to treat this case separately, as otherwise
5424 the code gets far too messy. There are several special subcases when the
5425 minimum is zero. */
5426
5427 if (repeat_min == 0)
5428 {
5429 /* If the maximum is also zero, we used to just omit the group from the
5430 output altogether, like this:
5431
5432 ** if (repeat_max == 0)
5433 ** {
5434 ** code = previous;
5435 ** goto END_REPEAT;
5436 ** }
5437
5438 However, that fails when a group or a subgroup within it is referenced
5439 as a subroutine from elsewhere in the pattern, so now we stick in
5440 OP_SKIPZERO in front of it so that it is skipped on execution. As we
5441 don't have a list of which groups are referenced, we cannot do this
5442 selectively.
5443
5444 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
5445 and do no more at this point. However, we do need to adjust any
5446 OP_RECURSE calls inside the group that refer to the group itself or any
5447 internal or forward referenced group, because the offset is from the
5448 start of the whole regex. Temporarily terminate the pattern while doing
5449 this. */
5450
5451 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
5452 {
5453 *code = OP_END;
5454 adjust_recurse(previous, 1, utf, cd, save_hwm);
5455 memmove(previous + 1, previous, IN_UCHARS(len));
5456 code++;
5457 if (repeat_max == 0)
5458 {
5459 *previous++ = OP_SKIPZERO;
5460 goto END_REPEAT;
5461 }
5462 brazeroptr = previous; /* Save for possessive optimizing */
5463 *previous++ = OP_BRAZERO + repeat_type;
5464 }
5465
5466 /* If the maximum is greater than 1 and limited, we have to replicate
5467 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
5468 The first one has to be handled carefully because it's the original
5469 copy, which has to be moved up. The remainder can be handled by code
5470 that is common with the non-zero minimum case below. We have to
5471 adjust the value of repeat_max, since one less copy is required. Once
5472 again, we may have to adjust any OP_RECURSE calls inside the group. */
5473
5474 else
5475 {
5476 int offset;
5477 *code = OP_END;
5478 adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);
5479 memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
5480 code += 2 + LINK_SIZE;
5481 *previous++ = OP_BRAZERO + repeat_type;
5482 *previous++ = OP_BRA;
5483
5484 /* We chain together the bracket offset fields that have to be
5485 filled in later when the ends of the brackets are reached. */
5486
5487 offset = (bralink == NULL)? 0 : (int)(previous - bralink);
5488 bralink = previous;
5489 PUTINC(previous, 0, offset);
5490 }
5491
5492 repeat_max--;
5493 }
5494
5495 /* If the minimum is greater than zero, replicate the group as many
5496 times as necessary, and adjust the maximum to the number of subsequent
5497 copies that we need. If we set a first char from the group, and didn't
5498 set a required char, copy the latter from the former. If there are any
5499 forward reference subroutine calls in the group, there will be entries on
5500 the workspace list; replicate these with an appropriate increment. */
5501
5502 else
5503 {
5504 if (repeat_min > 1)
5505 {
5506 /* In the pre-compile phase, we don't actually do the replication. We
5507 just adjust the length as if we had. Do some paranoid checks for
5508 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
5509 integer type when available, otherwise double. */
5510
5511 if (lengthptr != NULL)
5512 {
5513 int delta = (repeat_min - 1)*length_prevgroup;
5514 if ((INT64_OR_DOUBLE)(repeat_min - 1)*
5515 (INT64_OR_DOUBLE)length_prevgroup >
5516 (INT64_OR_DOUBLE)INT_MAX ||
5517 OFLOW_MAX - *lengthptr < delta)
5518 {
5519 *errorcodeptr = ERR20;
5520 goto FAILED;
5521 }
5522 *lengthptr += delta;
5523 }
5524
5525 /* This is compiling for real. If there is a set first byte for
5526 the group, and we have not yet set a "required byte", set it. Make
5527 sure there is enough workspace for copying forward references before
5528 doing the copy. */
5529
5530 else
5531 {
5532 if (groupsetfirstchar && reqcharflags < 0)
5533 {
5534 reqchar = firstchar;
5535 reqcharflags = firstcharflags;
5536 }
5537
5538 for (i = 1; i < repeat_min; i++)
5539 {
5540 pcre_uchar *hc;
5541 pcre_uchar *this_hwm = cd->hwm;
5542 memcpy(code, previous, IN_UCHARS(len));
5543
5544 while (cd->hwm > cd->start_workspace + cd->workspace_size -
5545 WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5546 {
5547 int save_offset = save_hwm - cd->start_workspace;
5548 int this_offset = this_hwm - cd->start_workspace;
5549 *errorcodeptr = expand_workspace(cd);
5550 if (*errorcodeptr != 0) goto FAILED;
5551 save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5552 this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5553 }
5554
5555 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5556 {
5557 PUT(cd->hwm, 0, GET(hc, 0) + len);
5558 cd->hwm += LINK_SIZE;
5559 }
5560 save_hwm = this_hwm;
5561 code += len;
5562 }
5563 }
5564 }
5565
5566 if (repeat_max > 0) repeat_max -= repeat_min;
5567 }
5568
5569 /* This code is common to both the zero and non-zero minimum cases. If
5570 the maximum is limited, it replicates the group in a nested fashion,
5571 remembering the bracket starts on a stack. In the case of a zero minimum,
5572 the first one was set up above. In all cases the repeat_max now specifies
5573 the number of additional copies needed. Again, we must remember to
5574 replicate entries on the forward reference list. */
5575
5576 if (repeat_max >= 0)
5577 {
5578 /* In the pre-compile phase, we don't actually do the replication. We
5579 just adjust the length as if we had. For each repetition we must add 1
5580 to the length for BRAZERO and for all but the last repetition we must
5581 add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
5582 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
5583 a 64-bit integer type when available, otherwise double. */
5584
5585 if (lengthptr != NULL && repeat_max > 0)
5586 {
5587 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
5588 2 - 2*LINK_SIZE; /* Last one doesn't nest */
5589 if ((INT64_OR_DOUBLE)repeat_max *
5590 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
5591 > (INT64_OR_DOUBLE)INT_MAX ||
5592 OFLOW_MAX - *lengthptr < delta)
5593 {
5594 *errorcodeptr = ERR20;
5595 goto FAILED;
5596 }
5597 *lengthptr += delta;
5598 }
5599
5600 /* This is compiling for real */
5601
5602 else for (i = repeat_max - 1; i >= 0; i--)
5603 {
5604 pcre_uchar *hc;
5605 pcre_uchar *this_hwm = cd->hwm;
5606
5607 *code++ = OP_BRAZERO + repeat_type;
5608
5609 /* All but the final copy start a new nesting, maintaining the
5610 chain of brackets outstanding. */
5611
5612 if (i != 0)
5613 {
5614 int offset;
5615 *code++ = OP_BRA;
5616 offset = (bralink == NULL)? 0 : (int)(code - bralink);
5617 bralink = code;
5618 PUTINC(code, 0, offset);
5619 }
5620
5621 memcpy(code, previous, IN_UCHARS(len));
5622
5623 /* Ensure there is enough workspace for forward references before
5624 copying them. */
5625
5626 while (cd->hwm > cd->start_workspace + cd->workspace_size -
5627 WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5628 {
5629 int save_offset = save_hwm - cd->start_workspace;
5630 int this_offset = this_hwm - cd->start_workspace;
5631 *errorcodeptr = expand_workspace(cd);
5632 if (*errorcodeptr != 0) goto FAILED;
5633 save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5634 this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5635 }
5636
5637 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5638 {
5639 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
5640 cd->hwm += LINK_SIZE;
5641 }
5642 save_hwm = this_hwm;
5643 code += len;
5644 }
5645
5646 /* Now chain through the pending brackets, and fill in their length
5647 fields (which are holding the chain links pro tem). */
5648
5649 while (bralink != NULL)
5650 {
5651 int oldlinkoffset;
5652 int offset = (int)(code - bralink + 1);
5653 pcre_uchar *bra = code - offset;
5654 oldlinkoffset = GET(bra, 1);
5655 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
5656 *code++ = OP_KET;
5657 PUTINC(code, 0, offset);
5658 PUT(bra, 1, offset);
5659 }
5660 }
5661
5662 /* If the maximum is unlimited, set a repeater in the final copy. For
5663 ONCE brackets, that's all we need to do. However, possessively repeated
5664 ONCE brackets can be converted into non-capturing brackets, as the
5665 behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
5666 deal with possessive ONCEs specially.
5667
5668 Otherwise, when we are doing the actual compile phase, check to see
5669 whether this group is one that could match an empty string. If so,
5670 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
5671 that runtime checking can be done. [This check is also applied to ONCE
5672 groups at runtime, but in a different way.]
5673
5674 Then, if the quantifier was possessive and the bracket is not a
5675 conditional, we convert the BRA code to the POS form, and the KET code to
5676 KETRPOS. (It turns out to be convenient at runtime to detect this kind of
5677 subpattern at both the start and at the end.) The use of special opcodes
5678 makes it possible to reduce greatly the stack usage in pcre_exec(). If
5679 the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
5680
5681 Then, if the minimum number of matches is 1 or 0, cancel the possessive
5682 flag so that the default action below, of wrapping everything inside
5683 atomic brackets, does not happen. When the minimum is greater than 1,
5684 there will be earlier copies of the group, and so we still have to wrap
5685 the whole thing. */
5686
5687 else
5688 {
5689 pcre_uchar *ketcode = code - 1 - LINK_SIZE;
5690 pcre_uchar *bracode = ketcode - GET(ketcode, 1);
5691
5692 /* Convert possessive ONCE brackets to non-capturing */
5693
5694 if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
5695 possessive_quantifier) *bracode = OP_BRA;
5696
5697 /* For non-possessive ONCE brackets, all we need to do is to
5698 set the KET. */
5699
5700 if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
5701 *ketcode = OP_KETRMAX + repeat_type;
5702
5703 /* Handle non-ONCE brackets and possessive ONCEs (which have been
5704 converted to non-capturing above). */
5705
5706 else
5707 {
5708 /* In the compile phase, check for empty string matching. */
5709
5710 if (lengthptr == NULL)
5711 {
5712 pcre_uchar *scode = bracode;
5713 do
5714 {
5715 if (could_be_empty_branch(scode, ketcode, utf, cd, NULL))
5716 {
5717 *bracode += OP_SBRA - OP_BRA;
5718 break;
5719 }
5720 scode += GET(scode, 1);
5721 }
5722 while (*scode == OP_ALT);
5723 }
5724
5725 /* Handle possessive quantifiers. */
5726
5727 if (possessive_quantifier)
5728 {
5729 /* For COND brackets, we wrap the whole thing in a possessively
5730 repeated non-capturing bracket, because we have not invented POS
5731 versions of the COND opcodes. Because we are moving code along, we
5732 must ensure that any pending recursive references are updated. */
5733
5734 if (*bracode == OP_COND || *bracode == OP_SCOND)
5735 {
5736 int nlen = (int)(code - bracode);
5737 *code = OP_END;
5738 adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm);
5739 memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
5740 code += 1 + LINK_SIZE;
5741 nlen += 1 + LINK_SIZE;
5742 *bracode = OP_BRAPOS;
5743 *code++ = OP_KETRPOS;
5744 PUTINC(code, 0, nlen);
5745 PUT(bracode, 1, nlen);
5746 }
5747
5748 /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
5749
5750 else
5751 {
5752 *bracode += 1; /* Switch to xxxPOS opcodes */
5753 *ketcode = OP_KETRPOS;
5754 }
5755
5756 /* If the minimum is zero, mark it as possessive, then unset the
5757 possessive flag when the minimum is 0 or 1. */
5758
5759 if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
5760 if (repeat_min < 2) possessive_quantifier = FALSE;
5761 }
5762
5763 /* Non-possessive quantifier */
5764
5765 else *ketcode = OP_KETRMAX + repeat_type;
5766 }
5767 }
5768 }
5769
5770 /* If previous is OP_FAIL, it was generated by an empty class [] in
5771 JavaScript mode. The other ways in which OP_FAIL can be generated, that is
5772 by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
5773 error above. We can just ignore the repeat in JS case. */
5774
5775 else if (*previous == OP_FAIL) goto END_REPEAT;
5776
5777 /* Else there's some kind of shambles */
5778
5779 else
5780 {
5781 *errorcodeptr = ERR11;
5782 goto FAILED;
5783 }
5784
5785 /* If the character following a repeat is '+', or if certain optimization
5786 tests above succeeded, possessive_quantifier is TRUE. For some opcodes,
5787 there are special alternative opcodes for this case. For anything else, we
5788 wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'
5789 notation is just syntactic sugar, taken from Sun's Java package, but the
5790 special opcodes can optimize it.
5791
5792 Some (but not all) possessively repeated subpatterns have already been
5793 completely handled in the code just above. For them, possessive_quantifier
5794 is always FALSE at this stage.
5795
5796 Note that the repeated item starts at tempcode, not at previous, which
5797 might be the first part of a string whose (former) last char we repeated.
5798
5799 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
5800 an 'upto' may follow. We skip over an 'exact' item, and then test the
5801 length of what remains before proceeding. */
5802
5803 if (possessive_quantifier)
5804 {
5805 int len;
5806
5807 if (*tempcode == OP_TYPEEXACT)
5808 tempcode += PRIV(OP_lengths)[*tempcode] +
5809 ((tempcode[1 + IMM2_SIZE] == OP_PROP
5810 || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
5811
5812 else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
5813 {
5814 tempcode += PRIV(OP_lengths)[*tempcode];
5815 #ifdef SUPPORT_UTF
5816 if (utf && HAS_EXTRALEN(tempcode[-1]))
5817 tempcode += GET_EXTRALEN(tempcode[-1]);
5818 #endif
5819 }
5820
5821 len = (int)(code - tempcode);
5822 if (len > 0) switch (*tempcode)
5823 {
5824 case OP_STAR: *tempcode = OP_POSSTAR; break;
5825 case OP_PLUS: *tempcode = OP_POSPLUS; break;
5826 case OP_QUERY: *tempcode = OP_POSQUERY; break;
5827 case OP_UPTO: *tempcode = OP_POSUPTO; break;
5828
5829 case OP_STARI: *tempcode = OP_POSSTARI; break;
5830 case OP_PLUSI: *tempcode = OP_POSPLUSI; break;
5831 case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
5832 case OP_UPTOI: *tempcode = OP_POSUPTOI; break;
5833
5834 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
5835 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
5836 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
5837 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
5838
5839 case OP_NOTSTARI: *tempcode = OP_NOTPOSSTARI; break;
5840 case OP_NOTPLUSI: *tempcode = OP_NOTPOSPLUSI; break;
5841 case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
5842 case OP_NOTUPTOI: *tempcode = OP_NOTPOSUPTOI; break;
5843
5844 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
5845 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
5846 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
5847 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
5848
5849 /* Because we are moving code along, we must ensure that any
5850 pending recursive references are updated. */
5851
5852 default:
5853 *code = OP_END;
5854 adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
5855 memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
5856 code += 1 + LINK_SIZE;
5857 len += 1 + LINK_SIZE;
5858 tempcode[0] = OP_ONCE;
5859 *code++ = OP_KET;
5860 PUTINC(code, 0, len);
5861 PUT(tempcode, 1, len);
5862 break;
5863 }
5864 }
5865
5866 /* In all cases we no longer have a previous item. We also set the
5867 "follows varying string" flag for subsequently encountered reqchars if
5868 it isn't already set and we have just passed a varying length item. */
5869
5870 END_REPEAT:
5871 previous = NULL;
5872 cd->req_varyopt |= reqvary;
5873 break;
5874
5875
5876 /* ===================================================================*/
5877 /* Start of nested parenthesized sub-expression, or comment or lookahead or
5878 lookbehind or option setting or condition or all the other extended
5879 parenthesis forms. */
5880
5881 case CHAR_LEFT_PARENTHESIS:
5882 newoptions = options;
5883 skipbytes = 0;
5884 bravalue = OP_CBRA;
5885 save_hwm = cd->hwm;
5886 reset_bracount = FALSE;
5887
5888 /* First deal with various "verbs" that can be introduced by '*'. */
5889
5890 ptr++;
5891 if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
5892 || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
5893 {
5894 int i, namelen;
5895 int arglen = 0;
5896 const char *vn = verbnames;
5897 const pcre_uchar *name = ptr + 1;
5898 const pcre_uchar *arg = NULL;
5899 previous = NULL;
5900 ptr++;
5901 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
5902 namelen = (int)(ptr - name);
5903
5904 /* It appears that Perl allows any characters whatsoever, other than
5905 a closing parenthesis, to appear in arguments, so we no longer insist on
5906 letters, digits, and underscores. */
5907
5908 if (*ptr == CHAR_COLON)
5909 {
5910 arg = ++ptr;
5911 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
5912 arglen = (int)(ptr - arg);
5913 if ((unsigned int)arglen > MAX_MARK)
5914 {
5915 *errorcodeptr = ERR75;
5916 goto FAILED;
5917 }
5918 }
5919
5920 if (*ptr != CHAR_RIGHT_PARENTHESIS)
5921 {
5922 *errorcodeptr = ERR60;
5923 goto FAILED;
5924 }
5925
5926 /* Scan the table of verb names */
5927
5928 for (i = 0; i < verbcount; i++)
5929 {
5930 if (namelen == verbs[i].len &&
5931 STRNCMP_UC_C8(name, vn, namelen) == 0)
5932 {
5933 int setverb;
5934
5935 /* Check for open captures before ACCEPT and convert it to
5936 ASSERT_ACCEPT if in an assertion. */
5937
5938 if (verbs[i].op == OP_ACCEPT)
5939 {
5940 open_capitem *oc;
5941 if (arglen != 0)
5942 {
5943 *errorcodeptr = ERR59;
5944 goto FAILED;
5945 }
5946 cd->had_accept = TRUE;
5947 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
5948 {
5949 *code++ = OP_CLOSE;
5950 PUT2INC(code, 0, oc->number);
5951 }
5952 setverb = *code++ =
5953 (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
5954
5955 /* Do not set firstchar after *ACCEPT */
5956 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5957 }
5958
5959 /* Handle other cases with/without an argument */
5960
5961 else if (arglen == 0)
5962 {
5963 if (verbs[i].op < 0) /* Argument is mandatory */
5964 {
5965 *errorcodeptr = ERR66;
5966 goto FAILED;
5967 }
5968 setverb = *code++ = verbs[i].op;
5969 }
5970
5971 else
5972 {
5973 if (verbs[i].op_arg < 0) /* Argument is forbidden */
5974 {
5975 *errorcodeptr = ERR59;
5976 goto FAILED;
5977 }
5978 setverb = *code++ = verbs[i].op_arg;
5979 *code++ = arglen;
5980 memcpy(code, arg, IN_UCHARS(arglen));
5981 code += arglen;
5982 *code++ = 0;
5983 }
5984
5985 switch (setverb)
5986 {
5987 case OP_THEN:
5988 case OP_THEN_ARG:
5989 cd->external_flags |= PCRE_HASTHEN;
5990 break;
5991
5992 case OP_PRUNE:
5993 case OP_PRUNE_ARG:
5994 case OP_SKIP:
5995 case OP_SKIP_ARG:
5996 cd->had_pruneorskip = TRUE;
5997 break;
5998 }
5999
6000 break; /* Found verb, exit loop */
6001 }
6002
6003 vn += verbs[i].len + 1;
6004 }
6005
6006 if (i < verbcount) continue; /* Successfully handled a verb */
6007 *errorcodeptr = ERR60; /* Verb not recognized */
6008 goto FAILED;
6009 }
6010
6011 /* Deal with the extended parentheses; all are introduced by '?', and the
6012 appearance of any of them means that this is not a capturing group. */
6013
6014 else if (*ptr == CHAR_QUESTION_MARK)
6015 {
6016 int i, set, unset, namelen;
6017 int *optset;
6018 const pcre_uchar *name;
6019 pcre_uchar *slot;
6020
6021 switch (*(++ptr))
6022 {
6023 case CHAR_NUMBER_SIGN: /* Comment; skip to ket */
6024 ptr++;
6025 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6026 if (*ptr == CHAR_NULL)
6027 {
6028 *errorcodeptr = ERR18;
6029 goto FAILED;
6030 }
6031 continue;
6032
6033
6034 /* ------------------------------------------------------------ */
6035 case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
6036 reset_bracount = TRUE;
6037 /* Fall through */
6038
6039 /* ------------------------------------------------------------ */
6040 case CHAR_COLON: /* Non-capturing bracket */
6041 bravalue = OP_BRA;
6042 ptr++;
6043 break;
6044
6045
6046 /* ------------------------------------------------------------ */
6047 case CHAR_LEFT_PARENTHESIS:
6048 bravalue = OP_COND; /* Conditional group */
6049 tempptr = ptr;
6050
6051 /* A condition can be an assertion, a number (referring to a numbered
6052 group), a name (referring to a named group), or 'R', referring to
6053 recursion. R<digits> and R&name are also permitted for recursion tests.
6054
6055 There are several syntaxes for testing a named group: (?(name)) is used
6056 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
6057
6058 There are two unfortunate ambiguities, caused by history. (a) 'R' can
6059 be the recursive thing or the name 'R' (and similarly for 'R' followed
6060 by digits), and (b) a number could be a name that consists of digits.
6061 In both cases, we look for a name first; if not found, we try the other
6062 cases.
6063
6064 For compatibility with auto-callouts, we allow a callout to be
6065 specified before a condition that is an assertion. First, check for the
6066 syntax of a callout; if found, adjust the temporary pointer that is
6067 used to check for an assertion condition. That's all that is needed! */
6068
6069 if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
6070 {
6071 for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
6072 if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
6073 tempptr += i + 1;
6074 }
6075
6076 /* For conditions that are assertions, check the syntax, and then exit
6077 the switch. This will take control down to where bracketed groups,
6078 including assertions, are processed. */
6079
6080 if (tempptr[1] == CHAR_QUESTION_MARK &&
6081 (tempptr[2] == CHAR_EQUALS_SIGN ||
6082 tempptr[2] == CHAR_EXCLAMATION_MARK ||
6083 tempptr[2] == CHAR_LESS_THAN_SIGN))
6084 break;
6085
6086 /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6087 need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6088
6089 code[1+LINK_SIZE] = OP_CREF;
6090 skipbytes = 1+IMM2_SIZE;
6091 refsign = -1;
6092
6093 /* Check for a test for recursion in a named group. */
6094
6095 if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
6096 {
6097 terminator = -1;
6098 ptr += 2;
6099 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
6100 }
6101
6102 /* Check for a test for a named group's having been set, using the Perl
6103 syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6104 syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). As names may
6105 consist entirely of digits, there is scope for ambiguity. */
6106
6107 else if (ptr[1] == CHAR_LESS_THAN_SIGN)
6108 {
6109 terminator = CHAR_GREATER_THAN_SIGN;
6110 ptr++;
6111 }
6112 else if (ptr[1] == CHAR_APOSTROPHE)
6113 {
6114 terminator = CHAR_APOSTROPHE;
6115 ptr++;
6116 }
6117 else
6118 {
6119 terminator = CHAR_NULL;
6120 if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
6121 }
6122
6123 /* When a name is one of a number of duplicates, a different opcode is
6124 used and it needs more memory. Unfortunately we cannot tell whether a
6125 name is a duplicate in the first pass, so we have to allow for more
6126 memory except when we know it is a relative numerical reference. */
6127
6128 if (refsign < 0 && lengthptr != NULL) *lengthptr += IMM2_SIZE;
6129
6130 /* We now expect to read a name (possibly all digits); anything else
6131 is an error. In the case of all digits, also get it as a number. */
6132
6133 if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)
6134 {
6135 ptr += 1; /* To get the right offset */
6136 *errorcodeptr = ERR28;
6137 goto FAILED;
6138 }
6139
6140 recno = 0;
6141 name = ++ptr;
6142 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
6143 {
6144 if (recno >= 0)
6145 recno = (IS_DIGIT(*ptr))? recno * 10 + (int)(*ptr - CHAR_0) : -1;
6146 ptr++;
6147 }
6148 namelen = (int)(ptr - name);
6149
6150 /* Check the terminator */
6151
6152 if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6153 *ptr++ != CHAR_RIGHT_PARENTHESIS)
6154 {
6155 ptr--; /* Error offset */
6156 *errorcodeptr = ERR26;
6157 goto FAILED;
6158 }
6159
6160 /* Do no further checking in the pre-compile phase. */
6161
6162 if (lengthptr != NULL) break;
6163
6164 /* In the real compile we do the work of looking for the actual
6165 reference. If the string started with "+" or "-" we require the rest to
6166 be digits, in which case recno will be set. */
6167
6168 if (refsign > 0)
6169 {
6170 if (recno <= 0)
6171 {
6172 *errorcodeptr = ERR58;
6173 goto FAILED;
6174 }
6175 recno = (refsign == CHAR_MINUS)?
6176 cd->bracount - recno + 1 : recno +cd->bracount;
6177 if (recno <= 0 || recno > cd->final_bracount)
6178 {
6179 *errorcodeptr = ERR15;
6180 goto FAILED;
6181 }
6182 PUT2(code, 2+LINK_SIZE, recno);
6183 break;
6184 }
6185
6186 /* Otherwise (did not start with "+" or "-"), start by looking for the
6187 name. */
6188
6189 slot = cd->name_table;
6190 for (i = 0; i < cd->names_found; i++)
6191 {
6192 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
6193 slot += cd->name_entry_size;
6194 }
6195
6196 /* Found the named subpattern. If the name is duplicated, add one to
6197 the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6198 appropriate data values. Otherwise, just insert the unique subpattern
6199 number. */
6200
6201 if (i < cd->names_found)
6202 {
6203 int offset = i++;
6204 int count = 1;
6205 recno = GET2(slot, 0); /* Number from first found */
6206 for (; i < cd->names_found; i++)
6207 {
6208 slot += cd->name_entry_size;
6209 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0) break;
6210 count++;
6211 }
6212 if (count > 1)
6213 {
6214 PUT2(code, 2+LINK_SIZE, offset);
6215 PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6216 skipbytes += IMM2_SIZE;
6217 code[1+LINK_SIZE]++;
6218 }
6219 else /* Not a duplicated name */
6220 {
6221 PUT2(code, 2+LINK_SIZE, recno);
6222 }
6223 }
6224
6225 /* If terminator == CHAR_NULL it means that the name followed directly
6226 after the opening parenthesis [e.g. (?(abc)...] and in this case there
6227 are some further alternatives to try. For the cases where terminator !=
6228 0 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
6229 now checked all the possibilities, so give an error. */
6230
6231 else if (terminator != CHAR_NULL)
6232 {
6233 *errorcodeptr = ERR15;
6234 goto FAILED;
6235 }
6236
6237 /* Check for (?(R) for recursion. Allow digits after R to specify a
6238 specific group number. */
6239
6240 else if (*name == CHAR_R)
6241 {
6242 recno = 0;
6243 for (i = 1; i < namelen; i++)
6244 {
6245 if (!IS_DIGIT(name[i]))
6246 {
6247 *errorcodeptr = ERR15;
6248 goto FAILED;
6249 }
6250 recno = recno * 10 + name[i] - CHAR_0;
6251 }
6252 if (recno == 0) recno = RREF_ANY;
6253 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
6254 PUT2(code, 2+LINK_SIZE, recno);
6255 }
6256
6257 /* Similarly, check for the (?(DEFINE) "condition", which is always
6258 false. */
6259
6260 else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
6261 {
6262 code[1+LINK_SIZE] = OP_DEF;
6263 skipbytes = 1;
6264 }
6265
6266 /* Check for the "name" actually being a subpattern number. We are
6267 in the second pass here, so final_bracount is set. */
6268
6269 else if (recno > 0 && recno <= cd->final_bracount)
6270 {
6271 PUT2(code, 2+LINK_SIZE, recno);
6272 }
6273
6274 /* Either an unidentified subpattern, or a reference to (?(0) */
6275
6276 else
6277 {
6278 *errorcodeptr = (recno == 0)? ERR35: ERR15;
6279 goto FAILED;
6280 }
6281 break;
6282
6283
6284 /* ------------------------------------------------------------ */
6285 case CHAR_EQUALS_SIGN: /* Positive lookahead */
6286 bravalue = OP_ASSERT;
6287 cd->assert_depth += 1;
6288 ptr++;
6289 break;
6290
6291
6292 /* ------------------------------------------------------------ */
6293 case CHAR_EXCLAMATION_MARK: /* Negative lookahead */
6294 ptr++;
6295 if (*ptr == CHAR_RIGHT_PARENTHESIS) /* Optimize (?!) */
6296 {
6297 *code++ = OP_FAIL;
6298 previous = NULL;
6299 continue;
6300 }
6301 bravalue = OP_ASSERT_NOT;
6302 cd->assert_depth += 1;
6303 break;
6304
6305
6306 /* ------------------------------------------------------------ */
6307 case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */
6308 switch (ptr[1])
6309 {
6310 case CHAR_EQUALS_SIGN: /* Positive lookbehind */
6311 bravalue = OP_ASSERTBACK;
6312 cd->assert_depth += 1;
6313 ptr += 2;
6314 break;
6315
6316 case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */
6317 bravalue = OP_ASSERTBACK_NOT;
6318 cd->assert_depth += 1;
6319 ptr += 2;
6320 break;
6321
6322 default: /* Could be name define, else bad */
6323 if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
6324 goto DEFINE_NAME;
6325 ptr++; /* Correct offset for error */
6326 *errorcodeptr = ERR24;
6327 goto FAILED;
6328 }
6329 break;
6330
6331
6332 /* ------------------------------------------------------------ */
6333 case CHAR_GREATER_THAN_SIGN: /* One-time brackets */
6334 bravalue = OP_ONCE;
6335 ptr++;
6336 break;
6337
6338
6339 /* ------------------------------------------------------------ */
6340 case CHAR_C: /* Callout - may be followed by digits; */
6341 previous_callout = code; /* Save for later completion */
6342 after_manual_callout = 1; /* Skip one item before completing */
6343 *code++ = OP_CALLOUT;
6344 {
6345 int n = 0;
6346 ptr++;
6347 while(IS_DIGIT(*ptr))
6348 n = n * 10 + *ptr++ - CHAR_0;
6349 if (*ptr != CHAR_RIGHT_PARENTHESIS)
6350 {
6351 *errorcodeptr = ERR39;
6352 goto FAILED;
6353 }
6354 if (n > 255)
6355 {
6356 *errorcodeptr = ERR38;
6357 goto FAILED;
6358 }
6359 *code++ = n;
6360 PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
6361 PUT(code, LINK_SIZE, 0); /* Default length */
6362 code += 2 * LINK_SIZE;
6363 }
6364 previous = NULL;
6365 continue;
6366
6367
6368 /* ------------------------------------------------------------ */
6369 case CHAR_P: /* Python-style named subpattern handling */
6370 if (*(++ptr) == CHAR_EQUALS_SIGN ||
6371 *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */
6372 {
6373 is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
6374 terminator = CHAR_RIGHT_PARENTHESIS;
6375 goto NAMED_REF_OR_RECURSE;
6376 }
6377 else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */
6378 {
6379 *errorcodeptr = ERR41;
6380 goto FAILED;
6381 }
6382 /* Fall through to handle (?P< as (?< is handled */
6383
6384
6385 /* ------------------------------------------------------------ */
6386 DEFINE_NAME: /* Come here from (?< handling */
6387 case CHAR_APOSTROPHE:
6388 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
6389 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6390 name = ++ptr;
6391
6392 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6393 namelen = (int)(ptr - name);
6394
6395 /* In the pre-compile phase, do a syntax check, remember the longest
6396 name, and then remember the group in a vector, expanding it if
6397 necessary. Duplicates for the same number are skipped; other duplicates
6398 are checked for validity. In the actual compile, there is nothing to
6399 do. */
6400
6401 if (lengthptr != NULL)
6402 {
6403 named_group *ng;
6404 pcre_uint32 number = cd->bracount + 1;
6405
6406 if (*ptr != (pcre_uchar)terminator)
6407 {
6408 *errorcodeptr = ERR42;
6409 goto FAILED;
6410 }
6411
6412 if (cd->names_found >= MAX_NAME_COUNT)
6413 {
6414 *errorcodeptr = ERR49;
6415 goto FAILED;
6416 }
6417
6418 if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
6419 {
6420 cd->name_entry_size = namelen + IMM2_SIZE + 1;
6421 if (namelen > MAX_NAME_SIZE)
6422 {
6423 *errorcodeptr = ERR48;
6424 goto FAILED;
6425 }
6426 }
6427
6428 /* Scan the list to check for duplicates. For duplicate names, if the
6429 number is the same, break the loop, which causes the name to be
6430 discarded; otherwise, if DUPNAMES is not set, give an error.
6431 If it is set, allow the name with a different number, but continue
6432 scanning in case this is a duplicate with the same number. For
6433 non-duplicate names, give an error if the number is duplicated. */
6434
6435 ng = cd->named_groups;
6436 for (i = 0; i < cd->names_found; i++, ng++)
6437 {
6438 if (namelen == ng->length &&
6439 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
6440 {
6441 if (ng->number == number) break;
6442 if ((options & PCRE_DUPNAMES) == 0)
6443 {
6444 *errorcodeptr = ERR43;
6445 goto FAILED;
6446 }
6447 cd->dupnames = TRUE; /* Duplicate names exist */
6448 }
6449 else if (ng->number == number)
6450 {
6451 *errorcodeptr = ERR65;
6452 goto FAILED;
6453 }
6454 }
6455
6456 if (i >= cd->names_found) /* Not a duplicate with same number */
6457 {
6458 /* Increase the list size if necessary */
6459
6460 if (cd->names_found >= cd->named_group_list_size)
6461 {
6462 int newsize = cd->named_group_list_size * 2;
6463 named_group *newspace = (PUBL(malloc))
6464 (newsize * sizeof(named_group));
6465
6466 if (newspace == NULL)
6467 {
6468 *errorcodeptr = ERR21;
6469 goto FAILED;
6470 }
6471
6472 memcpy(newspace, cd->named_groups,
6473 cd->named_group_list_size * sizeof(named_group));
6474 if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
6475 (PUBL(free))((void *)cd->named_groups);
6476 cd->named_groups = newspace;
6477 cd->named_group_list_size = newsize;
6478 }
6479
6480 cd->named_groups[cd->names_found].name = name;
6481 cd->named_groups[cd->names_found].length = namelen;
6482 cd->named_groups[cd->names_found].number = number;
6483 cd->names_found++;
6484 }
6485 }
6486
6487 ptr++; /* Move past > or ' in both passes. */
6488 goto NUMBERED_GROUP;
6489
6490
6491 /* ------------------------------------------------------------ */
6492 case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */
6493 terminator = CHAR_RIGHT_PARENTHESIS;
6494 is_recurse = TRUE;
6495 /* Fall through */
6496
6497 /* We come here from the Python syntax above that handles both
6498 references (?P=name) and recursion (?P>name), as well as falling
6499 through from the Perl recursion syntax (?&name). We also come here from
6500 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
6501 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
6502
6503 NAMED_REF_OR_RECURSE:
6504 name = ++ptr;
6505 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6506 namelen = (int)(ptr - name);
6507
6508 /* In the pre-compile phase, do a syntax check. We used to just set
6509 a dummy reference number, because it was not used in the first pass.
6510 However, with the change of recursive back references to be atomic,
6511 we have to look for the number so that this state can be identified, as
6512 otherwise the incorrect length is computed. If it's not a backwards
6513 reference, the dummy number will do. */
6514
6515 if (lengthptr != NULL)
6516 {
6517 named_group *ng;
6518
6519 if (namelen == 0)
6520 {
6521 *errorcodeptr = ERR62;
6522 goto FAILED;
6523 }
6524 if (*ptr != (pcre_uchar)terminator)
6525 {
6526 *errorcodeptr = ERR42;
6527 goto FAILED;
6528 }
6529 if (namelen > MAX_NAME_SIZE)
6530 {
6531 *errorcodeptr = ERR48;
6532 goto FAILED;
6533 }
6534
6535 /* The name table does not exist in the first pass; instead we must
6536 scan the list of names encountered so far in order to get the
6537 number. If the name is not found, set the value to 0 for a forward
6538 reference. */
6539
6540 ng = cd->named_groups;
6541 for (i = 0; i < cd->names_found; i++, ng++)
6542 {
6543 if (namelen == ng->length &&
6544 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
6545 break;
6546 }
6547 recno = (i < cd->names_found)? ng->number : 0;
6548
6549 /* Count named back references. */
6550
6551 if (!is_recurse) cd->namedrefcount++;
6552 }
6553
6554 /* In the real compile, search the name table. We check the name
6555 first, and then check that we have reached the end of the name in the
6556 table. That way, if the name is longer than any in the table, the
6557 comparison will fail without reading beyond the table entry. */
6558
6559 else
6560 {
6561 slot = cd->name_table;
6562 for (i = 0; i < cd->names_found; i++)
6563 {
6564 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
6565 slot[IMM2_SIZE+namelen] == 0)
6566 break;
6567 slot += cd->name_entry_size;
6568 }
6569
6570 if (i < cd->names_found)
6571 {
6572 recno = GET2(slot, 0);
6573 }
6574 else
6575 {
6576 *errorcodeptr = ERR15;
6577 goto FAILED;
6578 }
6579 }
6580
6581 /* In both phases, for recursions, we can now go to the code that
6582 handles numerical recursion. */
6583
6584 if (is_recurse) goto HANDLE_RECURSION;
6585
6586 /* In the second pass we must see if the name is duplicated. If so, we
6587 generate a different opcode. */
6588
6589 if (lengthptr == NULL && cd->dupnames)
6590 {
6591 int count = 1;
6592 unsigned int index = i;
6593 pcre_uchar *cslot = slot + cd->name_entry_size;
6594
6595 for (i++; i < cd->names_found; i++)
6596 {
6597 if (STRCMP_UC_UC(slot + IMM2_SIZE, cslot + IMM2_SIZE) != 0) break;
6598 count++;
6599 cslot += cd->name_entry_size;
6600 }
6601
6602 if (count > 1)
6603 {
6604 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
6605 previous = code;
6606 *code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
6607 PUT2INC(code, 0, index);
6608 PUT2INC(code, 0, count);
6609
6610 /* Process each potentially referenced group. */
6611
6612 for (; slot < cslot; slot += cd->name_entry_size)
6613 {
6614 open_capitem *oc;
6615 recno = GET2(slot, 0);
6616 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
6617 if (recno > cd->top_backref) cd->top_backref = recno;
6618
6619 /* Check to see if this back reference is recursive, that is, it
6620 is inside the group that it references. A flag is set so that the
6621 group can be made atomic. */
6622
6623 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6624 {
6625 if (oc->number == recno)
6626 {
6627 oc->flag = TRUE;
6628 break;
6629 }
6630 }
6631 }
6632
6633 continue; /* End of back ref handling */
6634 }
6635 }
6636
6637 /* First pass, or a non-duplicated name. */
6638
6639 goto HANDLE_REFERENCE;
6640
6641
6642 /* ------------------------------------------------------------ */
6643 case CHAR_R: /* Recursion */
6644 ptr++; /* Same as (?0) */
6645 /* Fall through */
6646
6647
6648 /* ------------------------------------------------------------ */
6649 case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */
6650 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: