/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1389 - (show annotations)
Tue Nov 5 18:05:29 2013 UTC (5 years, 11 months ago) by ph10
File MIME type: text/plain
File size: 305857 byte(s)
Implement compile-time nested parentheses limit, specified at build time.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2013 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
/* Short aliases for a block name and two field names. They are defined ahead
of the #include of pcre_internal.h that follows.
NOTE(review): presumably macros in that header expand these names - confirm. */

#define NLBLOCK cd             /* Block containing newline information */
#define PSSTART start_pattern  /* Field containing processed string start */
#define PSEND   end_pattern    /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
57 is also used by pcretest. PCRE_DEBUG is not defined when building a production
58 library. We do not need to select pcre16_printint.c specially, because the
59 COMPILE_PCREx macro will already be appropriately set. */
60
61 #ifdef PCRE_DEBUG
62 /* pcre_printint.c should not include any headers */
63 #define PCRE_INCLUDED
64 #include "pcre_printint.c"
65 #undef PCRE_INCLUDED
66 #endif
67
68
/* Macro for setting individual bits in class bitmaps. It sets bit (b) of the
byte-array bitmap (a), that is, bit (b & 7) of byte (b / 8). The bitmap
argument and the whole expansion are parenthesized so that the macro works
with expression arguments such as "map + offset" and can be used inside a
larger expression. Note that (b) is evaluated twice, so it must not have side
effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)&7)))
72
/* Maximum length value to check against when making sure that the integer that
holds the compiled pattern length does not overflow. We make it a bit less than
INT_MAX to allow for adding in group terminating bytes, so that we don't have
to check them every time. */

#define OFLOW_MAX (INT_MAX - 20)
79
80 /* Definitions to allow mutual recursion */
81
82 static int
83 add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
84 const pcre_uint32 *, unsigned int);
85
86 static BOOL
87 compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
88 pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
89 compile_data *, int *);
90
91
92
93 /*************************************************
94 * Code parameters and static tables *
95 *************************************************/
96
/* This value specifies the size of stack workspace that is used during the
first pre-compile phase that determines how much memory is required. The regex
is partly compiled into this space, but the compiled parts are discarded as
soon as they can be, so that hopefully there will never be an overrun. The code
does, however, check for an overrun. The largest amount I've seen used is 218,
so this number is very generous.

The same workspace is used during the second, actual compile phase for
remembering forward references to groups so that they can be filled in at the
end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
is 4 there is plenty of room for most patterns. However, the memory can get
filled up by repetitions of forward references, for example patterns like
/(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
that the workspace is expanded using malloc() in this situation. The value
below is therefore a minimum, and we put a maximum on it for safety. The
minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
kicks in at the same number of forward references in all cases. */

#define COMPILE_WORK_SIZE (2048*LINK_SIZE)              /* Minimum (initial) size */
#define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)   /* Upper limit on expansion */
117
/* This value determines the size of the initial vector that is used for
remembering named groups during the pre-compile. It is allocated on the stack,
but if it is too small, it is expanded using malloc(), in a similar way to the
workspace. The value is the number of slots in the list. */

#define NAMED_GROUP_LIST_SIZE 20
124
/* The overrun tests check for a slightly smaller size so that they detect the
overrun before it actually does run off the end of the data block. */

#define WORK_SIZE_SAFETY_MARGIN (100)
129
/* Private flags added to firstchar and reqchar. The two positive values are
distinct single bits, so they can be combined. */

#define REQ_CASELESS    (1 << 0)        /* Indicates caselessness */
#define REQ_VARY        (1 << 1)        /* Reqchar followed non-literal item */

/* Negative values for the firstchar and reqchar flags */

#define REQ_UNSET       (-2)
#define REQ_NONE        (-1)
137
/* Repeated character flags. The constant keeps its long type; the suffix is
written as an upper-case L because a lower-case l is easily misread as the
digit 1. */

#define UTF_LENGTH     0x10000000L      /* The char contains its length. */
141
142 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
143 are simple data values; negative values are for special things like \d and so
144 on. Zero means further processing is needed (for things like \x), or the escape
145 is invalid. */
146
147 #ifndef EBCDIC
148
149 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
150 in UTF-8 mode. */
151
152 static const short int escapes[] = {
153 0, 0,
154 0, 0,
155 0, 0,
156 0, 0,
157 0, 0,
158 CHAR_COLON, CHAR_SEMICOLON,
159 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
160 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
161 CHAR_COMMERCIAL_AT, -ESC_A,
162 -ESC_B, -ESC_C,
163 -ESC_D, -ESC_E,
164 0, -ESC_G,
165 -ESC_H, 0,
166 0, -ESC_K,
167 0, 0,
168 -ESC_N, 0,
169 -ESC_P, -ESC_Q,
170 -ESC_R, -ESC_S,
171 0, 0,
172 -ESC_V, -ESC_W,
173 -ESC_X, 0,
174 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
175 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
176 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
177 CHAR_GRAVE_ACCENT, 7,
178 -ESC_b, 0,
179 -ESC_d, ESC_e,
180 ESC_f, 0,
181 -ESC_h, 0,
182 0, -ESC_k,
183 0, 0,
184 ESC_n, 0,
185 -ESC_p, 0,
186 ESC_r, -ESC_s,
187 ESC_tee, 0,
188 -ESC_v, -ESC_w,
189 0, 0,
190 -ESC_z
191 };
192
193 #else
194
195 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
196
197 static const short int escapes[] = {
198 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
199 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
200 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
201 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
202 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
203 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
204 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
205 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
206 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
207 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
208 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
209 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
210 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
211 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
212 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
213 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
214 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
215 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
216 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
217 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
218 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
219 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
220 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
221 };
222 #endif
223
224
/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
searched linearly. Put all the names into a single string, in order to reduce
the number of relocations when a shared library is dynamically linked. The
string is built from string macros so that it works in UTF-8 mode on EBCDIC
platforms.

Each verb is described by one verbitem: the length of its name and the two
opcodes to use, depending on whether or not an argument is present. */

typedef struct verbitem {
  int   len;                 /* Length of verb name */
  int   op;                  /* Op when no arg, or -1 if arg mandatory */
  int   op_arg;              /* Op when arg present, or -1 if not allowed */
} verbitem;
236
237 static const char verbnames[] =
238 "\0" /* Empty name is a shorthand for MARK */
239 STRING_MARK0
240 STRING_ACCEPT0
241 STRING_COMMIT0
242 STRING_F0
243 STRING_FAIL0
244 STRING_PRUNE0
245 STRING_SKIP0
246 STRING_THEN;
247
248 static const verbitem verbs[] = {
249 { 0, -1, OP_MARK },
250 { 4, -1, OP_MARK },
251 { 6, OP_ACCEPT, -1 },
252 { 6, OP_COMMIT, -1 },
253 { 1, OP_FAIL, -1 },
254 { 4, OP_FAIL, -1 },
255 { 5, OP_PRUNE, OP_PRUNE_ARG },
256 { 4, OP_SKIP, OP_SKIP_ARG },
257 { 4, OP_THEN, OP_THEN_ARG }
258 };
259
260 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
261
262
263 /* Tables of names of POSIX character classes and their lengths. The names are
264 now all in a single string, to reduce the number of relocations when a shared
265 library is dynamically loaded. The list of lengths is terminated by a zero
266 length entry. The first three must be alpha, lower, upper, as this is assumed
267 for handling case independence. The indices for graph, print, and punct are
268 needed, so identify them. */
269
270 static const char posix_names[] =
271 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
272 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
273 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
274 STRING_word0 STRING_xdigit;
275
276 static const pcre_uint8 posix_name_lengths[] = {
277 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
278
/* Indices of the graph, print, and punct entries in the list of POSIX class
names (counting from zero). */

#define PC_GRAPH  8
#define PC_PRINT  9
#define PC_PUNCT 10
282
283
284 /* Table of class bit maps for each POSIX class. Each class is formed from a
285 base map, with an optional addition or removal of another map. Then, for some
286 classes, there is some additional tweaking: for [:blank:] the vertical space
287 characters are removed, and for [:alpha:] and [:alnum:] the underscore
288 character is removed. The triples in the table consist of the base map offset,
289 second map offset or -1 if no second map, and a non-negative value for map
290 addition or a negative value for map subtraction (if there are two maps). The
291 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
292 remove vertical space characters, 2 => remove underscore. */
293
294 static const int posix_class_maps[] = {
295 cbit_word, cbit_digit, -2, /* alpha */
296 cbit_lower, -1, 0, /* lower */
297 cbit_upper, -1, 0, /* upper */
298 cbit_word, -1, 2, /* alnum - word without underscore */
299 cbit_print, cbit_cntrl, 0, /* ascii */
300 cbit_space, -1, 1, /* blank - a GNU extension */
301 cbit_cntrl, -1, 0, /* cntrl */
302 cbit_digit, -1, 0, /* digit */
303 cbit_graph, -1, 0, /* graph */
304 cbit_print, -1, 0, /* print */
305 cbit_punct, -1, 0, /* punct */
306 cbit_space, -1, 0, /* space */
307 cbit_word, -1, 0, /* word - a Perl extension */
308 cbit_xdigit,-1, 0 /* xdigit */
309 };
310
/* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
Unicode property escapes. Each string below is the pattern text of the
replacement escape, spelled out character by character with the CHAR_xxx
macros and terminated by a zero. */

#ifdef SUPPORT_UCP
static const pcre_uchar string_PNd[]  = {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pNd[]  = {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PXsp[] = {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pXsp[] = {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PXwd[] = {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pXwd[] = {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };

static const pcre_uchar *substitutes[] = {
  string_PNd,           /* \D */
  string_pNd,           /* \d */
  string_PXsp,          /* \S */   /* Xsp is Perl space, but from 8.34, Perl */
  string_pXsp,          /* \s */   /* space and POSIX space are the same. */
  string_PXwd,          /* \W */
  string_pXwd           /* \w */
};

/* The POSIX class substitutes must be in the order of the POSIX class names,
defined above, and there are both positive and negative cases. NULL means no
general substitute of a Unicode property escape (\p or \P). However, for some
POSIX classes (e.g. graph, print, punct) a special property code is compiled
directly. */

static const pcre_uchar string_pL[] =   {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pLl[] =  {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pLu[] =  {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pXan[] = {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_h[] =    {
  CHAR_BACKSLASH, CHAR_h, '\0' };
static const pcre_uchar string_pXps[] = {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PL[] =   {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PLl[] =  {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PLu[] =  {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PXan[] = {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_H[] =    {
  CHAR_BACKSLASH, CHAR_H, '\0' };
static const pcre_uchar string_PXps[] = {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };

static const pcre_uchar *posix_substitutes[] = {
  string_pL,            /* alpha */
  string_pLl,           /* lower */
  string_pLu,           /* upper */
  string_pXan,          /* alnum */
  NULL,                 /* ascii */
  string_h,             /* blank */
  NULL,                 /* cntrl */
  string_pNd,           /* digit */
  NULL,                 /* graph */
  NULL,                 /* print */
  NULL,                 /* punct */
  string_pXps,          /* space */   /* Xps is POSIX space, but from 8.34 */
  string_pXwd,          /* word  */   /* Perl and POSIX space are the same */
  NULL,                 /* xdigit */
  /* Negated cases */
  string_PL,            /* ^alpha */
  string_PLl,           /* ^lower */
  string_PLu,           /* ^upper */
  string_PXan,          /* ^alnum */
  NULL,                 /* ^ascii */
  string_H,             /* ^blank */
  NULL,                 /* ^cntrl */
  string_PNd,           /* ^digit */
  NULL,                 /* ^graph */
  NULL,                 /* ^print */
  NULL,                 /* ^punct */
  string_PXps,          /* ^space */  /* Xps is POSIX space, but from 8.34 */
  string_PXwd,          /* ^word */   /* Perl and POSIX space are the same */
  NULL                  /* ^xdigit */
};

/* Number of entries in posix_substitutes[] (positive plus negated cases) */

#define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
#endif
417
/* Two-level stringification: XSTRING(s) macro-expands its argument before
STRING() turns the result into a string literal, so XSTRING(MAX_NAME_SIZE)
yields the value of the macro as text rather than its name. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of compile-time error messages. These are "char *" because they
are passed to the outside world. Do not ever re-use any error number, because
they are documented. Always add a new error instead. Messages marked DEAD below
are no longer used. This used to be a table of strings, but in order to reduce
the number of relocations needed when a shared library is loaded dynamically,
it is now one long string. We cannot use a table of offsets, because the
lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
simply count through to the one we want - this isn't a performance issue
because these strings are used only when there is a compilation error.

Each substring ends with \0 to insert a null character. This includes the final
substring, so that the whole string ends with \0\0, which can be detected when
counting through. The numeric comments give the error number of the message
that follows them. */

static const char error_texts[] =
  "no error\0"
  "\\ at end of pattern\0"
  "\\c at end of pattern\0"
  "unrecognized character follows \\\0"
  "numbers out of order in {} quantifier\0"
  /* 5 */
  "number too big in {} quantifier\0"
  "missing terminating ] for character class\0"
  "invalid escape sequence in character class\0"
  "range out of order in character class\0"
  "nothing to repeat\0"
  /* 10 */
  "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
  "internal error: unexpected repeat\0"
  "unrecognized character after (? or (?-\0"
  "POSIX named classes are supported only within a class\0"
  "missing )\0"
  /* 15 */
  "reference to non-existent subpattern\0"
  "erroffset passed as NULL\0"
  "unknown option bit(s) set\0"
  "missing ) after comment\0"
  "parentheses nested too deeply\0"  /** DEAD **/
  /* 20 */
  "regular expression is too large\0"
  "failed to get memory\0"
  "unmatched parentheses\0"
  "internal error: code overflow\0"
  "unrecognized character after (?<\0"
  /* 25 */
  "lookbehind assertion is not fixed length\0"
  "malformed number or name after (?(\0"
  "conditional group contains more than two branches\0"
  "assertion expected after (?(\0"
  "(?R or (?[+-]digits must be followed by )\0"
  /* 30 */
  "unknown POSIX class name\0"
  "POSIX collating elements are not supported\0"
  "this version of PCRE is compiled without UTF support\0"
  "spare error\0"  /** DEAD **/
  "character value in \\x{} or \\o{} is too large\0"
  /* 35 */
  "invalid condition (?(0)\0"
  "\\C not allowed in lookbehind assertion\0"
  "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
  "number after (?C is > 255\0"
  "closing ) for (?C expected\0"
  /* 40 */
  "recursive call could loop indefinitely\0"
  "unrecognized character after (?P\0"
  "syntax error in subpattern name (missing terminator)\0"
  "two named subpatterns have the same name\0"
  "invalid UTF-8 string\0"
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled\0"
  "malformed \\P or \\p sequence\0"
  "unknown property name after \\P or \\p\0"
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
  /* 50 */
  "repeated subpattern is too long\0"    /** DEAD **/
  "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
  "internal error: overran compiling workspace\0"
  "internal error: previously-checked referenced subpattern not found\0"
  "DEFINE group contains more than one branch\0"
  /* 55 */
  "repeating a DEFINE group is not allowed\0"  /** DEAD **/
  "inconsistent NEWLINE options\0"
  "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
  "a numbered reference must not be zero\0"
  "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
  /* 60 */
  "(*VERB) not recognized or malformed\0"
  "number is too big\0"
  "subpattern name expected\0"
  "digit expected after (?+\0"
  "] is an invalid data character in JavaScript compatibility mode\0"
  /* 65 */
  "different names for subpatterns of the same number are not allowed\0"
  "(*MARK) must have an argument\0"
  "this version of PCRE is not compiled with Unicode property support\0"
  "\\c must be followed by an ASCII character\0"
  "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
  /* 70 */
  "internal error: unknown opcode in find_fixedlength()\0"
  "\\N is not supported in a class\0"
  "too many forward references\0"
  "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
  "invalid UTF-16 string\0"
  /* 75 */
  "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
  "character value in \\u.... sequence is too large\0"
  "invalid UTF-32 string\0"
  "setting UTF is disabled by the application\0"
  "non-hex character in \\x{} (closing brace missing?)\0"
  /* 80 */
  "non-octal character in \\o{} (closing brace missing?)\0"
  "missing opening brace after \\o\0"
  "parentheses are too deeply nested\0"
  ;
536
537 /* Table to identify digits and hex digits. This is used when compiling
538 patterns. Note that the tables in chartables are dependent on the locale, and
539 may mark arbitrary characters as digits - but the PCRE compiling code expects
540 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
541 a private table here. It costs 256 bytes, but it is a lot faster than doing
542 character value tests (at least in some simple cases I timed), and in some
543 applications one wants PCRE to compile efficiently as well as match
544 efficiently.
545
546 For convenience, we use the same bit definitions as in chartables:
547
548 0x04 decimal digit
549 0x08 hexadecimal digit
550
551 Then we can use ctype_digit and ctype_xdigit in the code. */
552
/* Using a simple comparison for decimal numbers rather than a memory read
is much faster, and the resulting code is simpler (the compiler turns it
into a subtraction and unsigned comparison). The argument may be evaluated
twice, so it must not have side effects. */

#define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
558
559 #ifndef EBCDIC
560
561 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
562 UTF-8 mode. */
563
564 static const pcre_uint8 digitab[] =
565 {
566 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
567 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
568 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
569 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
570 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
571 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
572 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
573 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
574 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
575 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
576 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
577 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
578 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
579 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
580 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
581 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
582 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
583 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
584 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
585 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
586 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
587 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
588 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
589 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
590 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
591 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
592 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
593 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
594 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
595 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
596 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
597 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
598
599 #else
600
601 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
602
603 static const pcre_uint8 digitab[] =
604 {
605 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
606 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
607 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
608 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
609 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
610 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
611 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
612 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
613 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
614 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
615 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
616 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
617 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
618 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
619 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
620 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
621 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
622 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
623 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
624 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
625 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
626 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
627 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
628 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
629 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
630 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
631 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
632 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
633 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
634 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
635 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
636 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
637
638 static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
639 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
640 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
641 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
642 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
643 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
644 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
645 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
646 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
647 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
648 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
649 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
650 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
651 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
652 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
653 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
654 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
655 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
656 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
657 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
658 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
659 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
660 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
661 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
662 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
663 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
664 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
665 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
666 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
667 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
668 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
669 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
670 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
671 #endif
672
673
674 /* This table is used to check whether auto-possessification is possible
675 between adjacent character-type opcodes. The left-hand (repeated) opcode is
676 used to select the row, and the right-hand opcode is use to select the column.
677 A value of 1 means that auto-possessification is OK. For example, the second
678 value in the first row means that \D+\d can be turned into \D++\d.
679
680 The Unicode property types (\P and \p) have to be present to fill out the table
681 because of what their opcode values are, but the table values should always be
682 zero because property types are handled separately in the code. The last four
683 columns apply to items that cannot be repeated, so there is no need to have
684 rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
685 *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
686
687 #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
688 #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
689
690 static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
691 /* \D \d \S \s \W \w . .+ \C \P \p \R \H \h \V \v \X \Z \z $ $M */
692 { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \D */
693 { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \d */
694 { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \S */
695 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \s */
696 { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \W */
697 { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \w */
698 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* . */
699 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* .+ */
700 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \C */
701 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \P */
702 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \p */
703 { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \R */
704 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \H */
705 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \h */
706 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \V */
707 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 }, /* \v */
708 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 } /* \X */
709 };
710
711
712 /* This table is used to check whether auto-possessification is possible
713 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
714 left-hand (repeated) opcode is used to select the row, and the right-hand
715 opcode is used to select the column. The values are as follows:
716
717 0 Always return FALSE (never auto-possessify)
718 1 Character groups are distinct (possessify if both are OP_PROP)
719 2 Check character categories in the same group (general or particular)
720 3 TRUE if the two opcodes are not the same (PROP vs NOTPROP)
721
722 4 Check left general category vs right particular category
723 5 Check right general category vs left particular category
724
725 6 Left alphanum vs right general category
726 7 Left space vs right general category
727 8 Left word vs right general category
728
729 9 Right alphanum vs left general category
730 10 Right space vs left general category
731 11 Right word vs left general category
732
733 12 Left alphanum vs right particular category
734 13 Left space vs right particular category
735 14 Left word vs right particular category
736
737 15 Right alphanum vs left particular category
738 16 Right space vs left particular category
739 17 Right word vs left particular category
740 */
741
/* Square table with one row and one column per property type, in the order
shown in the heading line and the row labels. Each entry is one of the action
codes 0 to 17 listed in the comment above. The PT_SPACE and PT_PXSPACE rows are
identical, and so are the SPACE and PXSPACE columns. The PT_ANY and PT_CLIST
rows and columns are all zero. */
742 static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
743 /* ANY LAMP GC PC SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
744 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_ANY */
745 { 0, 3, 0, 0, 0, 3, 1, 1, 0, 0, 0 }, /* PT_LAMP */
746 { 0, 0, 2, 4, 0, 9, 10, 10, 11, 0, 0 }, /* PT_GC */
747 { 0, 0, 5, 2, 0, 15, 16, 16, 17, 0, 0 }, /* PT_PC */
748 { 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, /* PT_SC */
749 { 0, 3, 6, 12, 0, 3, 1, 1, 0, 0, 0 }, /* PT_ALNUM */
750 { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_SPACE */
751 { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_PXSPACE */
752 { 0, 0, 8, 14, 0, 0, 1, 1, 3, 0, 0 }, /* PT_WORD */
753 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */
754 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 } /* PT_UCNC */
755 };
756
757 /* This table is used to check whether auto-possessification is possible
758 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
759 specifies a general category and the other specifies a particular category. The
760 row is selected by the general category and the column by the particular
761 category. The value is 1 if the particular category is not part of the general
762 category. */
763
/* Seven rows, one per general category (C, L, M, N, P, S, Z), by thirty
columns, one per particular category in the order given in the heading line.
In each row the zero entries form a single contiguous run: the particular
categories whose names begin with that row's letter. Every other entry is 1. */
764 static const pcre_uint8 catposstab[7][30] = {
765 /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
766 { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* C */
767 { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* L */
768 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* M */
769 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* N */
770 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 }, /* P */
771 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 }, /* S */
772 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 } /* Z */
773 };
774
775 /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
776 a general or particular category. The properties in each row are those
777 that apply to the character set in question. Duplication means that a little
778 unnecessary work is done when checking, but this keeps things much simpler
779 because they can all use the same code. For more details see the comment where
780 this table is used.
781
782 Note: SPACE and PXSPACE used to be different because Perl excluded VT from
783 "space", but from Perl 5.18 it's included, so both categories are treated the
784 same here. */
785
/* Three rows of four ucp_xxx values each. The row comments say which character
set each row describes and which of its values are redundant; the redundant
values pad every row to the same length. NOTE(review): the first three values
in each row look like general categories and the fourth like a particular
category - confirm against the code that uses this table. */
786 static const pcre_uint8 posspropstab[3][4] = {
787 { ucp_L, ucp_N, ucp_N, ucp_Nl }, /* ALNUM, 3rd and 4th values redundant */
788 { ucp_Z, ucp_Z, ucp_C, ucp_Cc }, /* SPACE and PXSPACE, 2nd value redundant */
789 { ucp_L, ucp_N, ucp_P, ucp_Po } /* WORD */
790 };
791
792 /* This table is used when converting repeating opcodes into possessified
793 versions as a result of an explicit possessive quantifier such as ++. A zero
794 value means there is no possessified version - in those cases the item in
795 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
796 because all relevant opcodes are less than that. */
797
/* One entry per opcode. The first two rows supply zeros for opcodes 0 to 31;
after that the trailing comment on each row names the opcode(s) the entries
belong to. Within each family only the greedy repeat (STAR, PLUS, QUERY, UPTO,
or RANGE for classes) has a non-zero entry, namely its possessive counterpart.
The minimizing forms, the EXACT forms, and the forms that are already
possessive are all zero. NOTE(review): the table appears to be indexed directly
by opcode value, so its layout must follow the order of the opcode definitions
- confirm where the opcodes are defined. */
798 static const pcre_uint8 opcode_possessify[] = {
799 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 15 */
800 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 - 31 */
801
802 0, /* NOTI */
803 OP_POSSTAR, 0, /* STAR, MINSTAR */
804 OP_POSPLUS, 0, /* PLUS, MINPLUS */
805 OP_POSQUERY, 0, /* QUERY, MINQUERY */
806 OP_POSUPTO, 0, /* UPTO, MINUPTO */
807 0, /* EXACT */
808 0, 0, 0, 0, /* POS{STAR,PLUS,QUERY,UPTO} */
809
810 OP_POSSTARI, 0, /* STARI, MINSTARI */
811 OP_POSPLUSI, 0, /* PLUSI, MINPLUSI */
812 OP_POSQUERYI, 0, /* QUERYI, MINQUERYI */
813 OP_POSUPTOI, 0, /* UPTOI, MINUPTOI */
814 0, /* EXACTI */
815 0, 0, 0, 0, /* POS{STARI,PLUSI,QUERYI,UPTOI} */
816
817 OP_NOTPOSSTAR, 0, /* NOTSTAR, NOTMINSTAR */
818 OP_NOTPOSPLUS, 0, /* NOTPLUS, NOTMINPLUS */
819 OP_NOTPOSQUERY, 0, /* NOTQUERY, NOTMINQUERY */
820 OP_NOTPOSUPTO, 0, /* NOTUPTO, NOTMINUPTO */
821 0, /* NOTEXACT */
822 0, 0, 0, 0, /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
823
824 OP_NOTPOSSTARI, 0, /* NOTSTARI, NOTMINSTARI */
825 OP_NOTPOSPLUSI, 0, /* NOTPLUSI, NOTMINPLUSI */
826 OP_NOTPOSQUERYI, 0, /* NOTQUERYI, NOTMINQUERYI */
827 OP_NOTPOSUPTOI, 0, /* NOTUPTOI, NOTMINUPTOI */
828 0, /* NOTEXACTI */
829 0, 0, 0, 0, /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
830
831 OP_TYPEPOSSTAR, 0, /* TYPESTAR, TYPEMINSTAR */
832 OP_TYPEPOSPLUS, 0, /* TYPEPLUS, TYPEMINPLUS */
833 OP_TYPEPOSQUERY, 0, /* TYPEQUERY, TYPEMINQUERY */
834 OP_TYPEPOSUPTO, 0, /* TYPEUPTO, TYPEMINUPTO */
835 0, /* TYPEEXACT */
836 0, 0, 0, 0, /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
837
838 OP_CRPOSSTAR, 0, /* CRSTAR, CRMINSTAR */
839 OP_CRPOSPLUS, 0, /* CRPLUS, CRMINPLUS */
840 OP_CRPOSQUERY, 0, /* CRQUERY, CRMINQUERY */
841 OP_CRPOSRANGE, 0, /* CRRANGE, CRMINRANGE */
842 0, 0, 0, 0, /* CRPOS{STAR,PLUS,QUERY,RANGE} */
843
844 0, 0, 0, /* CLASS, NCLASS, XCLASS */
845 0, 0, /* REF, REFI */
846 0, 0, /* DNREF, DNREFI */
847 0, 0 /* RECURSE, CALLOUT */
848 };
849
850
851
852 /*************************************************
853 * Find an error text *
854 *************************************************/
855
856 /* The error texts are now all in one long string, to save on relocations. As
857 some of the text is of unknown length, we can't use a table of offsets.
858 Instead, just count through the strings. This is not a performance issue
859 because it happens only when there has been a compilation error.
860
861 Argument: the error number
862 Returns: pointer to the error string
863 */
864
865 static const char *
866 find_error_text(int n)
867 {
868 const char *s = error_texts;
869 for (; n > 0; n--)
870 {
871 while (*s++ != CHAR_NULL) {};
872 if (*s == CHAR_NULL) return "Error text not found (please report)";
873 }
874 return s;
875 }
876
877
878
879 /*************************************************
880 * Expand the workspace *
881 *************************************************/
882
883 /* This function is called during the second compiling phase, if the number of
884 forward references fills the existing workspace, which is originally a block on
885 the stack. A larger block is obtained from malloc() unless the ultimate limit
886 has been reached or the increase will be rather small.
887
888 Argument: pointer to the compile data block
889 Returns: 0 if all went well, else an error number
890 */
891
892 static int
893 expand_workspace(compile_data *cd)
894 {
895 pcre_uchar *newspace;
896 int newsize = cd->workspace_size * 2;
897
898 if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
899 if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
900 newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
901 return ERR72;
902
903 newspace = (PUBL(malloc))(IN_UCHARS(newsize));
904 if (newspace == NULL) return ERR21;
905 memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
906 cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
907 if (cd->workspace_size > COMPILE_WORK_SIZE)
908 (PUBL(free))((void *)cd->start_workspace);
909 cd->start_workspace = newspace;
910 cd->workspace_size = newsize;
911 return 0;
912 }
913
914
915
916 /*************************************************
917 * Check for counted repeat *
918 *************************************************/
919
920 /* This function is called when a '{' is encountered in a place where it might
921 start a quantifier. It looks ahead to see if it really is a quantifier or not.
922 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
923 where the ddds are digits.
924
925 Arguments:
926 p pointer to the first char after '{'
927
928 Returns: TRUE or FALSE
929 */
930
931 static BOOL
932 is_counted_repeat(const pcre_uchar *p)
933 {
934 if (!IS_DIGIT(*p)) return FALSE;
935 p++;
936 while (IS_DIGIT(*p)) p++;
937 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
938
939 if (*p++ != CHAR_COMMA) return FALSE;
940 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
941
942 if (!IS_DIGIT(*p)) return FALSE;
943 p++;
944 while (IS_DIGIT(*p)) p++;
945
946 return (*p == CHAR_RIGHT_CURLY_BRACKET);
947 }
948
949
950
951 /*************************************************
952 * Handle escapes *
953 *************************************************/
954
955 /* This function is called when a \ has been encountered. It either returns a
956 positive value for a simple escape such as \n, or 0 for a data character which
957 will be placed in chptr. A backreference to group n is returned as negative n.
958 When UTF-8 is enabled, a positive value greater than 255 may be returned in
959 chptr. On entry, ptr is pointing at the \. On exit, it is on the final
960 character of the escape sequence.
961
962 Arguments:
963 ptrptr points to the pattern position pointer
964 chptr points to a returned data character
965 errorcodeptr points to the errorcode variable
966 bracount number of previous extracting brackets
967 options the options bits
968 isclass TRUE if inside a character class
969
970 Returns: zero => a data character
971 positive => a special escape sequence
972 negative => a back reference
973 on error, errorcodeptr is set
974 */
975
976 static int
977 check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
978 int bracount, int options, BOOL isclass)
979 {
980 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
981 BOOL utf = (options & PCRE_UTF8) != 0;
982 const pcre_uchar *ptr = *ptrptr + 1; /* The character that follows the backslash */
983 pcre_uint32 c;
984 int escape = 0; /* Return value; stays 0 when the result is a data character */
985 int i; /* Not initialized here; first assigned by the escapes[] lookup below */
986
987 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
988 ptr--; /* Set pointer back to the last byte */
989
990 /* If backslash is at the end of the pattern, it's an error. */
991
992 if (c == CHAR_NULL) *errorcodeptr = ERR1;
993
994 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
995 in a table. A non-zero result is something that can be returned immediately.
996 Otherwise further processing may be required. */
/* A positive table entry replaces c (the result is a data character); a
negative entry is negated and becomes the escape code that is returned. */
997
998 #ifndef EBCDIC /* ASCII/UTF-8 coding */
999 /* Not alphanumeric */
1000 else if (c < CHAR_0 || c > CHAR_z) {}
1001 else if ((i = escapes[c - CHAR_0]) != 0)
1002 { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1003
1004 #else /* EBCDIC coding */
1005 /* Not alphanumeric */
1006 else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
1007 else if ((i = escapes[c - 0x48]) != 0) { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1008 #endif
1009
1010 /* Escapes that need further processing, or are illegal. */
/* This branch is reached only when the escapes[] lookup above stored zero in
i. The loops further down that read the digits of octal (\0 and friends) and
\xhh escapes use i as their digit counter without resetting it, so they rely
on i being zero here. */
1011
1012 else
1013 {
1014 const pcre_uchar *oldptr;
1015 BOOL braced, negated, overflow;
1016 int s;
1017
1018 switch (c)
1019 {
1020 /* A number of Perl escapes are not handled by PCRE. We give an explicit
1021 error. */
1022
1023 case CHAR_l:
1024 case CHAR_L:
1025 *errorcodeptr = ERR37;
1026 break;
1027
1028 case CHAR_u:
1029 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1030 {
1031 /* In JavaScript, \u must be followed by four hexadecimal numbers.
1032 Otherwise it is a lowercase u letter. */
1033 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1034 && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
1035 && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
1036 && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
1037 {
1038 c = 0;
1039 for (i = 0; i < 4; ++i)
1040 {
1041 register pcre_uint32 cc = *(++ptr);
1042 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1043 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1044 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1045 #else /* EBCDIC coding */
1046 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1047 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1048 #endif
1049 }
1050
/* Reject values too large for the current code unit width or mode. */
1051 #if defined COMPILE_PCRE8
1052 if (c > (utf ? 0x10ffffU : 0xffU))
1053 #elif defined COMPILE_PCRE16
1054 if (c > (utf ? 0x10ffffU : 0xffffU))
1055 #elif defined COMPILE_PCRE32
1056 if (utf && c > 0x10ffffU)
1057 #endif
1058 {
1059 *errorcodeptr = ERR76;
1060 }
1061 else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73; /* Surrogate value in UTF mode */
1062 }
1063 }
1064 else
1065 *errorcodeptr = ERR37;
1066 break;
1067
1068 case CHAR_U:
1069 /* In JavaScript, \U is an uppercase U letter. */
1070 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
1071 break;
1072
1073 /* In a character class, \g is just a literal "g". Outside a character
1074 class, \g must be followed by one of a number of specific things:
1075
1076 (1) A number, either plain or braced. If positive, it is an absolute
1077 backreference. If negative, it is a relative backreference. This is a Perl
1078 5.10 feature.
1079
1080 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1081 is part of Perl's movement towards a unified syntax for back references. As
1082 this is synonymous with \k{name}, we fudge it up by pretending it really
1083 was \k.
1084
1085 (3) For Oniguruma compatibility we also support \g followed by a name or a
1086 number either in angle brackets or in single quotes. However, these are
1087 (possibly recursive) subroutine calls, _not_ backreferences. Just return
1088 the ESC_g code (cf \k). */
1089
1090 case CHAR_g:
1091 if (isclass) break;
1092 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
1093 {
1094 escape = ESC_g;
1095 break;
1096 }
1097
1098 /* Handle the Perl-compatible cases */
1099
1100 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1101 {
1102 const pcre_uchar *p;
/* Scan the braced text: if anything other than digits and minus signs is
found before the closing brace, treat the item as a name (case 2 above). */
1103 for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
1104 if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
1105 if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
1106 {
1107 escape = ESC_k;
1108 break;
1109 }
1110 braced = TRUE;
1111 ptr++;
1112 }
1113 else braced = FALSE;
1114
1115 if (ptr[1] == CHAR_MINUS)
1116 {
1117 negated = TRUE;
1118 ptr++;
1119 }
1120 else negated = FALSE;
1121
1122 /* The integer range is limited by the machine's int representation. */
1123 s = 0;
1124 overflow = FALSE;
1125 while (IS_DIGIT(ptr[1]))
1126 {
1127 if (s > INT_MAX / 10 - 1) /* Integer overflow */
1128 {
1129 overflow = TRUE;
1130 break;
1131 }
1132 s = s * 10 + (int)(*(++ptr) - CHAR_0);
1133 }
1134 if (overflow) /* Integer overflow */
1135 {
1136 while (IS_DIGIT(ptr[1])) /* Skip the remaining digits before reporting the error */
1137 ptr++;
1138 *errorcodeptr = ERR61;
1139 break;
1140 }
1141
1142 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
1143 {
1144 *errorcodeptr = ERR57;
1145 break;
1146 }
1147
1148 if (s == 0) /* Also the case when no digits were present at all */
1149 {
1150 *errorcodeptr = ERR58;
1151 break;
1152 }
1153
1154 if (negated)
1155 {
1156 if (s > bracount)
1157 {
1158 *errorcodeptr = ERR15;
1159 break;
1160 }
1161 s = bracount - (s - 1); /* Turn the relative reference into an absolute group number */
1162 }
1163
1164 escape = -s;
1165 break;
1166
1167 /* The handling of escape sequences consisting of a string of digits
1168 starting with one that is not zero is not straightforward. Perl has changed
1169 over the years. Nowadays \g{} for backreferences and \o{} for octal are
1170 recommended to avoid the ambiguities in the old syntax.
1171
1172 Outside a character class, the digits are read as a decimal number. If the
1173 number is less than 8 (used to be 10), or if there are that many previous
1174 extracting left brackets, then it is a back reference. Otherwise, up to
1175 three octal digits are read to form an escaped byte. Thus \123 is likely to
1176 be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1177 the octal value is greater than 377, the least significant 8 bits are
1178 taken. \8 and \9 are treated as the literal characters 8 and 9.
1179
1180 Inside a character class, \ followed by a digit is always either a literal
1181 8 or 9 or an octal number. */
1182
1183 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1184 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1185
1186 if (!isclass)
1187 {
1188 oldptr = ptr;
1189 /* The integer range is limited by the machine's int representation. */
1190 s = (int)(c -CHAR_0);
1191 overflow = FALSE;
1192 while (IS_DIGIT(ptr[1]))
1193 {
1194 if (s > INT_MAX / 10 - 1) /* Integer overflow */
1195 {
1196 overflow = TRUE;
1197 break;
1198 }
1199 s = s * 10 + (int)(*(++ptr) - CHAR_0);
1200 }
1201 if (overflow) /* Integer overflow */
1202 {
1203 while (IS_DIGIT(ptr[1]))
1204 ptr++;
1205 *errorcodeptr = ERR61;
1206 break;
1207 }
1208 if (s < 8 || s <= bracount) /* Check for back reference */
1209 {
1210 escape = -s;
1211 break;
1212 }
1213 ptr = oldptr; /* Put the pointer back and fall through */
1214 }
1215
1216 /* Handle a digit following \ when the number is not a back reference. If
1217 the first digit is 8 or 9, Perl used to generate a binary zero byte and
1218 then treat the digit as a following literal. At least by Perl 5.18 this
1219 changed so as not to insert the binary zero. */
1220
1221 if ((c = *ptr) >= CHAR_8) break; /* \8 or \9: c is the literal digit */
1222
1223 /* Fall through with a digit less than 8 */
1224
1225 /* \0 always starts an octal number, but we may drop through to here with a
1226 larger first octal digit. The original code used just to take the least
1227 significant 8 bits of octal numbers (I think this is what early Perls used
1228 to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1229 but no more than 3 octal digits. */
1230
1231 case CHAR_0:
1232 c -= CHAR_0;
/* i is still zero from the escapes[] lookup, so at most two more octal digits
are read after the first one. */
1233 while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1234 c = c * 8 + *(++ptr) - CHAR_0;
1235 #ifdef COMPILE_PCRE8
1236 if (!utf && c > 0xff) *errorcodeptr = ERR51;
1237 #endif
1238 break;
1239
1240 /* \o is a relatively new Perl feature, supporting a more general way of
1241 specifying character codes in octal. The only supported form is \o{ddd}. */
1242
1243 case CHAR_o:
1244 if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
1245 {
1246 ptr += 2;
1247 c = 0;
1248 overflow = FALSE;
1249 while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
1250 {
1251 register pcre_uint32 cc = *ptr++;
1252 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1253 #ifdef COMPILE_PCRE32
1254 if (c >= 0x20000000l) { overflow = TRUE; break; } /* The shift by 3 below would lose high-order bits */
1255 #endif
1256 c = (c << 3) + cc - CHAR_0 ;
1257 #if defined COMPILE_PCRE8
1258 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1259 #elif defined COMPILE_PCRE16
1260 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1261 #elif defined COMPILE_PCRE32
1262 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1263 #endif
1264 }
1265 if (overflow)
1266 {
1267 while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1268 *errorcodeptr = ERR34;
1269 }
/* NOTE(review): an empty \o{} also arrives here, with c == 0, and is accepted
without any error being set. */
1270 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1271 {
1272 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1273 }
1274 else *errorcodeptr = ERR80;
1275 }
1276 break;
1277
1278 /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
1279 numbers. Otherwise it is a lowercase x letter. */
1280
1281 case CHAR_x:
1282 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1283 {
1284 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1285 && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1286 {
1287 c = 0;
1288 for (i = 0; i < 2; ++i)
1289 {
1290 register pcre_uint32 cc = *(++ptr);
1291 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1292 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1293 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1294 #else /* EBCDIC coding */
1295 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1296 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1297 #endif
1298 }
1299 }
1300 } /* End JavaScript handling */
1301
1302 /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1303 greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1304 digits. If not, { used to be treated as a data character. However, Perl
1305 seems to read hex digits up to the first non-such, and ignore the rest, so
1306 that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1307 now gives an error. */
1308
1309 else
1310 {
1311 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1312 {
1313 ptr += 2;
1314 c = 0;
1315 overflow = FALSE;
1316 while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
1317 {
1318 register pcre_uint32 cc = *ptr++;
1319 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1320
1321 #ifdef COMPILE_PCRE32
1322 if (c >= 0x10000000l) { overflow = TRUE; break; } /* The shift by 4 below would lose high-order bits */
1323 #endif
1324
1325 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1326 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1327 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1328 #else /* EBCDIC coding */
1329 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1330 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1331 #endif
1332
1333 #if defined COMPILE_PCRE8
1334 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1335 #elif defined COMPILE_PCRE16
1336 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1337 #elif defined COMPILE_PCRE32
1338 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1339 #endif
1340 }
1341
1342 if (overflow)
1343 {
1344 while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
1345 *errorcodeptr = ERR34;
1346 }
1347
/* NOTE(review): an empty \x{} also arrives here, with c == 0, and is accepted
without any error being set. */
1348 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1349 {
1350 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1351 }
1352
1353 /* If the sequence of hex digits does not end with '}', give an error.
1354 We used just to recognize this construct and fall through to the normal
1355 \x handling, but nowadays Perl gives an error, which seems much more
1356 sensible, so we do too. */
1357
1358 else *errorcodeptr = ERR79;
1359 } /* End of \x{} processing */
1360
1361 /* Read a single-byte hex-defined char (up to two hex digits after \x) */
1362
1363 else
1364 {
1365 c = 0;
/* i is still zero from the escapes[] lookup, which is what limits this loop to
two hex digits. */
1366 while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1367 {
1368 pcre_uint32 cc; /* Some compilers don't like */
1369 cc = *(++ptr); /* ++ in initializers */
1370 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1371 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1372 c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1373 #else /* EBCDIC coding */
1374 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
1375 c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1376 #endif
1377 }
1378 } /* End of \xdd handling */
1379 } /* End of Perl-style \x handling */
1380 break;
1381
1382 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1383 An error is given if the byte following \c is not an ASCII character. This
1384 coding is ASCII-specific, but then the whole concept of \cx is
1385 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1386
1387 case CHAR_c:
1388 c = *(++ptr);
1389 if (c == CHAR_NULL) /* Pattern ends straight after \c */
1390 {
1391 *errorcodeptr = ERR2;
1392 break;
1393 }
1394 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1395 if (c > 127) /* Excludes all non-ASCII in either mode */
1396 {
1397 *errorcodeptr = ERR68;
1398 break;
1399 }
1400 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1401 c ^= 0x40;
1402 #else /* EBCDIC coding */
1403 if (c >= CHAR_a && c <= CHAR_z) c += 64;
1404 c ^= 0xC0;
1405 #endif
1406 break;
1407
1408 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1409 other alphanumeric following \ is an error if PCRE_EXTRA was set;
1410 otherwise, for Perl compatibility, it is a literal. This code looks a bit
1411 odd, but there used to be some cases other than the default, and there may
1412 be again in future, so I haven't "optimized" it. */
1413
1414 default:
1415 if ((options & PCRE_EXTRA) != 0) switch(c)
1416 {
1417 default:
1418 *errorcodeptr = ERR3;
1419 break;
1420 }
1421 break;
1422 }
1423 }
1424
1425 /* Perl supports \N{name} for character names, as well as plain \N for "not
1426 newline". PCRE does not support \N{name}. However, it does support
1427 quantification such as \N{2,3}. */
1428
1429 if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1430 !is_counted_repeat(ptr+2))
1431 *errorcodeptr = ERR37;
1432
1433 /* If PCRE_UCP is set, we change the values for \d etc. */
1434
1435 if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
1436 escape += (ESC_DU - ESC_D);
1437
1438 /* Set the pointer to the final character before returning. */
/* These three statements also run after an error has been recorded in
*errorcodeptr, so the caller always gets an updated position. */
1439
1440 *ptrptr = ptr;
1441 *chptr = c;
1442 return escape;
1443 }
1444
1445
1446
1447 #ifdef SUPPORT_UCP
1448 /*************************************************
1449 * Handle \P and \p *
1450 *************************************************/
1451
1452 /* This function is called after \P or \p has been encountered, provided that
1453 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1454 pointing at the P or p. On exit, it is pointing at the final character of the
1455 escape sequence.
1456
1457 Argument:
1458 ptrptr points to the pattern position pointer
1459 negptr points to a boolean that is set TRUE for negation else FALSE
1460 ptypeptr points to an unsigned int that is set to the type value
1461 pdataptr points to an unsigned int that is set to the detailed property value
1462 errorcodeptr points to the error code variable
1463
1464 Returns: TRUE if the type value was found, or FALSE for an invalid type
1465 */
1466
1467 static BOOL
1468 get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
1469 unsigned int *pdataptr, int *errorcodeptr)
1470 {
1471 pcre_uchar c;
1472 int i, bot, top;
1473 const pcre_uchar *ptr = *ptrptr;
1474 pcre_uchar name[32];
1475
1476 c = *(++ptr);
1477 if (c == CHAR_NULL) goto ERROR_RETURN;
1478
1479 *negptr = FALSE;
1480
1481 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1482 negation. */
1483
1484 if (c == CHAR_LEFT_CURLY_BRACKET)
1485 {
1486 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1487 {
1488 *negptr = TRUE;
1489 ptr++;
1490 }
1491 for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1492 {
1493 c = *(++ptr);
1494 if (c == CHAR_NULL) goto ERROR_RETURN;
1495 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1496 name[i] = c;
1497 }
1498 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1499 name[i] = 0;
1500 }
1501
1502 /* Otherwise there is just one following character */
1503
1504 else
1505 {
1506 name[0] = c;
1507 name[1] = 0;
1508 }
1509
1510 *ptrptr = ptr;
1511
1512 /* Search for a recognized property name using binary chop */
1513
1514 bot = 0;
1515 top = PRIV(utt_size);
1516
1517 while (bot < top)
1518 {
1519 int r;
1520 i = (bot + top) >> 1;
1521 r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1522 if (r == 0)
1523 {
1524 *ptypeptr = PRIV(utt)[i].type;
1525 *pdataptr = PRIV(utt)[i].value;
1526 return TRUE;
1527 }
1528 if (r > 0) bot = i + 1; else top = i;
1529 }
1530
1531 *errorcodeptr = ERR47;
1532 *ptrptr = ptr;
1533 return FALSE;
1534
1535 ERROR_RETURN:
1536 *errorcodeptr = ERR46;
1537 *ptrptr = ptr;
1538 return FALSE;
1539 }
1540 #endif
1541
1542
1543
1544 /*************************************************
1545 * Read repeat counts *
1546 *************************************************/
1547
1548 /* Read an item of the form {n,m} and return the values. This is called only
1549 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1550 so the syntax is guaranteed to be correct, but we need to check the values.
1551
1552 Arguments:
1553 p pointer to first char after '{'
1554 minp pointer to int for min
1555 maxp pointer to int for max
1556 returned as -1 if no max
1557 errorcodeptr points to error code variable
1558
1559 Returns: pointer to '}' on success;
1560 current ptr on error, with errorcodeptr set non-zero
1561 */
1562
1563 static const pcre_uchar *
1564 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1565 {
1566 int min = 0;
1567 int max = -1;
1568
1569 /* Read the minimum value and do a paranoid check: a negative value indicates
1570 an integer overflow. */
1571
1572 while (IS_DIGIT(*p)) min = min * 10 + (int)(*p++ - CHAR_0);
1573 if (min < 0 || min > 65535)
1574 {
1575 *errorcodeptr = ERR5;
1576 return p;
1577 }
1578
1579 /* Read the maximum value if there is one, and again do a paranoid on its size.
1580 Also, max must not be less than min. */
1581
1582 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1583 {
1584 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1585 {
1586 max = 0;
1587 while(IS_DIGIT(*p)) max = max * 10 + (int)(*p++ - CHAR_0);
1588 if (max < 0 || max > 65535)
1589 {
1590 *errorcodeptr = ERR5;
1591 return p;
1592 }
1593 if (max < min)
1594 {
1595 *errorcodeptr = ERR4;
1596 return p;
1597 }
1598 }
1599 }
1600
1601 /* Fill in the required variables, and pass back the pointer to the terminating
1602 '}'. */
1603
1604 *minp = min;
1605 *maxp = max;
1606 return p;
1607 }
1608
1609
1610
1611 /*************************************************
1612 * Find first significant op code *
1613 *************************************************/
1614
1615 /* This is called by several functions that scan a compiled expression looking
1616 for a fixed first character, or an anchoring op code etc. It skips over things
1617 that do not influence this. For some calls, it makes sense to skip negative
1618 forward and all backward assertions, and also the \b assertion; for others it
1619 does not.
1620
1621 Arguments:
1622 code pointer to the start of the group
1623 skipassert TRUE if certain assertions are to be skipped
1624
1625 Returns: pointer to the first significant opcode
1626 */
1627
1628 static const pcre_uchar*
1629 first_significant_code(const pcre_uchar *code, BOOL skipassert)
1630 {
1631 for (;;)
1632 {
1633 switch ((int)*code)
1634 {
1635 case OP_ASSERT_NOT:
1636 case OP_ASSERTBACK:
1637 case OP_ASSERTBACK_NOT:
1638 if (!skipassert) return code;
1639 do code += GET(code, 1); while (*code == OP_ALT);
1640 code += PRIV(OP_lengths)[*code];
1641 break;
1642
1643 case OP_WORD_BOUNDARY:
1644 case OP_NOT_WORD_BOUNDARY:
1645 if (!skipassert) return code;
1646 /* Fall through */
1647
1648 case OP_CALLOUT:
1649 case OP_CREF:
1650 case OP_DNCREF:
1651 case OP_RREF:
1652 case OP_DNRREF:
1653 case OP_DEF:
1654 code += PRIV(OP_lengths)[*code];
1655 break;
1656
1657 default:
1658 return code;
1659 }
1660 }
1661 /* Control never reaches here */
1662 }
1663
1664
1665
1666 /*************************************************
1667 * Find the fixed length of a branch *
1668 *************************************************/
1669
1670 /* Scan a branch and compute the fixed length of subject that will match it,
1671 if the length is fixed. This is needed for dealing with backward assertions.
1672 In UTF8 mode, the result is in characters rather than bytes. The branch is
1673 temporarily terminated with OP_END when this function is called.
1674
1675 This function is called when a backward assertion is encountered, so that if it
1676 fails, the error message can point to the correct place in the pattern.
1677 However, we cannot do this when the assertion contains subroutine calls,
1678 because they can be forward references. We solve this by remembering this case
1679 and doing the check at the end; a flag specifies which mode we are running in.
1680
1681 Arguments:
1682 code points to the start of the pattern (the bracket)
1683 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
1684 atend TRUE if called when the pattern is complete
1685 cd the "compile data" structure
1686
1687 Returns: the fixed length,
1688 or -1 if there is no fixed length,
1689 or -2 if \C was encountered (in UTF-8 mode only)
1690 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1691 or -4 if an unknown opcode was encountered (internal error)
1692 */
1693
1694 static int
1695 find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
1696 { /* Adds up the match length of each branch of the group at "code"; all branches must give the same total */
1697 int length = -1; /* length agreed by the branches finished so far; -1 means none finished yet */
1698
1699 register int branchlength = 0; /* running total for the branch being scanned */
1700 register pcre_uchar *cc = code + 1 + LINK_SIZE; /* first item after the bracket opcode and its link */
1701
1702 /* Scan along the opcodes for this branch. If we get to the end of the
1703 branch, check the length against that of the other branches. */
1704
1705 for (;;)
1706 {
1707 int d;
1708 pcre_uchar *ce, *cs;
1709 register pcre_uchar op = *cc;
1710
1711 switch (op)
1712 {
1713 /* Only the non-repeating group opcodes are followed into: OP_CBRA (capturing),
1714 OP_BRA (non-capturing), OP_ONCE, OP_ONCE_NC and OP_COND. The other bracket
1715 opcodes (OP_SBRA, OP_CBRAPOS and friends) are listed with the variable-length
1716 items further down and make the result -1. */
1717
1718 case OP_CBRA:
1719 case OP_BRA:
1720 case OP_ONCE:
1721 case OP_ONCE_NC:
1722 case OP_COND:
1723 d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd); /* OP_CBRA has IMM2_SIZE extra units after its link, so the nested scan starts past them */
1724 if (d < 0) return d; /* pass any failure code straight up */
1725 branchlength += d;
1726 do cc += GET(cc, 1); while (*cc == OP_ALT); /* follow the links over every alternative of the nested group */
1727 cc += 1 + LINK_SIZE; /* step past the item that ends the group (opcode plus link) */
1728 break;
1729
1730 /* Reached end of a branch; if it's a ket it is the end of a nested call.
1731 If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1732 an ALT. If it is END it's the end of the outer call. All can be handled by
1733 the same code. Note that we must not include the OP_KETRxxx opcodes here,
1734 because they all imply an unlimited repeat. */
1735
1736 case OP_ALT:
1737 case OP_KET:
1738 case OP_END:
1739 case OP_ACCEPT:
1740 case OP_ASSERT_ACCEPT:
1741 if (length < 0) length = branchlength; /* first finished branch sets the reference length */
1742 else if (length != branchlength) return -1; /* branches disagree: not fixed length */
1743 if (*cc != OP_ALT) return length; /* anything but ALT ends this call */
1744 cc += 1 + LINK_SIZE;
1745 branchlength = 0; /* start counting the next alternative */
1746 break;
1747
1748 /* A true recursion implies not fixed length, but a subroutine call may
1749 be OK. If the subroutine is a forward reference, we can't deal with
1750 it until the end of the pattern, so return -3. */
1751
1752 case OP_RECURSE:
1753 if (!atend) return -3;
1754 cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1755 do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1756 if (cc > cs && cc < ce) return -1; /* Recursion */
1757 d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd); /* NOTE(review): only a call made from inside the called group is rejected above; two groups that call each other are not caught by that test - TODO confirm this cannot recurse without bound */
1758 if (d < 0) return d;
1759 branchlength += d;
1760 cc += 1 + LINK_SIZE; /* OP_RECURSE is an opcode followed by a link-sized offset */
1761 break;
1762
1763 /* Skip over assertive subpatterns */
1764
1765 case OP_ASSERT:
1766 case OP_ASSERT_NOT:
1767 case OP_ASSERTBACK:
1768 case OP_ASSERTBACK_NOT:
1769 do cc += GET(cc, 1); while (*cc == OP_ALT); /* assertions add nothing to the length; hop over all their branches */
1770 cc += PRIV(OP_lengths)[*cc];
1771 break;
1772
1773 /* Skip over things that don't match chars */
1774
1775 case OP_MARK:
1776 case OP_PRUNE_ARG:
1777 case OP_SKIP_ARG:
1778 case OP_THEN_ARG:
1779 cc += cc[1] + PRIV(OP_lengths)[*cc]; /* cc[1] is added on top of the table length to cover the argument */
1780 break;
1781
1782 case OP_CALLOUT:
1783 case OP_CIRC:
1784 case OP_CIRCM:
1785 case OP_CLOSE:
1786 case OP_COMMIT:
1787 case OP_CREF:
1788 case OP_DEF:
1789 case OP_DNCREF:
1790 case OP_DNRREF:
1791 case OP_DOLL:
1792 case OP_DOLLM:
1793 case OP_EOD:
1794 case OP_EODN:
1795 case OP_FAIL:
1796 case OP_NOT_WORD_BOUNDARY:
1797 case OP_PRUNE:
1798 case OP_REVERSE:
1799 case OP_RREF:
1800 case OP_SET_SOM:
1801 case OP_SKIP:
1802 case OP_SOD:
1803 case OP_SOM:
1804 case OP_THEN:
1805 case OP_WORD_BOUNDARY:
1806 cc += PRIV(OP_lengths)[*cc]; /* zero-width items: table length only, branchlength untouched */
1807 break;
1808
1809 /* Handle literal characters */
1810
1811 case OP_CHAR:
1812 case OP_CHARI:
1813 case OP_NOT:
1814 case OP_NOTI:
1815 branchlength++;
1816 cc += 2; /* opcode plus the first code unit of the character */
1817 #ifdef SUPPORT_UTF
1818 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); /* cc[-1] is the character's first unit; skip its trailing units */
1819 #endif
1820 break;
1821
1822 /* Handle exact repetitions. The count is already in characters, but we
1823 need to skip over a multibyte character in UTF8 mode. */
1824
1825 case OP_EXACT:
1826 case OP_EXACTI:
1827 case OP_NOTEXACT:
1828 case OP_NOTEXACTI:
1829 branchlength += (int)GET2(cc,1);
1830 cc += 2 + IMM2_SIZE; /* opcode, repeat count, first code unit of the character */
1831 #ifdef SUPPORT_UTF
1832 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1833 #endif
1834 break;
1835
1836 case OP_TYPEEXACT:
1837 branchlength += GET2(cc,1);
1838 if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1839 cc += 2; /* a property type is followed by two extra units */
1840 cc += 1 + IMM2_SIZE + 1; /* opcode, repeat count, type opcode */
1841 break;
1842
1843 /* Handle single-char matchers */
1844
1845 case OP_PROP:
1846 case OP_NOTPROP:
1847 cc += 2; /* the two property parameters; the cc++ shared below covers the opcode itself */
1848 /* Fall through */
1849
1850 case OP_HSPACE:
1851 case OP_VSPACE:
1852 case OP_NOT_HSPACE:
1853 case OP_NOT_VSPACE:
1854 case OP_NOT_DIGIT:
1855 case OP_DIGIT:
1856 case OP_NOT_WHITESPACE:
1857 case OP_WHITESPACE:
1858 case OP_NOT_WORDCHAR:
1859 case OP_WORDCHAR:
1860 case OP_ANY:
1861 case OP_ALLANY:
1862 branchlength++;
1863 cc++;
1864 break;
1865
1866 /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1867 otherwise \C is coded as OP_ALLANY. */
1868
1869 case OP_ANYBYTE:
1870 return -2;
1871
1872 /* Check a class for variable quantification */
1873
1874 case OP_CLASS:
1875 case OP_NCLASS:
1876 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1877 case OP_XCLASS:
1878 /* The original code caused an unsigned overflow in 64 bit systems,
1879 so now we use a conditional statement. */
1880 if (op == OP_XCLASS)
1881 cc += GET(cc, 1); /* an extended class stores its own total length */
1882 else
1883 cc += PRIV(OP_lengths)[OP_CLASS];
1884 #else
1885 cc += PRIV(OP_lengths)[OP_CLASS];
1886 #endif
1887
1888 switch (*cc) /* cc now points at whatever follows the class: possibly a repeat item */
1889 {
1890 case OP_CRSTAR:
1891 case OP_CRMINSTAR:
1892 case OP_CRPLUS:
1893 case OP_CRMINPLUS:
1894 case OP_CRQUERY:
1895 case OP_CRMINQUERY:
1896 case OP_CRPOSSTAR:
1897 case OP_CRPOSPLUS:
1898 case OP_CRPOSQUERY:
1899 return -1;
1900
1901 case OP_CRRANGE:
1902 case OP_CRMINRANGE:
1903 case OP_CRPOSRANGE:
1904 if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1; /* the two stored counts differ: variable repeat */
1905 branchlength += (int)GET2(cc,1);
1906 cc += 1 + 2 * IMM2_SIZE; /* repeat opcode plus both counts */
1907 break;
1908
1909 default:
1910 branchlength++; /* no repeat item follows: the class matches exactly one character; cc is left on the next item */
1911 }
1912 break;
1913
1914 /* Anything else is variable length */
1915
1916 case OP_ANYNL:
1917 case OP_BRAMINZERO:
1918 case OP_BRAPOS:
1919 case OP_BRAPOSZERO:
1920 case OP_BRAZERO:
1921 case OP_CBRAPOS:
1922 case OP_EXTUNI:
1923 case OP_KETRMAX:
1924 case OP_KETRMIN:
1925 case OP_KETRPOS:
1926 case OP_MINPLUS:
1927 case OP_MINPLUSI:
1928 case OP_MINQUERY:
1929 case OP_MINQUERYI:
1930 case OP_MINSTAR:
1931 case OP_MINSTARI:
1932 case OP_MINUPTO:
1933 case OP_MINUPTOI:
1934 case OP_NOTMINPLUS:
1935 case OP_NOTMINPLUSI:
1936 case OP_NOTMINQUERY:
1937 case OP_NOTMINQUERYI:
1938 case OP_NOTMINSTAR:
1939 case OP_NOTMINSTARI:
1940 case OP_NOTMINUPTO:
1941 case OP_NOTMINUPTOI:
1942 case OP_NOTPLUS:
1943 case OP_NOTPLUSI:
1944 case OP_NOTPOSPLUS:
1945 case OP_NOTPOSPLUSI:
1946 case OP_NOTPOSQUERY:
1947 case OP_NOTPOSQUERYI:
1948 case OP_NOTPOSSTAR:
1949 case OP_NOTPOSSTARI:
1950 case OP_NOTPOSUPTO:
1951 case OP_NOTPOSUPTOI:
1952 case OP_NOTQUERY:
1953 case OP_NOTQUERYI:
1954 case OP_NOTSTAR:
1955 case OP_NOTSTARI:
1956 case OP_NOTUPTO:
1957 case OP_NOTUPTOI:
1958 case OP_PLUS:
1959 case OP_PLUSI:
1960 case OP_POSPLUS:
1961 case OP_POSPLUSI:
1962 case OP_POSQUERY:
1963 case OP_POSQUERYI:
1964 case OP_POSSTAR:
1965 case OP_POSSTARI:
1966 case OP_POSUPTO:
1967 case OP_POSUPTOI:
1968 case OP_QUERY:
1969 case OP_QUERYI:
1970 case OP_REF:
1971 case OP_REFI:
1972 case OP_DNREF:
1973 case OP_DNREFI:
1974 case OP_SBRA:
1975 case OP_SBRAPOS:
1976 case OP_SCBRA:
1977 case OP_SCBRAPOS:
1978 case OP_SCOND:
1979 case OP_SKIPZERO:
1980 case OP_STAR:
1981 case OP_STARI:
1982 case OP_TYPEMINPLUS:
1983 case OP_TYPEMINQUERY:
1984 case OP_TYPEMINSTAR:
1985 case OP_TYPEMINUPTO:
1986 case OP_TYPEPLUS:
1987 case OP_TYPEPOSPLUS:
1988 case OP_TYPEPOSQUERY:
1989 case OP_TYPEPOSSTAR:
1990 case OP_TYPEPOSUPTO:
1991 case OP_TYPEQUERY:
1992 case OP_TYPESTAR:
1993 case OP_TYPEUPTO:
1994 case OP_UPTO:
1995 case OP_UPTOI:
1996 return -1;
1997
1998 /* Catch unrecognized opcodes so that when new ones are added they
1999 are not forgotten, as has happened in the past. */
2000
2001 default:
2002 return -4;
2003 }
2004 }
2005 /* Control never gets here */
2006 }
2007
2008
2009
2010 /*************************************************
2011 * Scan compiled regex for specific bracket *
2012 *************************************************/
2013
2014 /* This little function scans through a compiled pattern until it finds a
2015 capturing bracket with the given number, or, if the number is negative, an
2016 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
2017 so that it can be called from pcre_study() when finding the minimum matching
2018 length.
2019
2020 Arguments:
2021 code points to start of expression
2022 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2023 number the required bracket number or negative to find a lookbehind
2024
2025 Returns: pointer to the opcode for the bracket, or NULL if not found
2026 */
2027
2028 const pcre_uchar *
2029 PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2030 {
2031 for (;;)
2032 {
2033 register pcre_uchar c = *code;
2034
2035 if (c == OP_END) return NULL;
2036
2037 /* XCLASS is used for classes that cannot be represented just by a bit
2038 map. This includes negated single high-valued characters. The length in
2039 the table is zero; the actual length is stored in the compiled code. */
2040
2041 if (c == OP_XCLASS) code += GET(code, 1);
2042
2043 /* Handle recursion */
2044
2045 else if (c == OP_REVERSE)
2046 {
2047 if (number < 0) return (pcre_uchar *)code;
2048 code += PRIV(OP_lengths)[c];
2049 }
2050
2051 /* Handle capturing bracket */
2052
2053 else if (c == OP_CBRA || c == OP_SCBRA ||
2054 c == OP_CBRAPOS || c == OP_SCBRAPOS)
2055 {
2056 int n = (int)GET2(code, 1+LINK_SIZE);
2057 if (n == number) return (pcre_uchar *)code;
2058 code += PRIV(OP_lengths)[c];
2059 }
2060
2061 /* Otherwise, we can get the item's length from the table, except that for
2062 repeated character types, we have to test for \p and \P, which have an extra
2063 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2064 must add in its length. */
2065
2066 else
2067 {
2068 switch(c)
2069 {
2070 case OP_TYPESTAR:
2071 case OP_TYPEMINSTAR:
2072 case OP_TYPEPLUS:
2073 case OP_TYPEMINPLUS:
2074 case OP_TYPEQUERY:
2075 case OP_TYPEMINQUERY:
2076 case OP_TYPEPOSSTAR:
2077 case OP_TYPEPOSPLUS:
2078 case OP_TYPEPOSQUERY:
2079 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2080 break;
2081
2082 case OP_TYPEUPTO:
2083 case OP_TYPEMINUPTO:
2084 case OP_TYPEEXACT:
2085 case OP_TYPEPOSUPTO:
2086 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2087 code += 2;
2088 break;
2089
2090 case OP_MARK:
2091 case OP_PRUNE_ARG:
2092 case OP_SKIP_ARG:
2093 case OP_THEN_ARG:
2094 code += code[1];
2095 break;
2096 }
2097
2098 /* Add in the fixed length from the table */
2099
2100 code += PRIV(OP_lengths)[c];
2101
2102 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2103 a multi-byte character. The length in the table is a minimum, so we have to
2104 arrange to skip the extra bytes. */
2105
2106 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2107 if (utf) switch(c)
2108 {
2109 case OP_CHAR:
2110 case OP_CHARI:
2111 case OP_EXACT:
2112 case OP_EXACTI:
2113 case OP_UPTO:
2114 case OP_UPTOI:
2115 case OP_MINUPTO:
2116 case OP_MINUPTOI:
2117 case OP_POSUPTO:
2118 case OP_POSUPTOI:
2119 case OP_STAR:
2120 case OP_STARI:
2121 case OP_MINSTAR:
2122 case OP_MINSTARI:
2123 case OP_POSSTAR:
2124 case OP_POSSTARI:
2125 case OP_PLUS:
2126 case OP_PLUSI:
2127 case OP_MINPLUS:
2128 case OP_MINPLUSI:
2129 case OP_POSPLUS:
2130 case OP_POSPLUSI:
2131 case OP_QUERY:
2132 case OP_QUERYI:
2133 case OP_MINQUERY:
2134 case OP_MINQUERYI:
2135 case OP_POSQUERY:
2136 case OP_POSQUERYI:
2137 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2138 break;
2139 }
2140 #else
2141 (void)(utf); /* Keep compiler happy by referencing function argument */
2142 #endif
2143 }
2144 }
2145 }
2146
2147
2148
2149 /*************************************************
2150 * Scan compiled regex for recursion reference *
2151 *************************************************/
2152
2153 /* This little function scans through a compiled pattern until it finds an
2154 instance of OP_RECURSE.
2155
2156 Arguments:
2157 code points to start of expression
2158 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2159
2160 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
2161 */
2162
2163 static const pcre_uchar *
2164 find_recurse(const pcre_uchar *code, BOOL utf)
2165 {
2166 for (;;)
2167 {
2168 register pcre_uchar c = *code;
2169 if (c == OP_END) return NULL;
2170 if (c == OP_RECURSE) return code;
2171
2172 /* XCLASS is used for classes that cannot be represented just by a bit
2173 map. This includes negated single high-valued characters. The length in
2174 the table is zero; the actual length is stored in the compiled code. */
2175
2176 if (c == OP_XCLASS) code += GET(code, 1);
2177
2178 /* Otherwise, we can get the item's length from the table, except that for
2179 repeated character types, we have to test for \p and \P, which have an extra
2180 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2181 must add in its length. */
2182
2183 else
2184 {
2185 switch(c)
2186 {
2187 case OP_TYPESTAR:
2188 case OP_TYPEMINSTAR:
2189 case OP_TYPEPLUS:
2190 case OP_TYPEMINPLUS:
2191 case OP_TYPEQUERY:
2192 case OP_TYPEMINQUERY:
2193 case OP_TYPEPOSSTAR:
2194 case OP_TYPEPOSPLUS:
2195 case OP_TYPEPOSQUERY:
2196 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2197 break;
2198
2199 case OP_TYPEPOSUPTO:
2200 case OP_TYPEUPTO:
2201 case OP_TYPEMINUPTO:
2202 case OP_TYPEEXACT:
2203 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2204 code += 2;
2205 break;
2206
2207 case OP_MARK:
2208 case OP_PRUNE_ARG:
2209 case OP_SKIP_ARG:
2210 case OP_THEN_ARG:
2211 code += code[1];
2212 break;
2213 }
2214
2215 /* Add in the fixed length from the table */
2216
2217 code += PRIV(OP_lengths)[c];
2218
2219 /* In UTF-8 mode, opcodes that are followed by a character may be followed
2220 by a multi-byte character. The length in the table is a minimum, so we have
2221 to arrange to skip the extra bytes. */
2222
2223 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2224 if (utf) switch(c)
2225 {
2226 case OP_CHAR:
2227 case OP_CHARI:
2228 case OP_NOT:
2229 case OP_NOTI:
2230 case OP_EXACT:
2231 case OP_EXACTI:
2232 case OP_NOTEXACT:
2233 case OP_NOTEXACTI:
2234 case OP_UPTO:
2235 case OP_UPTOI:
2236 case OP_NOTUPTO:
2237 case OP_NOTUPTOI:
2238 case OP_MINUPTO:
2239 case OP_MINUPTOI:
2240 case OP_NOTMINUPTO:
2241 case OP_NOTMINUPTOI:
2242 case OP_POSUPTO:
2243 case OP_POSUPTOI:
2244 case OP_NOTPOSUPTO:
2245 case OP_NOTPOSUPTOI:
2246 case OP_STAR:
2247 case OP_STARI:
2248 case OP_NOTSTAR:
2249 case OP_NOTSTARI:
2250 case OP_MINSTAR:
2251 case OP_MINSTARI:
2252 case OP_NOTMINSTAR:
2253 case OP_NOTMINSTARI:
2254 case OP_POSSTAR:
2255 case OP_POSSTARI:
2256 case OP_NOTPOSSTAR:
2257 case OP_NOTPOSSTARI:
2258 case OP_PLUS:
2259 case OP_PLUSI:
2260 case OP_NOTPLUS:
2261 case OP_NOTPLUSI:
2262 case OP_MINPLUS:
2263 case OP_MINPLUSI:
2264 case OP_NOTMINPLUS:
2265 case OP_NOTMINPLUSI:
2266 case OP_POSPLUS:
2267 case OP_POSPLUSI:
2268 case OP_NOTPOSPLUS:
2269 case OP_NOTPOSPLUSI:
2270 case OP_QUERY:
2271 case OP_QUERYI:
2272 case OP_NOTQUERY:
2273 case OP_NOTQUERYI:
2274 case OP_MINQUERY:
2275 case OP_MINQUERYI:
2276 case OP_NOTMINQUERY:
2277 case OP_NOTMINQUERYI:
2278 case OP_POSQUERY:
2279 case OP_POSQUERYI:
2280 case OP_NOTPOSQUERY:
2281 case OP_NOTPOSQUERYI:
2282 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2283 break;
2284 }
2285 #else
2286 (void)(utf); /* Keep compiler happy by referencing function argument */
2287 #endif
2288 }
2289 }
2290 }
2291
2292
2293
2294 /*************************************************
2295 * Scan compiled branch for non-emptiness *
2296 *************************************************/
2297
2298 /* This function scans through a branch of a compiled pattern to see whether it
2299 can match the empty string or not. It is called from could_be_empty()
2300 below and from compile_branch() when checking for an unlimited repeat of a
2301 group that can match nothing. Note that first_significant_code() skips over
2302 backward and negative forward assertions when its final argument is TRUE. If we
2303 hit an unclosed bracket, we return "empty" - this means we've struck an inner
2304 bracket whose current branch will already have been scanned.
2305
2306 Arguments:
2307 code points to start of search
2308 endcode points to where to stop
2309 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2310 cd contains pointers to tables etc.
2311 recurses chain of recurse_check to catch mutual recursion
2312
2313 Returns: TRUE if what is matched could be empty
2314 */
2315
2316 typedef struct recurse_check { /* one link in the chain of subroutine groups currently being scanned */
2317 struct recurse_check *prev; /* link for the enclosing call, or NULL at the end of the chain */
2318 const pcre_uchar *group; /* start of the called group; compared with a RECURSE target to spot mutual recursion */
2319 } recurse_check;
2320
2321 static BOOL
2322 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2323 BOOL utf, compile_data *cd, recurse_check *recurses)
2324 {
2325 register pcre_uchar c;
2326 recurse_check this_recurse;
2327
2328 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2329 code < endcode;
2330 code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2331 {
2332 const pcre_uchar *ccode;
2333
2334 c = *code;
2335
2336 /* Skip over forward assertions; the other assertions are skipped by
2337 first_significant_code() with a TRUE final argument. */
2338
2339 if (c == OP_ASSERT)
2340 {
2341 do code += GET(code, 1); while (*code == OP_ALT);
2342 c = *code;
2343 continue;
2344 }
2345
2346 /* For a recursion/subroutine call, if its end has been reached, which
2347 implies a backward reference subroutine call, we can scan it. If it's a
2348 forward reference subroutine call, we can't. To detect forward reference
2349 we have to scan up the list that is kept in the workspace. This function is
2350 called only when doing the real compile, not during the pre-compile that
2351 measures the size of the compiled pattern. */
2352
2353 if (c == OP_RECURSE)
2354 {
2355 const pcre_uchar *scode = cd->start_code + GET(code, 1);
2356 BOOL empty_branch;
2357
2358 /* Test for forward reference or uncompleted reference. This is disabled
2359 when called to scan a completed pattern by setting cd->start_workspace to
2360 NULL. */
2361
2362 if (cd->start_workspace != NULL)
2363 {
2364 const pcre_uchar *tcode;
2365 for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2366 if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2367 if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
2368 }
2369
2370 /* If we are scanning a completed pattern, there are no forward references
2371 and all groups are complete. We need to detect whether this is a recursive
2372 call, as otherwise there will be an infinite loop. If it is a recursion,
2373 just skip over it. Simple recursions are easily detected. For mutual
2374 recursions we keep a chain on the stack. */
2375
2376 else
2377 {
2378 recurse_check *r = recurses;
2379 const pcre_uchar *endgroup = scode;
2380
2381 do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2382 if (code >= scode && code <= endgroup) continue; /* Simple recursion */
2383
2384 for (r = recurses; r != NULL; r = r->prev)
2385 if (r->group == scode) break;
2386 if (r != NULL) continue; /* Mutual recursion */
2387 }
2388
2389 /* Completed reference; scan the referenced group, remembering it on the
2390 stack chain to detect mutual recursions. */
2391
2392 empty_branch = FALSE;
2393 this_recurse.prev = recurses;
2394 this_recurse.group = scode;
2395
2396 do
2397 {
2398 if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
2399 {
2400 empty_branch = TRUE;
2401 break;
2402 }
2403 scode += GET(scode, 1);
2404 }
2405 while (*scode == OP_ALT);
2406
2407 if (!empty_branch) return FALSE; /* All branches are non-empty */
2408 continue;
2409 }
2410
2411 /* Groups with zero repeats can of course be empty; skip them. */
2412
2413 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2414 c == OP_BRAPOSZERO)
2415 {
2416 code += PRIV(OP_lengths)[c];
2417 do code += GET(code, 1); while (*code == OP_ALT);
2418 c = *code;
2419 continue;
2420 }
2421
2422 /* A nested group that is already marked as "could be empty" can just be
2423 skipped. */
2424
2425 if (c == OP_SBRA || c == OP_SBRAPOS ||
2426 c == OP_SCBRA || c == OP_SCBRAPOS)
2427 {
2428 do code += GET(code, 1); while (*code == OP_ALT);
2429 c = *code;
2430 continue;
2431 }
2432
2433 /* For other groups, scan the branches. */
2434
2435 if (c == OP_BRA || c == OP_BRAPOS ||
2436 c == OP_CBRA || c == OP_CBRAPOS ||
2437 c == OP_ONCE || c == OP_ONCE_NC ||
2438 c == OP_COND)
2439 {
2440 BOOL empty_branch;
2441 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
2442
2443 /* If a conditional group has only one branch, there is a second, implied,
2444 empty branch, so just skip over the conditional, because it could be empty.
2445 Otherwise, scan the individual branches of the group. */
2446
2447 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2448 code += GET(code, 1);
2449 else
2450 {
2451 empty_branch = FALSE;
2452 do
2453 {
2454 if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd, NULL))
2455 empty_branch = TRUE;
2456 code += GET(code, 1);
2457 }
2458 while (*code == OP_ALT);
2459 if (!empty_branch) return FALSE; /* All branches are non-empty */
2460 }
2461
2462 c = *code;
2463 continue;
2464 }
2465
2466 /* Handle the other opcodes */
2467
2468 switch (c)
2469 {
2470 /* Check for quantifiers after a class. XCLASS is used for classes that
2471 cannot be represented just by a bit map. This includes negated single
2472 high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2473 actual length is stored in the compiled code, so we must update "code"
2474 here. */
2475
2476 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2477 case OP_XCLASS:
2478 ccode = code += GET(code, 1);
2479 goto CHECK_CLASS_REPEAT;
2480 #endif
2481
2482 case OP_CLASS:
2483 case OP_NCLASS:
2484 ccode = code + PRIV(OP_lengths)[OP_CLASS];
2485
2486 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2487 CHECK_CLASS_REPEAT:
2488 #endif
2489
2490 switch (*ccode)
2491 {
2492 case OP_CRSTAR: /* These could be empty; continue */
2493 case OP_CRMINSTAR:
2494 case OP_CRQUERY:
2495 case OP_CRMINQUERY:
2496 case OP_CRPOSSTAR:
2497 case OP_CRPOSQUERY:
2498 break;
2499
2500 default: /* Non-repeat => class must match */
2501 case OP_CRPLUS: /* These repeats aren't empty */
2502 case OP_CRMINPLUS:
2503 case OP_CRPOSPLUS:
2504 return FALSE;
2505
2506 case OP_CRRANGE:
2507 case OP_CRMINRANGE:
2508 case OP_CRPOSRANGE:
2509 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2510 break;
2511 }
2512 break;
2513
2514 /* Opcodes that must match a character */
2515
2516 case OP_ANY:
2517 case OP_ALLANY:
2518 case OP_ANYBYTE:
2519
2520 case OP_PROP:
2521 case OP_NOTPROP:
2522 case OP_ANYNL:
2523
2524 case OP_NOT_HSPACE:
2525 case OP_HSPACE:
2526 case OP_NOT_VSPACE:
2527 case OP_VSPACE:
2528 case OP_EXTUNI:
2529
2530 case OP_NOT_DIGIT:
2531 case OP_DIGIT:
2532 case OP_NOT_WHITESPACE:
2533 case OP_WHITESPACE:
2534 case OP_NOT_WORDCHAR:
2535 case OP_WORDCHAR:
2536
2537 case OP_CHAR:
2538 case OP_CHARI:
2539 case OP_NOT:
2540 case OP_NOTI:
2541
2542 case OP_PLUS:
2543 case OP_PLUSI:
2544 case OP_MINPLUS:
2545 case OP_MINPLUSI:
2546
2547 case OP_NOTPLUS:
2548 case OP_NOTPLUSI:
2549 case OP_NOTMINPLUS:
2550 case OP_NOTMINPLUSI:
2551
2552 case OP_POSPLUS:
2553 case OP_POSPLUSI:
2554 case OP_NOTPOSPLUS:
2555 case OP_NOTPOSPLUSI:
2556
2557 case OP_EXACT:
2558 case OP_EXACTI:
2559 case OP_NOTEXACT:
2560 case OP_NOTEXACTI:
2561
2562 case OP_TYPEPLUS:
2563 case OP_TYPEMINPLUS:
2564 case OP_TYPEPOSPLUS:
2565 case OP_TYPEEXACT:
2566
2567 return FALSE;
2568
2569 /* These are going to continue, as they may be empty, but we have to
2570 fudge the length for the \p and \P cases. */
2571
2572 case OP_TYPESTAR:
2573 case OP_TYPEMINSTAR:
2574 case OP_TYPEPOSSTAR:
2575 case OP_TYPEQUERY:
2576 case OP_TYPEMINQUERY:
2577 case OP_TYPEPOSQUERY:
2578 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2579 break;
2580
2581 /* Same for these */
2582
2583 case OP_TYPEUPTO:
2584 case OP_TYPEMINUPTO:
2585 case OP_TYPEPOSUPTO:
2586 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2587 code += 2;
2588 break;
2589
2590 /* End of branch */
2591
2592 case OP_KET:
2593 case OP_KETRMAX:
2594 case OP_KETRMIN:
2595 case OP_KETRPOS:
2596 case OP_ALT:
2597 return TRUE;
2598
2599 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2600 MINUPTO, and POSUPTO and their caseless and negative versions may be
2601 followed by a multibyte character. */
2602
2603 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2604 case OP_STAR:
2605 case OP_STARI:
2606 case OP_NOTSTAR:
2607 case OP_NOTSTARI:
2608
2609 case OP_MINSTAR:
2610 case OP_MINSTARI:
2611 case OP_NOTMINSTAR:
2612 case OP_NOTMINSTARI:
2613
2614 case OP_POSSTAR:
2615 case OP_POSSTARI:
2616 case OP_NOTPOSSTAR:
2617 case OP_NOTPOSSTARI:
2618
2619 case OP_QUERY:
2620 case OP_QUERYI:
2621 case OP_NOTQUERY:
2622 case OP_NOTQUERYI:
2623
2624 case OP_MINQUERY:
2625 case OP_MINQUERYI:
2626 case OP_NOTMINQUERY:
2627 case OP_NOTMINQUERYI:
2628
2629 case OP_POSQUERY:
2630 case OP_POSQUERYI:
2631 case OP_NOTPOSQUERY:
2632 case OP_NOTPOSQUERYI:
2633
2634 if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2635 break;
2636
2637 case OP_UPTO:
2638 case OP_UPTOI:
2639 case OP_NOTUPTO:
2640 case OP_NOTUPTOI:
2641
2642 case OP_MINUPTO:
2643 case OP_MINUPTOI:
2644 case OP_NOTMINUPTO:
2645 case OP_NOTMINUPTOI:
2646
2647 case OP_POSUPTO:
2648 case OP_POSUPTOI:
2649 case OP_NOTPOSUPTO:
2650 case OP_NOTPOSUPTOI:
2651
2652 if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2653 break;
2654 #endif
2655
2656 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2657 string. */
2658
2659 case OP_MARK:
2660 case OP_PRUNE_ARG:
2661 case OP_SKIP_ARG:
2662 case OP_THEN_ARG:
2663 code += code[1];
2664 break;
2665
2666 /* None of the remaining opcodes are required to match a character. */
2667
2668 default:
2669 break;
2670 }
2671 }
2672
2673 return TRUE;
2674 }
2675
2676
2677
2678 /*************************************************
2679 * Scan compiled regex for non-emptiness *
2680 *************************************************/
2681
2682 /* This function is called to check for left recursive calls. We want to check
2683 the current branch of the current pattern to see if it could match the empty
2684 string. If it could, we must look outwards for branches at other levels,
2685 stopping when we pass beyond the bracket which is the subject of the recursion.
2686 This function is called only during the real compile, not during the
2687 pre-compile.
2688
2689 Arguments:
2690 code points to start of the recursion
2691 endcode points to where to stop (current RECURSE item)
2692 bcptr points to the chain of current (unclosed) branch starts
2693 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2694 cd pointers to tables etc
2695
2696 Returns: TRUE if what is matched could be empty
2697 */
2698
2699 static BOOL
2700 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2701 branch_chain *bcptr, BOOL utf, compile_data *cd)
2702 {
2703 while (bcptr != NULL && bcptr->current_branch >= code)
2704 {
2705 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL))
2706 return FALSE;
2707 bcptr = bcptr->outer;
2708 }
2709 return TRUE;
2710 }
2711
2712
2713
2714 /*************************************************
2715 * Base opcode of repeated opcodes *
2716 *************************************************/
2717
2718 /* Returns the base opcode for repeated single character type opcodes. If the
2719 opcode is not a repeated character type, it returns with the original value.
2720
2721 Arguments: c opcode
2722 Returns: base opcode for the type
2723 */
2724
2725 static pcre_uchar
2726 get_repeat_base(pcre_uchar c)
2727 {
2728 return (c > OP_TYPEPOSUPTO)? c :
2729 (c >= OP_TYPESTAR)? OP_TYPESTAR :
2730 (c >= OP_NOTSTARI)? OP_NOTSTARI :
2731 (c >= OP_NOTSTAR)? OP_NOTSTAR :
2732 (c >= OP_STARI)? OP_STARI :
2733 OP_STAR;
2734 }
2735
2736
2737
2738 #ifdef SUPPORT_UCP
2739 /*************************************************
2740 * Check a character and a property *
2741 *************************************************/
2742
2743 /* This function is called by check_auto_possessive() when a property item
2744 is adjacent to a fixed character.
2745
2746 Arguments:
2747 c the character
2748 ptype the property type
2749 pdata the data for the type
2750 negated TRUE if it's a negated property (\P or \p{^)
2751
2752 Returns: TRUE if auto-possessifying is OK
2753 */
2754
2755 static BOOL
2756 check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
2757 BOOL negated)
2758 {
2759 const pcre_uint32 *p;
2760 const ucd_record *prop = GET_UCD(c);
2761
2762 switch(ptype)
2763 {
2764 case PT_LAMP:
2765 return (prop->chartype == ucp_Lu ||
2766 prop->chartype == ucp_Ll ||
2767 prop->chartype == ucp_Lt) == negated;
2768
2769 case PT_GC:
2770 return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2771
2772 case PT_PC:
2773 return (pdata == prop->chartype) == negated;
2774
2775 case PT_SC:
2776 return (pdata == prop->script) == negated;
2777
2778 /* These are specials */
2779
2780 case PT_ALNUM:
2781 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2782 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2783
2784 /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
2785 means that Perl space and POSIX space are now identical. PCRE was changed
2786 at release 8.34. */
2787
2788 case PT_SPACE: /* Perl space */
2789 case PT_PXSPACE: /* POSIX space */
2790 switch(c)
2791 {
2792 HSPACE_CASES:
2793 VSPACE_CASES:
2794 return negated;
2795
2796 default:
2797 return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
2798 }
2799 break; /* Control never reaches here */
2800
2801 case PT_WORD:
2802 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2803 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2804 c == CHAR_UNDERSCORE) == negated;
2805
2806 case PT_CLIST:
2807 p = PRIV(ucd_caseless_sets) + prop->caseset;
2808 for (;;)
2809 {
2810 if (c < *p) return !negated;
2811 if (c == *p++) return negated;
2812 }
2813 break; /* Control never reaches here */
2814 }
2815
2816 return FALSE;
2817 }
2818 #endif /* SUPPORT_UCP */
2819
2820
2821
2822 /*************************************************
2823 * Fill the character property list *
2824 *************************************************/
2825
2826 /* Checks whether the code points to an opcode that can take part in auto-
2827 possessification, and if so, fills a list with its properties.
2828
2829 Arguments:
2830 code points to start of expression
2831 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2832 fcc points to case-flipping table
2833 list points to output list
2834 list[0] will be filled with the opcode
2835 list[1] will be non-zero if this opcode
2836 can match an empty character string
2837 list[2..7] depends on the opcode
2838
2839 Returns: points to the start of the next opcode if *code is accepted
2840 NULL if *code is not accepted
2841 */
2842
2843 static const pcre_uchar *
2844 get_chr_property_list(const pcre_uchar *code, BOOL utf,
2845 const pcre_uint8 *fcc, pcre_uint32 *list)
2846 {
2847 pcre_uchar c = *code;
2848 const pcre_uchar *end;
2849 const pcre_uint32 *clist_src;
2850 pcre_uint32 *clist_dest;
2851 pcre_uint32 chr;
2852 pcre_uchar base;
2853
2854 list[0] = c;
2855 list[1] = FALSE;
2856 code++;
2857
2858 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
2859 {
2860 base = get_repeat_base(c);
2861 c -= (base - OP_STAR);
2862
2863 if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
2864 code += IMM2_SIZE;
2865
2866 list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
2867
2868 switch(base)
2869 {
2870 case OP_STAR:
2871 list[0] = OP_CHAR;
2872 break;
2873
2874 case OP_STARI:
2875 list[0] = OP_CHARI;
2876 break;
2877
2878 case OP_NOTSTAR:
2879 list[0] = OP_NOT;
2880 break;
2881
2882 case OP_NOTSTARI:
2883 list[0] = OP_NOTI;
2884 break;
2885
2886 case OP_TYPESTAR:
2887 list[0] = *code;
2888 code++;
2889 break;
2890 }
2891 c = list[0];
2892 }
2893
2894 switch(c)
2895 {
2896 case OP_NOT_DIGIT:
2897 case OP_DIGIT:
2898 case OP_NOT_WHITESPACE:
2899 case OP_WHITESPACE:
2900 case OP_NOT_WORDCHAR:
2901 case OP_WORDCHAR:
2902 case OP_ANY:
2903 case OP_ALLANY:
2904 case OP_ANYNL:
2905 case OP_NOT_HSPACE:
2906 case OP_HSPACE:
2907 case OP_NOT_VSPACE:
2908 case OP_VSPACE:
2909 case OP_EXTUNI:
2910 case OP_EODN:
2911 case OP_EOD:
2912 case OP_DOLL:
2913 case OP_DOLLM:
2914 return code;
2915
2916 case OP_CHAR:
2917 case OP_NOT:
2918 GETCHARINCTEST(chr, code);
2919 list[2] = chr;
2920 list[3] = NOTACHAR;
2921 return code;
2922
2923 case OP_CHARI:
2924 case OP_NOTI:
2925 list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
2926 GETCHARINCTEST(chr, code);
2927 list[2] = chr;
2928
2929 #ifdef SUPPORT_UCP
2930 if (chr < 128 || (chr < 256 && !utf))
2931 list[3] = fcc[chr];
2932 else
2933 list[3] = UCD_OTHERCASE(chr);
2934 #elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
2935 list[3] = (chr < 256) ? fcc[chr] : chr;
2936 #else
2937 list[3] = fcc[chr];
2938 #endif
2939
2940 /* The othercase might be the same value. */
2941
2942 if (chr == list[3])
2943 list[3] = NOTACHAR;
2944 else
2945 list[4] = NOTACHAR;
2946 return code;
2947
2948 #ifdef SUPPORT_UCP
2949 case OP_PROP:
2950 case OP_NOTPROP:
2951 if (code[0] != PT_CLIST)
2952 {
2953 list[2] = code[0];
2954 list[3] = code[1];
2955 return code + 2;
2956 }
2957
2958 /* Convert only if we have enough space. */
2959
2960 clist_src = PRIV(ucd_caseless_sets) + code[1];
2961 clist_dest = list + 2;
2962 code += 2;
2963
2964 do {
2965 if (clist_dest >= list + 8)
2966 {
2967 /* Early return if there is not enough space. This should never
2968 happen, since all clists are shorter than 5 character now. */
2969 list[2] = code[0];
2970 list[3] = code[1];
2971 return code;
2972 }
2973 *clist_dest++ = *clist_src;
2974 }
2975 while(*clist_src++ != NOTACHAR);
2976
2977 /* All characters are stored. The terminating NOTACHAR
2978 is copied form the clist itself. */
2979
2980 list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
2981 return code;
2982 #endif
2983
2984 case OP_NCLASS:
2985 case OP_CLASS:
2986 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2987 case OP_XCLASS:
2988 if (c == OP_XCLASS)
2989 end = code + GET(code, 0) - 1;
2990 else
2991 #endif
2992 end = code + 32 / sizeof(pcre_uchar);
2993
2994 switch(*end)
2995 {
2996 case OP_CRSTAR:
2997 case OP_CRMINSTAR:
2998 case OP_CRQUERY:
2999 case OP_CRMINQUERY:
3000 case OP_CRPOSSTAR:
3001 case OP_CRPOSQUERY:
3002 list[1] = TRUE;
3003 end++;
3004 break;
3005
3006 case OP_CRPLUS:
3007 case OP_CRMINPLUS:
3008 case OP_CRPOSPLUS:
3009 end++;
3010 break;
3011
3012 case OP_CRRANGE:
3013 case OP_CRMINRANGE:
3014 case OP_CRPOSRANGE:
3015 list[1] = (GET2(end, 1) == 0);
3016 end += 1 + 2 * IMM2_SIZE;
3017 break;
3018 }
3019 list[2] = end - code;
3020 return end;
3021 }
3022 return NULL; /* Opcode not accepted */
3023 }
3024
3025
3026
3027 /*************************************************
3028 * Scan further character sets for match *
3029 *************************************************/
3030
3031 /* Checks whether the base and the current opcode have a common character, in
3032 which case the base cannot be possessified.
3033
3034 Arguments:
3035 code points to the byte code
3036 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3037 cd static compile data
3038 base_list the data list of the base opcode
3039
3040 Returns: TRUE if the auto-possessification is possible
3041 */
3042
3043 static BOOL
3044 compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
3045 const pcre_uint32 *base_list, const pcre_uchar *base_end)
3046 {
3047 pcre_uchar c;
3048 pcre_uint32 list[8];
3049 const pcre_uint32 *chr_ptr;
3050 const pcre_uint32 *ochr_ptr;
3051 const pcre_uint32 *list_ptr;
3052 const pcre_uchar *next_code;
3053 const pcre_uint8 *class_bitset;
3054 const pcre_uint32 *set1, *set2, *set_end;
3055 pcre_uint32 chr;
3056 BOOL accepted, invert_bits;
3057
3058 /* Note: the base_list[1] contains whether the current opcode has greedy
3059 (represented by a non-zero value) quantifier. This is a different from
3060 other character type lists, which stores here that the character iterator
3061 matches to an empty string (also represented by a non-zero value). */
3062
3063 for(;;)
3064 {
3065 /* All operations move the code pointer forward.
3066 Therefore infinite recursions are not possible. */
3067
3068 c = *code;
3069
3070 /* Skip over callouts */
3071
3072 if (c == OP_CALLOUT)
3073 {
3074 code += PRIV(OP_lengths)[c];
3075 continue;
3076 }
3077
3078 if (c == OP_ALT)
3079 {
3080 do code += GET(code, 1); while (*code == OP_ALT);
3081 c = *code;
3082 }
3083
3084 switch(c)
3085 {
3086 case OP_END:
3087 case OP_KETRPOS:
3088 /* TRUE only in greedy case. The non-greedy case could be replaced by
3089 an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
3090 uses more memory, which we cannot get at this stage.) */
3091
3092 return base_list[1] != 0;
3093
3094 case OP_KET:
3095 /* If the bracket is capturing, and referenced by an OP_RECURSE, or
3096 it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3097 cannot be converted to a possessive form. */
3098
3099 if (base_list[1] == 0) return FALSE;
3100
3101 switch(*(code - GET(code, 1)))
3102 {
3103 case OP_ASSERT:
3104 case OP_ASSERT_NOT:
3105 case OP_ASSERTBACK:
3106 case OP_ASSERTBACK_NOT:
3107 case OP_ONCE:
3108 case OP_ONCE_NC:
3109 /* Atomic sub-patterns and assertions can always auto-possessify their
3110 last iterator. */
3111 return TRUE;
3112 }
3113
3114 code += PRIV(OP_lengths)[c];
3115 continue;
3116
3117 case OP_ONCE:
3118 case OP_ONCE_NC:
3119 case OP_BRA:
3120 case OP_CBRA:
3121 next_code = code + GET(code, 1);
3122 code += PRIV(OP_lengths)[c];
3123
3124 while (*next_code == OP_ALT)
3125 {
3126 if (!compare_opcodes(code, utf, cd, base_list, base_end)) return FALSE;
3127 code = next_code + 1 + LINK_SIZE;
3128 next_code += GET(next_code, 1);
3129 }
3130 continue;
3131
3132 case OP_BRAZERO:
3133 case OP_BRAMINZERO:
3134
3135 next_code = code + 1;
3136 if (*next_code != OP_BRA && *next_code != OP_CBRA
3137 && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
3138
3139 do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3140
3141 /* The bracket content will be checked by the
3142 OP_BRA/OP_CBRA case above. */
3143 next_code += 1 + LINK_SIZE;
3144 if (!compare_opcodes(next_code, utf, cd, base_list, base_end))
3145 return FALSE;
3146
3147 code += PRIV(OP_lengths)[c];
3148 continue;
3149 }
3150
3151 /* Check for a supported opcode, and load its properties. */
3152
3153 code = get_chr_property_list(code, utf, cd->fcc, list);
3154 if (code == NULL) return FALSE; /* Unsupported */
3155
3156 /* If either opcode is a small character list, set pointers for comparing
3157 characters from that list with another list, or with a property. */
3158
3159 if (base_list[0] == OP_CHAR)
3160 {
3161 chr_ptr = base_list + 2;
3162 list_ptr = list;
3163 }
3164 else if (list[0] == OP_CHAR)
3165 {
3166 chr_ptr = list + 2;
3167 list_ptr = base_list;
3168 }
3169
3170 /* Character bitsets can also be compared to certain opcodes. */
3171
3172 else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
3173 #ifdef COMPILE_PCRE8
3174 /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
3175 || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
3176 #endif
3177 )
3178 {
3179 #ifdef COMPILE_PCRE8
3180 if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
3181 #else
3182 if (base_list[0] == OP_CLASS)
3183 #endif
3184 {
3185 set1 = (pcre_uint32 *)(base_end - base_list[2]);
3186 list_ptr = list;
3187 }
3188 else
3189 {
3190 set1 = (pcre_uint32 *)(code - list[2]);
3191 list_ptr = base_list;
3192 }
3193
3194 invert_bits = FALSE;
3195 switch(list_ptr[0])
3196 {
3197 case OP_CLASS:
3198 case OP_NCLASS:
3199 set2 = (pcre_uint32 *)
3200 ((list_ptr == list ? code : base_end) - list_ptr[2]);
3201 break;
3202
3203 /* OP_XCLASS cannot be supported here, because its bitset
3204 is not necessarily complete. E.g: [a-\0x{200}] is stored
3205 as a character range, and the appropriate bits are not set. */
3206
3207 case OP_NOT_DIGIT:
3208 invert_bits = TRUE;
3209 /* Fall through */
3210 case OP_DIGIT:
3211 set2 = (pcre_uint32 *)(cd->cbits + cbit_digit);
3212 break;
3213
3214 case OP_NOT_WHITESPACE:
3215 invert_bits = TRUE;
3216 /* Fall through */
3217 case OP_WHITESPACE:
3218 set2 = (pcre_uint32 *)(cd->cbits + cbit_space);
3219 break;
3220
3221 case OP_NOT_WORDCHAR:
3222 invert_bits = TRUE;
3223 /* Fall through */
3224 case OP_WORDCHAR:
3225 set2 = (pcre_uint32 *)(cd->cbits + cbit_word);
3226 break;
3227
3228 default:
3229 return FALSE;
3230 }
3231
3232 /* Compare 4 bytes to improve speed. */
3233 set_end = set1 + (32 / 4);
3234 if (invert_bits)
3235 {
3236 do
3237 {
3238 if ((*set1++ & ~(*set2++)) != 0) return FALSE;
3239 }
3240 while (set1 < set_end);
3241 }
3242 else
3243 {
3244 do
3245 {
3246 if ((*set1++ & *set2++) != 0) return FALSE;
3247 }
3248 while (set1 < set_end);
3249 }
3250
3251 if (list[1] == 0) return TRUE;
3252 /* Might be an empty repeat. */
3253 continue;
3254 }
3255
3256 /* Some property combinations also acceptable. Unicode property opcodes are
3257 processed specially; the rest can be handled with a lookup table. */
3258
3259 else
3260 {
3261 pcre_uint32 leftop, rightop;
3262
3263 leftop = base_list[0];
3264 rightop = list[0];
3265
3266 #ifdef SUPPORT_UCP
3267 accepted = FALSE; /* Always set in non-unicode case. */
3268 if (leftop == OP_PROP || leftop == OP_NOTPROP)
3269 {
3270 if (rightop == OP_EOD)
3271 accepted = TRUE;
3272 else if (rightop == OP_PROP || rightop == OP_NOTPROP)
3273 {
3274 int n;
3275 const pcre_uint8 *p;
3276 BOOL same = leftop == rightop;
3277 BOOL lisprop = leftop == OP_PROP;
3278 BOOL risprop = rightop == OP_PROP;
3279 BOOL bothprop = lisprop && risprop;
3280
3281 /* There's a table that specifies how each combination is to be
3282 processed:
3283 0 Always return FALSE (never auto-possessify)
3284 1 Character groups are distinct (possessify if both are OP_PROP)
3285 2 Check character categories in the same group (general or particular)
3286 3 Return TRUE if the two opcodes are not the same
3287 ... see comments below
3288 */
3289
3290 n = propposstab[base_list[2]][list[2]];
3291 switch(n)
3292 {
3293 case 0: break;
3294 case 1: accepted = bothprop; break;
3295 case 2: accepted = (base_list[3] == list[3]) != same; break;
3296 case 3: accepted = !same; break;
3297
3298 case 4: /* Left general category, right particular category */
3299 accepted = risprop && catposstab[base_list[3]][list[3]] == same;
3300 break;
3301
3302 case 5: /* Right general category, left particular category */
3303 accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
3304 break;
3305
3306 /* This code is logically tricky. Think hard before fiddling with it.
3307 The posspropstab table has four entries per row. Each row relates to
3308 one of PCRE's special properties such as ALNUM or SPACE or WORD.
3309 Only WORD actually needs all four entries, but using repeats for the
3310 others means they can all use the same code below.
3311
3312 The first two entries in each row are Unicode general categories, and
3313 apply always, because all the characters they include are part of the
3314 PCRE character set. The third and fourth entries are a general and a
3315 particular category, respectively, that include one or more relevant
3316 characters. One or the other is used, depending on whether the check
3317 is for a general or a particular category. However, in both cases the
3318 category contains more characters than the specials that are defined
3319 for the property being tested against. Therefore, it cannot be used
3320 in a NOTPROP case.
3321
3322 Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3323 Underscore is covered by ucp_P or ucp_Po. */
3324
3325 case 6: /* Left alphanum vs right general category */
3326 case 7: /* Left space vs right general category */
3327 case 8: /* Left word vs right general category */
3328 p = posspropstab[n-6];
3329 accepted = risprop && lisprop ==
3330 (list[3] != p[0] &&
3331 list[3] != p[1] &&
3332 (list[3] != p[2] || !lisprop));
3333 break;
3334
3335 case 9: /* Right alphanum vs left general category */
3336 case 10: /* Right space vs left general category */
3337 case 11: /* Right word vs left general category */
3338 p = posspropstab[n-9];
3339 accepted = lisprop && risprop ==
3340 (base_list[3] != p[0] &&
3341 base_list[3] != p[1] &&
3342 (base_list[3] != p[2] || !risprop));
3343 break;
3344
3345 case 12: /* Left alphanum vs right particular category */
3346 case 13: /* Left space vs right particular category */
3347 case 14: /* Left word vs right particular category */
3348 p = posspropstab[n-12];
3349 accepted = risprop && lisprop ==
3350 (catposstab[p[0]][list[3]] &&
3351 catposstab[p[1]][list[3]] &&
3352 (list[3] != p[3] || !lisprop));
3353 break;
3354
3355 case 15: /* Right alphanum vs left particular category */
3356 case 16: /* Right space vs left particular category */
3357 case 17: /* Right word vs left particular category */
3358 p = posspropstab[n-15];
3359 accepted = lisprop && risprop ==
3360 (catposstab[p[0]][base_list[3]] &&
3361 catposstab[p[1]][base_list[3]] &&
3362 (base_list[3] != p[3] || !risprop));
3363 break;
3364 }
3365 }
3366 }
3367
3368 else
3369 #endif /* SUPPORT_UCP */
3370
3371 accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3372 rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3373 autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3374
3375 if (!accepted)
3376 return FALSE;
3377
3378 if (list[1] == 0) return TRUE;
3379 /* Might be an empty repeat. */
3380 continue;
3381 }
3382
3383 /* Control reaches here only if one of the items is a small character list.
3384 All characters are checked against the other side. */
3385
3386 do
3387 {
3388 chr = *chr_ptr;
3389
3390 switch(list_ptr[0])
3391 {
3392 case OP_CHAR:
3393 ochr_ptr = list_ptr + 2;
3394 do
3395 {
3396 if (chr == *ochr_ptr) return FALSE;
3397 ochr_ptr++;
3398 }
3399 while(*ochr_ptr != NOTACHAR);
3400 break;
3401
3402 case OP_NOT:
3403 ochr_ptr = list_ptr + 2;
3404 do
3405 {
3406 if (chr == *ochr_ptr)
3407 break;
3408 ochr_ptr++;
3409 }
3410 while(*ochr_ptr != NOTACHAR);
3411 if (*ochr_ptr == NOTACHAR) return FALSE; /* Not found */
3412 break;
3413
3414 /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
3415 set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3416
3417 case OP_DIGIT:
3418 if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
3419 break;
3420
3421 case OP_NOT_DIGIT:
3422 if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
3423 break;
3424
3425 case OP_WHITESPACE:
3426 if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
3427 break;
3428
3429 case OP_NOT_WHITESPACE:
3430 if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
3431 break;
3432
3433 case OP_WORDCHAR:
3434 if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
3435 break;
3436
3437 case OP_NOT_WORDCHAR:
3438 if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
3439 break;
3440
3441 case OP_HSPACE:
3442 switch(chr)
3443 {
3444 HSPACE_CASES: return FALSE;
3445 default: break;
3446 }
3447 break;
3448
3449 case OP_NOT_HSPACE:
3450 switch(chr)
3451 {
3452 HSPACE_CASES: break;
3453 default: return FALSE;
3454 }
3455 break;
3456
3457 case OP_ANYNL:
3458 case OP_VSPACE:
3459 switch(chr)
3460 {
3461 VSPACE_CASES: return FALSE;
3462 default: break;
3463 }
3464 break;
3465
3466 case OP_NOT_VSPACE:
3467 switch(chr)
3468 {
3469 VSPACE_CASES: break;
3470 default: return FALSE;
3471 }
3472 break;
3473
3474 case OP_DOLL:
3475 case OP_EODN:
3476 switch (chr)
3477 {
3478 case CHAR_CR:
3479 case CHAR_LF:
3480 case CHAR_VT:
3481 case CHAR_FF:
3482 case CHAR_NEL:
3483 #ifndef EBCDIC
3484 case 0x2028:
3485 case 0x2029:
3486 #endif /* Not EBCDIC */
3487 return FALSE;
3488 }
3489 break;
3490
3491 case OP_EOD: /* Can always possessify before \z */
3492 break;
3493
3494 #ifdef SUPPORT_UCP
3495 case OP_PROP:
3496 case OP_NOTPROP:
3497 if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3498 list_ptr[0] == OP_NOTPROP))
3499 return FALSE;
3500 break;
3501 #endif
3502
3503 case OP_NCLASS:
3504 if (chr > 255) return FALSE;
3505 /* Fall through */
3506
3507 case OP_CLASS:
3508 if (chr > 255) break;
3509 class_bitset = (pcre_uint8 *)
3510 ((list_ptr == list ? code : base_end) - list_ptr[2]);
3511 if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
3512 break;
3513
3514 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3515 case OP_XCLASS:
3516 if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
3517 list_ptr[2] + LINK_SIZE, utf)) return FALSE;
3518 break;
3519 #endif
3520
3521 default:
3522 return FALSE;
3523 }
3524
3525 chr_ptr++;
3526 }
3527 while(*chr_ptr != NOTACHAR);
3528
3529 /* At least one character must be matched from this opcode. */
3530
3531 if (list[1] == 0) return TRUE;
3532 }
3533
3534 return FALSE;
3535 }
3536
3537
3538
3539 /*************************************************
3540 * Scan compiled regex for auto-possession *
3541 *************************************************/
3542
3543 /* Replaces single character iterations with their possessive alternatives
3544 if appropriate. This function modifies the compiled opcode!
3545
3546 Arguments:
3547 code points to start of the byte code
3548 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3549 cd static compile data
3550
3551 Returns: nothing
3552 */
3553
3554 static void
3555 auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
3556 {
3557 register pcre_uchar c;
3558 const pcre_uchar *end;
3559 pcre_uchar *repeat_opcode;
3560 pcre_uint32 list[8];
3561
3562 for (;;)
3563 {
3564 c = *code;
3565
3566 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
3567 {
3568 c -= get_repeat_base(c) - OP_STAR;
3569 end = (c <= OP_MINUPTO) ?
3570 get_chr_property_list(code, utf, cd->fcc, list) : NULL;
3571 list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3572
3573 if (end != NULL && compare_opcodes(end, utf, cd, list, end))
3574 {
3575 switch(c)
3576 {
3577 case OP_STAR:
3578 *code += OP_POSSTAR - OP_STAR;
3579 break;
3580
3581 case OP_MINSTAR:
3582 *code += OP_POSSTAR - OP_MINSTAR;
3583 break;
3584
3585 case OP_PLUS:
3586 *code += OP_POSPLUS - OP_PLUS;
3587 break;
3588
3589 case OP_MINPLUS:
3590 *code += OP_POSPLUS - OP_MINPLUS;
3591 break;
3592
3593 case OP_QUERY:
3594 *code += OP_POSQUERY - OP_QUERY;
3595 break;
3596
3597 case OP_MINQUERY:
3598 *code += OP_POSQUERY - OP_MINQUERY;
3599 break;
3600
3601 case OP_UPTO:
3602 *code += OP_POSUPTO - OP_UPTO;
3603 break;
3604
3605 case OP_MINUPTO:
3606 *code += OP_MINUPTO - OP_UPTO;
3607 break;
3608 }
3609 }
3610 c = *code;
3611 }
3612 else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
3613 {
3614 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3615 if (c == OP_XCLASS)
3616 repeat_opcode = code + GET(code, 1);
3617 else
3618 #endif
3619 repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar));
3620
3621 c = *repeat_opcode;
3622 if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
3623 {
3624 /* end must not be NULL. */
3625 end = get_chr_property_list(code, utf, cd->fcc, list);
3626
3627 list[1] = (c & 1) == 0;
3628
3629 if (compare_opcodes(end, utf, cd, list, end))
3630 {
3631 switch (c)
3632 {
3633 case OP_CRSTAR:
3634 case OP_CRMINSTAR:
3635 *repeat_opcode = OP_CRPOSSTAR;
3636 break;
3637
3638 case OP_CRPLUS:
3639 case OP_CRMINPLUS:
3640 *repeat_opcode = OP_CRPOSPLUS;
3641 break;
3642
3643 case OP_CRQUERY:
3644 case OP_CRMINQUERY:
3645 *repeat_opcode = OP_CRPOSQUERY;
3646 break;
3647
3648 case OP_CRRANGE:
3649 case OP_CRMINRANGE:
3650 *repeat_opcode = OP_CRPOSRANGE;
3651 break;
3652 }
3653 }
3654 }
3655 c = *code;
3656 }
3657
3658 switch(c)
3659 {
3660 case OP_END:
3661 return;
3662
3663 case OP_TYPESTAR:
3664 case OP_TYPEMINSTAR:
3665 case OP_TYPEPLUS:
3666 case OP_TYPEMINPLUS:
3667 case OP_TYPEQUERY:
3668 case OP_TYPEMINQUERY:
3669 case OP_TYPEPOSSTAR:
3670 case OP_TYPEPOSPLUS:
3671 case OP_TYPEPOSQUERY:
3672 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
3673 break;
3674
3675 case OP_TYPEUPTO:
3676 case OP_TYPEMINUPTO:
3677 case OP_TYPEEXACT:
3678 case OP_TYPEPOSUPTO:
3679 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
3680 code += 2;
3681 break;
3682
3683 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3684 case OP_XCLASS:
3685 code += GET(code, 1);
3686 break;
3687 #endif
3688
3689 case OP_MARK:
3690 case OP_PRUNE_ARG:
3691 case OP_SKIP_ARG:
3692 case OP_THEN_ARG:
3693 code += code[1];
3694 break;
3695 }
3696
3697 /* Add in the fixed length from the table */
3698
3699 code += PRIV(OP_lengths)[c];
3700
3701 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
3702 a multi-byte character. The length in the table is a minimum, so we have to
3703 arrange to skip the extra bytes. */
3704
3705 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
3706 if (utf) switch(c)
3707 {
3708 case OP_CHAR:
3709 case OP_CHARI:
3710 case OP_NOT:
3711 case OP_NOTI:
3712 case OP_STAR:
3713 case OP_MINSTAR:
3714 case OP_PLUS:
3715 case OP_MINPLUS:
3716 case OP_QUERY:
3717 case OP_MINQUERY:
3718 case OP_UPTO:
3719 case OP_MINUPTO:
3720 case OP_EXACT:
3721 case OP_POSSTAR:
3722 case OP_POSPLUS:
3723 case OP_POSQUERY:
3724 case OP_POSUPTO:
3725 case OP_STARI:
3726 case OP_MINSTARI:
3727 case OP_PLUSI:
3728 case OP_MINPLUSI:
3729 case OP_QUERYI:
3730 case OP_MINQUERYI:
3731 case OP_UPTOI:
3732 case OP_MINUPTOI:
3733 case OP_EXACTI:
3734 case OP_POSSTARI:
3735 case OP_POSPLUSI:
3736 case OP_POSQUERYI:
3737 case OP_POSUPTOI:
3738 case OP_NOTSTAR:
3739 case OP_NOTMINSTAR:
3740 case OP_NOTPLUS:
3741 case OP_NOTMINPLUS:
3742 case OP_NOTQUERY:
3743 case OP_NOTMINQUERY:
3744 case OP_NOTUPTO:
3745 case OP_NOTMINUPTO:
3746 case OP_NOTEXACT:
3747 case OP_NOTPOSSTAR:
3748 case OP_NOTPOSPLUS:
3749 case OP_NOTPOSQUERY:
3750 case OP_NOTPOSUPTO:
3751 case OP_NOTSTARI:
3752 case OP_NOTMINSTARI:
3753 case OP_NOTPLUSI:
3754 case OP_NOTMINPLUSI:
3755 case OP_NOTQUERYI:
3756 case OP_NOTMINQUERYI:
3757 case OP_NOTUPTOI:
3758 case OP_NOTMINUPTOI:
3759 case OP_NOTEXACTI:
3760 case OP_NOTPOSSTARI:
3761 case OP_NOTPOSPLUSI:
3762 case OP_NOTPOSQUERYI:
3763 case OP_NOTPOSUPTOI:
3764 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
3765 break;
3766 }
3767 #else
3768 (void)(utf); /* Keep compiler happy by referencing function argument */
3769 #endif
3770 }
3771 }
3772
3773
3774
3775 /*************************************************
3776 * Check for POSIX class syntax *
3777 *************************************************/
3778
3779 /* This function is called when the sequence "[:" or "[." or "[=" is
3780 encountered in a character class. It checks whether this is followed by a
3781 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
3782 reach an unescaped ']' without the special preceding character, return FALSE.
3783
3784 Originally, this function only recognized a sequence of letters between the
3785 terminators, but it seems that Perl recognizes any sequence of characters,
3786 though of course unknown POSIX names are subsequently rejected. Perl gives an
3787 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
3788 didn't consider this to be a POSIX class. Likewise for [:1234:].
3789
3790 The problem in trying to be exactly like Perl is in the handling of escapes. We
3791 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
3792 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
3793 below handles the special case of \], but does not try to do any other escape
3794 processing. This makes it different from Perl for cases such as [:l\ower:]
3795 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
3796 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
3797 I think.
3798
3799 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
3800 It seems that the appearance of a nested POSIX class supersedes an apparent
3801 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
3802 a digit.
3803
3804 In Perl, unescaped square brackets may also appear as part of class names. For
3805 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
3806 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
3807 seem right at all. PCRE does not allow closing square brackets in POSIX class
3808 names.
3809
3810 Arguments:
3811 ptr pointer to the initial [
3812 endptr where to return the end pointer
3813
3814 Returns: TRUE or FALSE
3815 */
3816
3817 static BOOL
3818 check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
3819 {
3820 pcre_uchar terminator; /* Don't combine these lines; the Solaris cc */
3821 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
3822 for (++ptr; *ptr != CHAR_NULL; ptr++)
3823 {
3824 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3825 ptr++;
3826 else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
3827 else
3828 {
3829 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3830 {
3831 *endptr = ptr;
3832 return TRUE;
3833 }
3834 if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
3835 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3836 ptr[1] == CHAR_EQUALS_SIGN) &&
3837 check_posix_syntax(ptr, endptr))
3838 return FALSE;
3839 }
3840 }
3841 return FALSE;
3842 }
3843
3844
3845
3846
3847 /*************************************************
3848 * Check POSIX class name *
3849 *************************************************/
3850
3851 /* This function is called to check the name given in a POSIX-style class entry
3852 such as [:alnum:].
3853
3854 Arguments:
3855 ptr points to the first letter
3856 len the length of the name
3857
3858 Returns: a value representing the name, or -1 if unknown
3859 */
3860
3861 static int
3862 check_posix_name(const pcre_uchar *ptr, int len)
3863 {
3864 const char *pn = posix_names;
3865 register int yield = 0;
3866 while (posix_name_lengths[yield] != 0)
3867 {
3868 if (len == posix_name_lengths[yield] &&
3869 STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
3870 pn += posix_name_lengths[yield] + 1;
3871 yield++;
3872 }
3873 return -1;
3874 }
3875
3876
3877 /*************************************************
3878 * Adjust OP_RECURSE items in repeated group *
3879 *************************************************/
3880
3881 /* OP_RECURSE items contain an offset from the start of the regex to the group
3882 that is referenced. This means that groups can be replicated for fixed
3883 repetition simply by copying (because the recursion is allowed to refer to
3884 earlier groups that are outside the current group). However, when a group is
3885 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
3886 inserted before it, after it has been compiled. This means that any OP_RECURSE
3887 items within it that refer to the group itself or any contained groups have to
3888 have their offsets adjusted. That is one of the jobs of this function. Before it
3889 is called, the partially compiled regex must be temporarily terminated with
3890 OP_END.
3891
3892 This function has been extended with the possibility of forward references for
3893 recursions and subroutine calls. It must also check the list of such references
3894 for the group we are dealing with. If it finds that one of the recursions in
3895 the current group is on this list, it adjusts the offset in the list, not the
3896 value in the reference (which is a group number).
3897
3898 Arguments:
3899 group points to the start of the group
3900 adjust the amount by which the group is to be moved
3901 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3902 cd contains pointers to tables etc.
3903 save_hwm the hwm forward reference pointer at the start of the group
3904
3905 Returns: nothing
3906 */
3907
3908 static void
3909 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
3910 pcre_uchar *save_hwm)
3911 {
3912 pcre_uchar *ptr = group;
3913
3914 while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
3915 {
3916 int offset;
3917 pcre_uchar *hc;
3918
3919 /* See if this recursion is on the forward reference list. If so, adjust the
3920 reference. */
3921
3922 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
3923 {
3924 offset = (int)GET(hc, 0);
3925 if (cd->start_code + offset == ptr + 1)
3926 {
3927 PUT(hc, 0, offset + adjust);
3928 break;
3929 }
3930 }
3931
3932 /* Otherwise, adjust the recursion offset if it's after the start of this
3933 group. */
3934
3935 if (hc >= cd->hwm)
3936 {
3937 offset = (int)GET(ptr, 1);
3938 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
3939 }
3940
3941 ptr += 1 + LINK_SIZE;
3942 }
3943 }
3944
3945
3946
3947 /*************************************************
3948 * Insert an automatic callout point *
3949 *************************************************/
3950
3951 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
3952 callout points before each pattern item.
3953
3954 Arguments:
3955 code current code pointer
3956 ptr current pattern pointer
3957 cd pointers to tables etc
3958
3959 Returns: new code pointer
3960 */
3961
3962 static pcre_uchar *
3963 auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
3964 {
3965 *code++ = OP_CALLOUT;
3966 *code++ = 255;
3967 PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
3968 PUT(code, LINK_SIZE, 0); /* Default length */
3969 return code + 2 * LINK_SIZE;
3970 }
3971
3972
3973
3974 /*************************************************
3975 * Complete a callout item *
3976 *************************************************/
3977
3978 /* A callout item contains the length of the next item in the pattern, which
3979 we can't fill in till after we have reached the relevant point. This is used
3980 for both automatic and manual callouts.
3981
3982 Arguments:
3983 previous_callout points to previous callout item
3984 ptr current pattern pointer
3985 cd pointers to tables etc
3986
3987 Returns: nothing
3988 */
3989
3990 static void
3991 complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
3992 {
3993 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
3994 PUT(previous_callout, 2 + LINK_SIZE, length);
3995 }
3996
3997
3998
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF mode
with UCP support. It walks up the characters, looking for ranges of characters
in the "other" case. Each call returns the next one, updating the start
address so that the caller can simply call again for the remainder of the
range. A character with multiple other cases is returned on its own with a
special return value.

Arguments:
  cptr        points to starting character value; updated to the first
                character that has not yet been processed
  d           end value (inclusive)
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        -1 when no more
               0 when a range is returned
              >0 the CASESET offset for char with multiple other cases
                in this case, ocptr contains the original and odptr is
                not written
*/

static int
get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
  pcre_uint32 *odptr)
{
  pcre_uint32 c, othercase, next;
  unsigned int co;

  /* Find the first character that has an other case. If it has multiple other
  cases, return its case offset value. */

  for (c = *cptr; c <= d; c++)
  {
    if ((co = UCD_CASESET(c)) != 0)
    {
      *ocptr = c++;   /* Character that has the set */
      *cptr = c;      /* Rest of input range */
      return (int)co;
    }
    if ((othercase = UCD_OTHERCASE(c)) != c) break;
  }

  if (c > d) return -1;  /* Reached end of range */

  /* Found a character that has a single other case. Search for the end of the
  run. The run stops at the end of the input range, at a character whose other
  case does not continue the sequence, or at a character that has a case set.
  The last test matters: such a character must be left for the next call so
  that it is returned on its own and all of its other cases get added, not
  just the one that happens to continue this run. */

  *ocptr = othercase;
  next = othercase + 1;

  for (++c; c <= d; c++)
  {
    if (UCD_CASESET(c) != 0 || UCD_OTHERCASE(c) != next) break;
    next++;
  }

  *odptr = next - 1;     /* End of othercase range */
  *cptr = c;             /* Rest of input range */
  return 0;
}
#endif  /* SUPPORT_UCP */
4059
4060
4061
4062 /*************************************************
4063 * Add a character or range to a class *
4064 *************************************************/
4065
4066 /* This function packages up the logic of adding a character or range of
4067 characters to a class. The character values in the arguments will be within the
4068 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4069 mutually recursive with the function immediately below.
4070
4071 Arguments:
4072 classbits the bit map for characters < 256
4073 uchardptr points to the pointer for extra data
4074 options the options word
4075 cd contains pointers to tables etc.
4076 start start of range character
4077 end end of range character
4078
4079 Returns: the number of < 256 characters added
4080 the pointer to extra data is updated
4081 */
4082
4083 static int
4084 add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4085 compile_data *cd, pcre_uint32 start, pcre_uint32 end)
4086 {
4087 pcre_uint32 c;
4088 int n8 = 0;
4089
4090 /* If caseless matching is required, scan the range and process alternate
4091 cases. In Unicode, there are 8-bit characters that have alternate cases that
4092 are greater than 255 and vice-versa. Sometimes we can just extend the original
4093 range. */
4094
4095 if ((options & PCRE_CASELESS) != 0)
4096 {
4097 #ifdef SUPPORT_UCP
4098 if ((options & PCRE_UTF8) != 0)
4099 {
4100 int rc;
4101 pcre_uint32 oc, od;
4102
4103 options &= ~PCRE_CASELESS; /* Remove for recursive calls */
4104 c = start;
4105
4106 while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
4107 {
4108 /* Handle a single character that has more than one other case. */
4109
4110 if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
4111 PRIV(ucd_caseless_sets) + rc, oc);
4112
4113 /* Do nothing if the other case range is within the original range. */
4114
4115 else if (oc >= start && od <= end) continue;
4116
4117 /* Extend the original range if there is overlap, noting that if oc < c, we
4118 can't have od > end because a subrange is always shorter than the basic
4119 range. Otherwise, use a recursive call to add the additional range. */
4120
4121 else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
4122 else if (od > end && oc <= end + 1) end = od; /* Extend upwards */
4123 else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
4124 }
4125 }
4126 else
4127 #endif /* SUPPORT_UCP */
4128
4129 /* Not UTF-mode, or no UCP */
4130
4131 for (c = start; c <= end && c < 256; c++)
4132 {
4133 SETBIT(classbits, cd->fcc[c]);
4134 n8++;
4135 }
4136 }
4137
4138 /* Now handle the original range. Adjust the final value according to the bit
4139 length - this means that the same lists of (e.g.) horizontal spaces can be used
4140 in all cases. */
4141
4142 #if defined COMPILE_PCRE8
4143 #ifdef SUPPORT_UTF
4144 if ((options & PCRE_UTF8) == 0)
4145 #endif
4146 if (end > 0xff) end = 0xff;
4147
4148 #elif defined COMPILE_PCRE16
4149 #ifdef SUPPORT_UTF
4150 if ((options & PCRE_UTF16) == 0)
4151 #endif
4152 if (end > 0xffff) end = 0xffff;
4153
4154 #endif /* COMPILE_PCRE[8|16] */
4155
4156 /* If all characters are less than 256, use the bit map. Otherwise use extra
4157 data. */
4158
4159 if (end < 0x100)
4160 {
4161 for (c = start; c <= end; c++)
4162 {
4163 n8++;
4164 SETBIT(classbits, c);
4165 }
4166 }
4167
4168 else
4169 {
4170 pcre_uchar *uchardata = *uchardptr;
4171
4172 #ifdef SUPPORT_UTF
4173 if ((options & PCRE_UTF8) != 0) /* All UTFs use the same flag bit */
4174 {
4175 if (start < end)
4176 {
4177 *uchardata++ = XCL_RANGE;
4178 uchardata += PRIV(ord2utf)(start, uchardata);
4179 uchardata += PRIV(ord2utf)(end, uchardata);
4180 }
4181 else if (start == end)
4182 {
4183 *uchardata++ = XCL_SINGLE;
4184 uchardata += PRIV(ord2utf)(start, uchardata);
4185 }
4186 }
4187 else
4188 #endif /* SUPPORT_UTF */
4189
4190 /* Without UTF support, character values are constrained by the bit length,
4191 and can only be > 256 for 16-bit and 32-bit libraries. */
4192
4193 #ifdef COMPILE_PCRE8
4194 {}
4195 #else
4196 if (start < end)
4197 {
4198 *uchardata++ = XCL_RANGE;
4199 *uchardata++ = start;
4200 *uchardata++ = end;
4201 }
4202 else if (start == end)
4203 {
4204 *uchardata++ = XCL_SINGLE;
4205 *uchardata++ = start;
4206 }
4207 #endif
4208
4209 *uchardptr = uchardata; /* Updata extra data pointer */
4210 }
4211
4212 return n8; /* Number of 8-bit characters */
4213 }
4214
4215
4216
4217
4218 /*************************************************
4219 * Add a list of characters to a class *
4220 *************************************************/
4221
4222 /* This function is used for adding a list of case-equivalent characters to a
4223 class, and also for adding a list of horizontal or vertical whitespace. If the
4224 list is in order (which it should be), ranges of characters are detected and
4225 handled appropriately. This function is mutually recursive with the function
4226 above.
4227
4228 Arguments:
4229 classbits the bit map for characters < 256
4230 uchardptr points to the pointer for extra data
4231 options the options word
4232 cd contains pointers to tables etc.
4233 p points to row of 32-bit values, terminated by NOTACHAR
4234 except character to omit; this is used when adding lists of
4235 case-equivalent characters to avoid including the one we
4236 already know about
4237
4238 Returns: the number of < 256 characters added
4239 the pointer to extra data is updated
4240 */
4241
4242 static int
4243 add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4244 compile_data *cd, const pcre_uint32 *p, unsigned int except)
4245 {
4246 int n8 = 0;
4247 while (p[0] < NOTACHAR)
4248 {
4249 int n = 0;
4250 if (p[0] != except)
4251 {
4252 while(p[n+1] == p[0] + n + 1) n++;
4253 n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
4254 }
4255 p += n + 1;
4256 }
4257 return n8;
4258 }
4259
4260
4261
4262 /*************************************************
4263 * Add characters not in a list to a class *
4264 *************************************************/
4265
4266 /* This function is used for adding the complement of a list of horizontal or
4267 vertical whitespace to a class. The list must be in order.
4268
4269 Arguments:
4270 classbits the bit map for characters < 256
4271 uchardptr points to the pointer for extra data
4272 options the options word
4273 cd contains pointers to tables etc.
4274 p points to row of 32-bit values, terminated by NOTACHAR
4275
4276 Returns: the number of < 256 characters added
4277 the pointer to extra data is updated
4278 */
4279
4280 static int
4281 add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
4282 int options, compile_data *cd, const pcre_uint32 *p)
4283 {
4284 BOOL utf = (options & PCRE_UTF8) != 0;
4285 int n8 = 0;
4286 if (p[0] > 0)
4287 n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
4288 while (p[0] < NOTACHAR)
4289 {
4290 while (p[1] == p[0] + 1) p++;
4291 n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
4292 (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
4293 p++;
4294 }
4295 return n8;
4296 }
4297
4298
4299
4300 /*************************************************
4301 * Compile one branch *
4302 *************************************************/
4303
4304 /* Scan the pattern, compiling it into a vector. If the options are
4305 changed during the branch, the pointer is used to change the external options
4306 bits. This function is used during the pre-compile phase when we are trying
4307 to find out the amount of memory needed, as well as during the real compile
4308 phase. The value of lengthptr distinguishes the two phases.
4309
4310 Arguments:
4311 optionsptr pointer to the option bits
4312 codeptr points to the pointer to the current code point
4313 ptrptr points to the current pattern pointer
4314 errorcodeptr points to error code variable
4315 firstcharptr place to put the first required character
4316 firstcharflagsptr place to put the first character flags, or a negative number
4317 reqcharptr place to put the last required character
4318 reqcharflagsptr place to put the last required character flags, or a negative number
4319 bcptr points to current branch chain
4320 cond_depth conditional nesting depth
4321 cd contains pointers to tables etc.
4322 lengthptr NULL during the real compile phase
4323 points to length accumulator during pre-compile phase
4324
4325 Returns: TRUE on success
4326 FALSE, with *errorcodeptr set non-zero on error
4327 */
4328
4329 static BOOL
4330 compile_branch(int *optionsptr, pcre_uchar **codeptr,
4331 const pcre_uchar **ptrptr, int *errorcodeptr,
4332 pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
4333 pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
4334 branch_chain *bcptr, int cond_depth,
4335 compile_data *cd, int *lengthptr)
4336 {
4337 int repeat_type, op_type;
4338 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
4339 int bravalue = 0;
4340 int greedy_default, greedy_non_default;
4341 pcre_uint32 firstchar, reqchar;
4342 pcre_int32 firstcharflags, reqcharflags;
4343 pcre_uint32 zeroreqchar, zerofirstchar;
4344 pcre_int32 zeroreqcharflags, zerofirstcharflags;
4345 pcre_int32 req_caseopt, reqvary, tempreqvary;
4346 int options = *optionsptr; /* May change dynamically */
4347 int after_manual_callout = 0;
4348 int length_prevgroup = 0;
4349 register pcre_uint32 c;
4350 int escape;
4351 register pcre_uchar *code = *codeptr;
4352 pcre_uchar *last_code = code;
4353 pcre_uchar *orig_code = code;
4354 pcre_uchar *tempcode;
4355 BOOL inescq = FALSE;
4356 BOOL groupsetfirstchar = FALSE;
4357 const pcre_uchar *ptr = *ptrptr;
4358 const pcre_uchar *tempptr;
4359 const pcre_uchar *nestptr = NULL;
4360 pcre_uchar *previous = NULL;
4361 pcre_uchar *previous_callout = NULL;
4362 pcre_uchar *save_hwm = NULL;
4363 pcre_uint8 classbits[32];
4364
4365 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
4366 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
4367 dynamically as we process the pattern. */
4368
4369 #ifdef SUPPORT_UTF
4370 /* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
4371 BOOL utf = (options & PCRE_UTF8) != 0;
4372 #ifndef COMPILE_PCRE32
4373 pcre_uchar utf_chars[6];
4374 #endif
4375 #else
4376 BOOL utf = FALSE;
4377 #endif
4378
4379 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
4380 class_uchardata always so that it can be passed to add_to_class() always,
4381 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
4382 alternative calls for the different cases. */
4383
4384 pcre_uchar *class_uchardata;
4385 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4386 BOOL xclass;
4387 pcre_uchar *class_uchardata_base;
4388 #endif
4389
4390 #ifdef PCRE_DEBUG
4391 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
4392 #endif
4393
4394 /* Set up the default and non-default settings for greediness */
4395
4396 greedy_default = ((options & PCRE_UNGREEDY) != 0);
4397 greedy_non_default = greedy_default ^ 1;
4398
4399 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
4400 matching encountered yet". It gets changed to REQ_NONE if we hit something that
4401 matches a non-fixed char first char; reqchar just remains unset if we never
4402 find one.
4403
4404 When we hit a repeat whose minimum is zero, we may have to adjust these values
4405 to take the zero repeat into account. This is implemented by setting them to
4406 zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
4407 item types that can be repeated set these backoff variables appropriately. */
4408
4409 firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
4410 firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
4411
4412 /* The variable req_caseopt contains either the REQ_CASELESS value
4413 or zero, according to the current setting of the caseless flag. The
4414 REQ_CASELESS leaves the lower 28 bit empty. It is added into the
4415 firstchar or reqchar variables to record the case status of the
4416 value. This is used only for ASCII characters. */
4417
4418 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
4419
4420 /* Switch on next character until the end of the branch */
4421
4422 for (;; ptr++)
4423 {
4424 BOOL negate_class;
4425 BOOL should_flip_negation;
4426 BOOL possessive_quantifier;
4427 BOOL is_quantifier;
4428 BOOL is_recurse;
4429 BOOL reset_bracount;
4430 int class_has_8bitchar;
4431 int class_one_char;
4432 int newoptions;
4433 int recno;
4434 int refsign;
4435 int skipbytes;
4436 pcre_uint32 subreqchar, subfirstchar;
4437 pcre_int32 subreqcharflags, subfirstcharflags;
4438 int terminator;
4439 unsigned int mclength;
4440 unsigned int tempbracount;
4441 pcre_uint32 ec;
4442 pcre_uchar mcbuffer[8];
4443
4444 /* Get next character in the pattern */
4445
4446 c = *ptr;
4447
4448 /* If we are at the end of a nested substitution, revert to the outer level
4449 string. Nesting only happens one level deep. */
4450
4451 if (c == CHAR_NULL && nestptr != NULL)
4452 {
4453 ptr = nestptr;
4454 nestptr = NULL;
4455 c = *ptr;
4456 }
4457
4458 /* If we are in the pre-compile phase, accumulate the length used for the
4459 previous cycle of this loop. */
4460
4461 if (lengthptr != NULL)
4462 {
4463 #ifdef PCRE_DEBUG
4464 if (code > cd->hwm) cd->hwm = code; /* High water info */
4465 #endif
4466 if (code > cd->start_workspace + cd->workspace_size -
4467 WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
4468 {
4469 *errorcodeptr = ERR52;
4470 goto FAILED;
4471 }
4472
4473 /* There is at least one situation where code goes backwards: this is the
4474 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
4475 the class is simply eliminated. However, it is created first, so we have to
4476 allow memory for it. Therefore, don't ever reduce the length at this point.
4477 */
4478
4479 if (code < last_code) code = last_code;
4480
4481 /* Paranoid check for integer overflow */
4482
4483 if (OFLOW_MAX - *lengthptr < code - last_code)
4484 {
4485 *errorcodeptr = ERR20;
4486 goto FAILED;
4487 }
4488
4489 *lengthptr += (int)(code - last_code);
4490 DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
4491 (int)(code - last_code), c, c));
4492
4493 /* If "previous" is set and it is not at the start of the work space, move
4494 it back to there, in order to avoid filling up the work space. Otherwise,
4495 if "previous" is NULL, reset the current code pointer to the start. */
4496
4497 if (previous != NULL)
4498 {
4499 if (previous > orig_code)
4500 {
4501 memmove(orig_code, previous, IN_UCHARS(code - previous));
4502 code -= previous - orig_code;
4503 previous = orig_code;
4504 }
4505 }
4506 else code = orig_code;
4507
4508 /* Remember where this code item starts so we can pick up the length
4509 next time round. */
4510
4511 last_code = code;
4512 }
4513
4514 /* In the real compile phase, just check the workspace used by the forward
4515 reference list. */
4516
4517 else if (cd->hwm > cd->start_workspace + cd->workspace_size -
4518 WORK_SIZE_SAFETY_MARGIN)
4519 {
4520 *errorcodeptr = ERR52;
4521 goto FAILED;
4522 }
4523
4524 /* If in \Q...\E, check for the end; if not, we have a literal */
4525
4526 if (inescq && c != CHAR_NULL)
4527 {
4528 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4529 {
4530 inescq = FALSE;
4531 ptr++;
4532 continue;
4533 }
4534 else
4535 {
4536 if (previous_callout != NULL)
4537 {
4538 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
4539 complete_callout(previous_callout, ptr, cd);
4540 previous_callout = NULL;
4541 }
4542 if ((options & PCRE_AUTO_CALLOUT) != 0)
4543 {
4544 previous_callout = code;
4545 code = auto_callout(code, ptr, cd);
4546 }
4547 goto NORMAL_CHAR;
4548 }
4549 }
4550
4551 is_quantifier =
4552 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4553 (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4554
4555 /* Fill in length of a previous callout, except when the next thing is a
4556 quantifier or when processing a property substitution string in UCP mode. */
4557
4558 if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4559 after_manual_callout-- <= 0)
4560 {
4561 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
4562 complete_callout(previous_callout, ptr, cd);
4563 previous_callout = NULL;
4564 }
4565
4566 /* In extended mode, skip white space and comments. */
4567
4568 if ((options & PCRE_EXTENDED) != 0)
4569 {
4570 if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;
4571 if (c == CHAR_NUMBER_SIGN)
4572 {
4573 ptr++;
4574 while (*ptr != CHAR_NULL)
4575 {
4576 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
4577 ptr++;
4578 #ifdef SUPPORT_UTF
4579 if (utf) FORWARDCHAR(ptr);
4580 #endif
4581 }
4582 if (*ptr != CHAR_NULL) continue;
4583
4584 /* Else fall through to handle end of string */
4585 c = 0;
4586 }
4587 }
4588
4589 /* No auto callout for quantifiers, or while processing property strings that
4590 are substituted for \w etc in UCP mode. */
4591
4592 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
4593 {
4594 previous_callout = code;
4595 code = auto_callout(code, ptr, cd);
4596 }
4597
4598 switch(c)
4599 {
4600 /* ===================================================================*/
4601 case 0: /* The branch terminates at string end */
4602 case CHAR_VERTICAL_LINE: /* or | or ) */
4603 case CHAR_RIGHT_PARENTHESIS:
4604 *firstcharptr = firstchar;
4605 *firstcharflagsptr = firstcharflags;
4606 *reqcharptr = reqchar;
4607 *reqcharflagsptr = reqcharflags;
4608 *codeptr = code;
4609 *ptrptr = ptr;
4610 if (lengthptr != NULL)
4611 {
4612 if (OFLOW_MAX - *lengthptr < code - last_code)
4613 {
4614 *errorcodeptr = ERR20;
4615 goto FAILED;
4616 }
4617 *lengthptr += (int)(code - last_code); /* To include callout length */
4618 DPRINTF((">> end branch\n"));
4619 }
4620 return TRUE;
4621
4622
4623 /* ===================================================================*/
4624 /* Handle single-character metacharacters. In multiline mode, ^ disables
4625 the setting of any following char as a first character. */
4626
4627 case CHAR_CIRCUMFLEX_ACCENT:
4628 previous = NULL;
4629 if ((options & PCRE_MULTILINE) != 0)
4630 {
4631 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4632 *code++ = OP_CIRCM;
4633 }
4634 else *code++ = OP_CIRC;
4635 break;
4636
4637 case CHAR_DOLLAR_SIGN:
4638 previous = NULL;
4639 *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
4640 break;
4641
4642 /* There can never be a first char if '.' is first, whatever happens about
4643 repeats. The value of reqchar doesn't change either. */
4644
4645 case CHAR_DOT:
4646 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4647 zerofirstchar = firstchar;
4648 zerofirstcharflags = firstcharflags;
4649 zeroreqchar = reqchar;
4650 zeroreqcharflags = reqcharflags;
4651 previous = code;
4652 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
4653 break;
4654
4655
4656 /* ===================================================================*/
4657 /* Character classes. If the included characters are all < 256, we build a
4658 32-byte bitmap of the permitted characters, except in the special case
4659 where there is only one such character. For negated classes, we build the
4660 map as usual, then invert it at the end. However, we use a different opcode
4661 so that data characters > 255 can be handled correctly.
4662
4663 If the class contains characters outside the 0-255 range, a different
4664 opcode is compiled. It may optionally have a bit map for characters < 256,
4665 but those above are explicitly listed afterwards. A flag byte tells
4666 whether the bitmap is present, and whether this is a negated class or not.
4667
4668 In JavaScript compatibility mode, an isolated ']' causes an error. In
4669 default (Perl) mode, it is treated as a data character. */
4670
4671 case CHAR_RIGHT_SQUARE_BRACKET:
4672 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4673 {
4674 *errorcodeptr = ERR64;
4675 goto FAILED;
4676 }
4677 goto NORMAL_CHAR;
4678
4679 case CHAR_LEFT_SQUARE_BRACKET:
4680 previous = code;
4681
4682 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
4683 they are encountered at the top level, so we'll do that too. */
4684
4685 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4686 ptr[1] == CHAR_EQUALS_SIGN) &&
4687 check_posix_syntax(ptr, &tempptr))
4688 {
4689 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
4690 goto FAILED;
4691 }
4692
4693 /* If the first character is '^', set the negation flag and skip it. Also,
4694 if the first few characters (either before or after ^) are \Q\E or \E we
4695 skip them too. This makes for compatibility with Perl. */
4696
4697 negate_class = FALSE;
4698 for (;;)
4699 {
4700 c = *(++ptr);
4701 if (c == CHAR_BACKSLASH)
4702 {
4703 if (ptr[1] == CHAR_E)
4704 ptr++;
4705 else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4706 ptr += 3;
4707 else
4708 break;
4709 }
4710 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4711 negate_class = TRUE;
4712 else break;
4713 }
4714
4715 /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
4716 an initial ']' is taken as a data character -- the code below handles
4717 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
4718 [^] must match any character, so generate OP_ALLANY. */
4719
4720 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4721 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4722 {
4723 *code++ = negate_class? OP_ALLANY : OP_FAIL;
4724 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4725 zerofirstchar = firstchar;
4726 zerofirstcharflags = firstcharflags;
4727 break;
4728 }
4729
4730 /* If a class contains a negative special such as \S, we need to flip the
4731 negation flag at the end, so that support for characters > 255 works
4732 correctly (they are all included in the class). */
4733
4734 should_flip_negation = FALSE;
4735
4736 /* For optimization purposes, we track some properties of the class:
4737 class_has_8bitchar will be non-zero if the class contains at least one <
4738 256 character; class_one_char will be 1 if the class contains just one
4739 character. */
4740
4741 class_has_8bitchar = 0;
4742 class_one_char = 0;
4743
4744 /* Initialize the 32-char bit map to all zeros. We build the map in a
4745 temporary bit of memory, in case the class contains fewer than two
4746 8-bit characters because in that case the compiled code doesn't use the bit
4747 map. */
4748
4749 memset(classbits, 0, 32 * sizeof(pcre_uint8));
4750
4751 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4752 xclass = FALSE;
4753 class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
4754 class_uchardata_base = class_uchardata; /* Save the start */
4755 #endif
4756
4757 /* Process characters until ] is reached. By writing this as a "do" it
4758 means that an initial ] is taken as a data character. At the start of the
4759 loop, c contains the first byte of the character. */
4760
4761 if (c != CHAR_NULL) do
4762 {
4763 const pcre_uchar *oldptr;
4764
4765 #ifdef SUPPORT_UTF
4766 if (utf && HAS_EXTRALEN(c))
4767 { /* Braces are required because the */
4768 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
4769 }
4770 #endif
4771
4772 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4773 /* In the pre-compile phase, accumulate the length of any extra
4774 data and reset the pointer. This is so that very large classes that
4775 contain a zillion > 255 characters no longer overwrite the work space
4776 (which is on the stack). We have to remember that there was XCLASS data,
4777 however. */
4778
4779 if (lengthptr != NULL && class_uchardata > class_uchardata_base)
4780 {
4781 xclass = TRUE;
4782 *lengthptr += class_uchardata - class_uchardata_base;
4783 class_uchardata = class_uchardata_base;
4784 }
4785 #endif
4786
4787 /* Inside \Q...\E everything is literal except \E */
4788
4789 if (inescq)
4790 {
4791 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
4792 {
4793 inescq = FALSE; /* Reset literal state */
4794 ptr++; /* Skip the 'E' */
4795 continue; /* Carry on with next */
4796 }
4797 goto CHECK_RANGE; /* Could be range if \E follows */
4798 }
4799
4800 /* Handle POSIX class names. Perl allows a negation extension of the
4801 form [:^name:]. A square bracket that doesn't match the syntax is
4802 treated as a literal. We also recognize the POSIX constructions
4803 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
4804 5.6 and 5.8 do. */
4805
4806 if (c == CHAR_LEFT_SQUARE_BRACKET &&
4807 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4808 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
4809 {
4810 BOOL local_negate = FALSE;
4811 int posix_class, taboffset, tabopt;
4812 register const pcre_uint8 *cbits = cd->cbits;
4813 pcre_uint8 pbits[32];
4814
4815 if (ptr[1] != CHAR_COLON)
4816 {
4817 *errorcodeptr = ERR31;
4818 goto FAILED;
4819 }
4820
4821 ptr += 2;
4822 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
4823 {
4824 local_negate = TRUE;
4825 should_flip_negation = TRUE; /* Note negative special */
4826 ptr++;
4827 }
4828
4829 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
4830 if (posix_class < 0)
4831 {
4832 *errorcodeptr = ERR30;
4833 goto FAILED;
4834 }
4835
4836 /* If matching is caseless, upper and lower are converted to
4837 alpha. This relies on the fact that the class table starts with
4838 alpha, lower, upper as the first 3 entries. */
4839
4840 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
4841 posix_class = 0;
4842
4843 /* When PCRE_UCP is set, some of the POSIX classes are converted to
4844 different escape sequences that use Unicode properties \p or \P. Others
4845 that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
4846 directly. */
4847
4848 #ifdef SUPPORT_UCP
4849 if ((options & PCRE_UCP) != 0)
4850 {
4851 unsigned int ptype = 0;
4852 int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
4853
4854 /* The posix_substitutes table specifies which POSIX classes can be
4855 converted to \p or \P items. */
4856
4857 if (posix_substitutes[pc] != NULL)
4858 {
4859 nestptr = tempptr + 1;
4860 ptr = posix_substitutes[pc] - 1;
4861 continue;
4862 }
4863
4864 /* There are three other classes that generate special property calls
4865 that are recognized only in an XCLASS. */
4866
4867 else switch(posix_class)
4868 {
4869 case PC_GRAPH:
4870 ptype = PT_PXGRAPH;
4871 /* Fall through */
4872 case PC_PRINT:
4873 if (ptype == 0) ptype = PT_PXPRINT;
4874 /* Fall through */
4875 case PC_PUNCT:
4876 if (ptype == 0) ptype = PT_PXPUNCT;
4877 *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
4878 *class_uchardata++ = ptype;
4879 *class_uchardata++ = 0;
4880 ptr = tempptr + 1;
4881 continue;
4882
4883 /* For all other POSIX classes, no special action is taken in UCP
4884 mode. Fall through to the non_UCP case. */
4885
4886 default:
4887 break;
4888 }
4889 }
4890 #endif
4891 /* In the non-UCP case, or when UCP makes no difference, we build the
4892 bit map for the POSIX class in a chunk of local store because we may be
4893 adding and subtracting from it, and we don't want to subtract bits that
4894 may be in the main map already. At the end we or the result into the
4895 bit map that is being built. */
4896
4897 posix_class *= 3;
4898
4899 /* Copy in the first table (always present) */
4900
4901 memcpy(pbits, cbits + posix_class_maps[posix_class],
4902 32 * sizeof(pcre_uint8));
4903
4904 /* If there is a second table, add or remove it as required. */
4905
4906 taboffset = posix_class_maps[posix_class + 1];
4907 tabopt = posix_class_maps[posix_class + 2];
4908
4909 if (taboffset >= 0)
4910 {
4911 if (tabopt >= 0)
4912 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
4913 else
4914 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
4915 }
4916
4917 /* Now see if we need to remove any special characters. An option
4918 value of 1 removes vertical space and 2 removes underscore. */
4919
4920 if (tabopt < 0) tabopt = -tabopt;
4921 if (tabopt == 1) pbits[1] &= ~0x3c;
4922 else if (tabopt == 2) pbits[11] &= 0x7f;
4923
4924 /* Add the POSIX table or its complement into the main table that is
4925 being built and we are done. */
4926
4927 if (local_negate)
4928 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
4929 else
4930 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
4931
4932 ptr = tempptr + 1;
4933 /* Every class contains at least one < 256 character. */
4934 class_has_8bitchar = 1;
4935 /* Every class contains at least two characters. */
4936 class_one_char = 2;
4937 continue; /* End of POSIX syntax handling */
4938 }
4939
4940 /* Backslash may introduce a single character, or it may introduce one
4941 of the specials, which just set a flag. The sequence \b is a special
4942 case. Inside a class (and only there) it is treated as backspace. We
4943 assume that other escapes have more than one character in them, so
4944 speculatively set both class_has_8bitchar and class_one_char bigger
4945 than one. Unrecognized escapes fall through and are either treated
4946 as literal characters (by default), or are faulted if
4947 PCRE_EXTRA is set. */
4948
4949 if (c == CHAR_BACKSLASH)
4950 {
4951 escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
4952 TRUE);
4953 if (*errorcodeptr != 0) goto FAILED;
4954 if (escape == 0) c = ec;
4955 else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
4956 else if (escape == ESC_N) /* \N is not supported in a class */
4957 {
4958 *errorcodeptr = ERR71;
4959 goto FAILED;
4960 }
4961 else if (escape == ESC_Q) /* Handle start of quoted string */
4962 {
4963 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4964 {
4965 ptr += 2; /* avoid empty string */
4966 }
4967 else inescq = TRUE;
4968 continue;
4969 }
4970 else if (escape == ESC_E) continue; /* Ignore orphan \E */
4971
4972 else
4973 {
4974 register const pcre_uint8 *cbits = cd->cbits;
4975 /* Every class contains at least two < 256 characters. */
4976 class_has_8bitchar++;
4977 /* Every class contains at least two characters. */
4978 class_one_char += 2;
4979
4980 switch (escape)
4981 {
4982 #ifdef SUPPORT_UCP
4983 case ESC_du: /* These are the values given for \d etc */
4984 case ESC_DU: /* when PCRE_UCP is set. We replace the */
4985 case ESC_wu: /* escape sequence with an appropriate \p */
4986 case ESC_WU: /* or \P to test Unicode properties instead */
4987 case ESC_su: /* of the default ASCII testing. */
4988 case ESC_SU:
4989 nestptr = ptr;
4990 ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */
4991 class_has_8bitchar--; /* Undo! */
4992 continue;
4993 #endif
4994 case ESC_d:
4995 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
4996 continue;
4997
4998 case ESC_D:
4999 should_flip_negation = TRUE;
5000 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
5001 continue;
5002
5003 case ESC_w:
5004 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
5005 continue;
5006
5007 case ESC_W:
5008 should_flip_negation = TRUE;
5009 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
5010 continue;
5011
5012 /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5013 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5014 previously set by something earlier in the character class.
5015 Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5016 we could just adjust the appropriate bit. From PCRE 8.34 we no
5017 longer treat \s and \S specially. */
5018
5019 case ESC_s:
5020 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
5021 continue;
5022
5023 case ESC_S:
5024 should_flip_negation = TRUE;
5025 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
5026 continue;
5027
5028 /* The rest apply in both UCP and non-UCP cases. */
5029
5030 case ESC_h:
5031 (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5032 PRIV(hspace_list), NOTACHAR);
5033 continue;
5034
5035 case ESC_H:
5036 (void)add_not_list_to_class(classbits, &class_uchardata, options,
5037 cd, PRIV(hspace_list));
5038 continue;
5039
5040 case ESC_v:
5041 (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5042 PRIV(vspace_list), NOTACHAR);
5043 continue;
5044
5045 case ESC_V:
5046 (void)add_not_list_to_class(classbits, &class_uchardata, options,
5047 cd, PRIV(vspace_list));
5048 continue;
5049
5050 #ifdef SUPPORT_UCP
5051 case ESC_p:
5052 case ESC_P:
5053 {
5054 BOOL negated;
5055 unsigned int ptype = 0, pdata = 0;
5056 if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
5057 goto FAILED;
5058 *class_uchardata++ = ((escape == ESC_p) != negated)?
5059 XCL_PROP : XCL_NOTPROP;
5060 *class_uchardata++ = ptype;
5061 *class_uchardata++ = pdata;
5062 class_has_8bitchar--; /* Undo! */
5063 continue;
5064 }
5065 #endif
5066 /* Unrecognized escapes are faulted if PCRE is running in its
5067 strict mode. By default, for compatibility with Perl, they are
5068 treated as literals. */
5069
5070 default:
5071 if ((options & PCRE_EXTRA) != 0)
5072 {
5073 *errorcodeptr = ERR7;
5074 goto FAILED;
5075 }
5076 class_has_8bitchar--; /* Undo the speculative increase. */
5077 class_one_char -= 2; /* Undo the speculative increase. */
5078 c = *ptr; /* Get the final character and fall through */
5079 break;
5080 }
5081 }
5082
5083 /* Fall through if the escape just defined a single character (c >= 0).
5084 This may be greater than 256. */
5085
5086 escape = 0;
5087
5088 } /* End of backslash handling */
5089
5090 /* A character may be followed by '-' to form a range. However, Perl does
5091 not permit ']' to be the end of the range. A '-' character at the end is
5092 treated as a literal. Perl ignores orphaned \E sequences entirely. The
5093 code for handling \Q and \E is messy. */
5094
5095 CHECK_RANGE:
5096 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5097 {
5098 inescq = FALSE;
5099 ptr += 2;
5100 }
5101 oldptr = ptr;
5102
5103 /* Remember if \r or \n were explicitly used */
5104
5105 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5106
5107 /* Check for range */
5108
5109 if (!inescq && ptr[1] == CHAR_MINUS)
5110 {
5111 pcre_uint32 d;
5112 ptr += 2;
5113 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
5114
5115 /* If we hit \Q (not followed by \E) at this point, go into escaped
5116 mode. */
5117
5118 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
5119 {
5120 ptr += 2;
5121 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
5122 { ptr += 2; continue; }
5123 inescq = TRUE;
5124 break;
5125 }
5126
5127 /* Minus (hyphen) at the end of a class is treated as a literal, so put
5128 back the pointer and jump to handle the character that preceded it. */
5129
5130 if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
5131 {
5132 ptr = oldptr;
5133 goto CLASS_SINGLE_CHARACTER;
5134 }
5135
5136 /* Otherwise, we have a potential range; pick up the next character */
5137
5138 #ifdef SUPPORT_UTF
5139 if (utf)
5140 { /* Braces are required because the */
5141 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
5142 }
5143 else
5144 #endif
5145 d = *ptr; /* Not UTF-8 mode */
5146
5147 /* The second part of a range can be a single-character escape, but
5148 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
5149 in such circumstances. */
5150
5151 if (!inescq && d == CHAR_BACKSLASH)
5152 {
5153 int descape;
5154 descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
5155 if (*errorcodeptr != 0) goto FAILED;
5156
5157 /* \b is backspace; any other special means the '-' was literal. */
5158
5159 if (descape != 0)
5160 {
5161 if (descape == ESC_b) d = CHAR_BS; else
5162 {
5163 ptr = oldptr;
5164 goto CLASS_SINGLE_CHARACTER; /* A few lines below */
5165 }
5166 }
5167 }
5168
5169 /* Check that the two values are in the correct order. Optimize
5170 one-character ranges. */
5171
5172 if (d < c)
5173 {
5174 *errorcodeptr = ERR8;
5175 goto FAILED;
5176 }
5177 if (d == c) goto CLASS_SINGLE_CHARACTER; /* A few lines below */
5178
5179 /* We have found a character range, so single character optimizations
5180 cannot be done anymore. Any value greater than 1 indicates that there
5181 is more than one character. */
5182
5183 class_one_char = 2;
5184
5185 /* Remember an explicit \r or \n, and add the range to the class. */
5186
5187 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5188
5189 class_has_8bitchar +=
5190 add_to_class(classbits, &class_uchardata, options, cd, c, d);
5191
5192 continue; /* Go get the next char in the class */
5193 }
5194
5195 /* Handle a single character - we can get here for a normal non-escape
5196 char, or after \ that introduces a single character or for an apparent
5197 range that isn't. Only the value 1 matters for class_one_char, so don't
5198 increase it if it is already 2 or more ... just in case there's a class
5199 with a zillion characters in it. */
5200
5201 CLASS_SINGLE_CHARACTER:
5202 if (class_one_char < 2) class_one_char++;
5203
5204 /* If class_one_char is 1, we have the first single character in the
5205 class, and there have been no prior ranges, or XCLASS items generated by
5206 escapes. If this is the final character in the class, we can optimize by
5207 turning the item into a 1-character OP_CHAR[I] if it's positive, or
5208 OP_NOT[I] if it's negative. In the positive case, it can cause firstchar
5209 to be set. Otherwise, there can be no first char if this item is first,
5210 whatever repeat count may follow. In the case of reqchar, save the
5211 previous value for reinstating. */
5212
5213 if (class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
5214 {
5215 ptr++;
5216 zeroreqchar = reqchar;
5217 zeroreqcharflags = reqcharflags;
5218
5219 if (negate_class)
5220 {
5221 #ifdef SUPPORT_UCP
5222 int d;
5223 #endif
5224 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5225 zerofirstchar = firstchar;
5226 zerofirstcharflags = firstcharflags;
5227
5228 /* For caseless UTF-8 mode when UCP support is available, check
5229 whether this character has more than one other case. If so, generate
5230 a special OP_NOTPROP item instead of OP_NOTI. */
5231
5232 #ifdef SUPPORT_UCP
5233 if (utf && (options & PCRE_CASELESS) != 0 &&
5234 (d = UCD_CASESET(c)) != 0)
5235 {
5236 *code++ = OP_NOTPROP;
5237 *code++ = PT_CLIST;
5238 *code++ = d;
5239 }
5240 else
5241 #endif
5242 /* Char has only one other case, or UCP not available */
5243
5244 {
5245 *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
5246 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5247 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5248 code += PRIV(ord2utf)(c, code);
5249 else
5250 #endif
5251 *code++ = c;
5252 }
5253
5254 /* We are finished with this character class */
5255
5256 goto END_CLASS;
5257 }
5258
5259 /* For a single, positive character, get the value into mcbuffer, and
5260 then we can handle this with the normal one-character code. */
5261
5262 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5263 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5264 mclength = PRIV(ord2utf)(c, mcbuffer);
5265 else
5266 #endif
5267 {
5268 mcbuffer[0] = c;
5269 mclength = 1;
5270 }
5271 goto ONE_CHAR;
5272 } /* End of 1-char optimization */
5273
5274 /* There is more than one character in the class, or an XCLASS item
5275 has been generated. Add this character to the class. */
5276
5277 class_has_8bitchar +=
5278 add_to_class(classbits, &class_uchardata, options, cd, c, c);
5279 }
5280
5281 /* Loop until ']' reached. This "while" is the end of the "do" far above.
5282 If we are at the end of an internal nested string, revert to the outer
5283 string. */
5284
5285 while (((c = *(++ptr)) != CHAR_NULL ||
5286 (nestptr != NULL &&
5287 (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
5288 (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
5289
5290 /* Check for missing terminating ']' */
5291
5292 if (c == CHAR_NULL)
5293 {
5294 *errorcodeptr = ERR6;
5295 goto FAILED;
5296 }
5297
5298 /* We will need an XCLASS if data has been placed in class_uchardata. In
5299 the second phase this is a sufficient test. However, in the pre-compile
5300 phase, class_uchardata gets emptied to prevent workspace overflow, so
5301 only if the very last character in the class needs XCLASS will it contain
5302 anything at this point. For this reason, xclass gets set TRUE above when
5303 class_uchardata is emptied, and that's why this code is the way it is here
5304 instead of just doing a test on class_uchardata below. */
5305
5306 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5307 if (class_uchardata > class_uchardata_base) xclass = TRUE;
5308 #endif
5309
5310 /* If this is the first thing in the branch, there can be no first char
5311 setting, whatever the repeat count. Any reqchar setting must remain
5312 unchanged after any kind of repeat. */
5313
5314 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5315 zerofirstchar = firstchar;
5316 zerofirstcharflags = firstcharflags;
5317 zeroreqchar = reqchar;
5318 zeroreqcharflags = reqcharflags;
5319
5320 /* If there are characters with values > 255, we have to compile an
5321 extended class, with its own opcode, unless there was a negated special
5322 such as \S in the class, and PCRE_UCP is not set, because in that case all
5323 characters > 255 are in the class, so any that were explicitly given as
5324 well can be ignored. If (when there are explicit characters > 255 that must
5325 be listed) there are no characters < 256, we can omit the bitmap in the
5326 actual compiled code. */
5327
5328 #ifdef SUPPORT_UTF
5329 if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
5330 #elif !defined COMPILE_PCRE8
5331 if (xclass && !should_flip_negation)
5332 #endif
5333 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5334 {
5335 *class_uchardata++ = XCL_END; /* Marks the end of extra data */
5336 *code++ = OP_XCLASS;
5337 code += LINK_SIZE;
5338 *code = negate_class? XCL_NOT:0;
5339
5340 /* If the map is required, move up the extra data to make room for it;
5341 otherwise just move the code pointer to the end of the extra data. */
5342
5343 if (class_has_8bitchar > 0)
5344 {
5345 *code++ |= XCL_MAP;
5346 memmove(code + (32 / sizeof(pcre_uchar)), code,
5347 IN_UCHARS(class_uchardata - code));
5348 memcpy(code, classbits, 32);
5349 code = class_uchardata + (32 / sizeof(pcre_uchar));
5350 }
5351 else code = class_uchardata;
5352
5353 /* Now fill in the complete length of the item */
5354
5355 PUT(previous, 1, (int)(code - previous));
5356 break; /* End of class handling */
5357 }
5358 #endif
5359
5360 /* If there are no characters > 255, or they are all to be included or
5361 excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5362 whole class was negated and whether there were negative specials such as \S
5363 (non-UCP) in the class. Then copy the 32-byte map into the code vector,
5364 negating it if necessary. */
5365
5366 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5367 if (lengthptr == NULL) /* Save time in the pre-compile phase */
5368 {
5369 if (negate_class)
5370 for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5371 memcpy(code, classbits, 32);
5372 }
5373 code += 32 / sizeof(pcre_uchar);
5374
5375 END_CLASS:
5376 break;
5377
5378
5379 /* ===================================================================*/
5380 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
5381 has been tested above. */
5382
5383 case CHAR_LEFT_CURLY_BRACKET:
5384 if (!is_quantifier) goto NORMAL_CHAR;
5385 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
5386 if (*errorcodeptr != 0) goto FAILED;
5387 goto REPEAT;
5388
5389 case CHAR_ASTERISK:
5390 repeat_min = 0;
5391 repeat_max = -1;
5392 goto REPEAT;
5393
5394 case CHAR_PLUS:
5395 repeat_min = 1;
5396 repeat_max = -1;
5397 goto REPEAT;
5398
5399 case CHAR_QUESTION_MARK:
5400 repeat_min = 0;
5401 repeat_max = 1;
5402
5403 REPEAT:
5404 if (previous == NULL)
5405 {
5406 *errorcodeptr = ERR9;
5407 goto FAILED;
5408 }
5409
5410 if (repeat_min == 0)
5411 {
5412 firstchar = zerofirstchar; /* Adjust for zero repeat */
5413 firstcharflags = zerofirstcharflags;
5414 reqchar = zeroreqchar; /* Ditto */
5415 reqcharflags = zeroreqcharflags;
5416 }
5417
5418 /* Remember whether this is a variable length repeat */
5419
5420 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
5421
5422 op_type = 0; /* Default single-char op codes */
5423 possessive_quantifier = FALSE; /* Default not possessive quantifier */
5424
5425 /* Save start of previous item, in case we have to move it up in order to
5426 insert something before it. */
5427
5428 tempcode = previous;
5429
5430 /* If the next character is '+', we have a possessive quantifier. This
5431 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
5432 If the next character is '?' this is a minimizing repeat, by default,
5433 but if PCRE_UNGREEDY is set, it works the other way round. We change the
5434 repeat type to the non-default. */
5435
5436 if (ptr[1] == CHAR_PLUS)
5437 {
5438 repeat_type = 0; /* Force greedy */
5439 possessive_quantifier = TRUE;
5440 ptr++;
5441 }
5442 else if (ptr[1] == CHAR_QUESTION_MARK)
5443 {
5444 repeat_type = greedy_non_default;
5445 ptr++;
5446 }
5447 else repeat_type = greedy_default;
5448
5449 /* If previous was a recursion call, wrap it in atomic brackets so that
5450 previous becomes the atomic group. All recursions were so wrapped in the
5451 past, but it no longer happens for non-repeated recursions. In fact, the
5452 repeated ones could be re-implemented independently so as not to need this,
5453 but for the moment we rely on the code for repeating groups. */
5454
5455 if (*previous == OP_RECURSE)
5456 {
5457 memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
5458 *previous = OP_ONCE;
5459 PUT(previous, 1, 2 + 2*LINK_SIZE);
5460 previous[2 + 2*LINK_SIZE] = OP_KET;
5461 PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
5462 code += 2 + 2 * LINK_SIZE;
5463 length_prevgroup = 3 + 3*LINK_SIZE;
5464
5465 /* When actually compiling, we need to check whether this was a forward
5466 reference, and if so, adjust the offset. */
5467
5468 if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
5469 {
5470 int offset = GET(cd->hwm, -LINK_SIZE);
5471 if (offset == previous + 1 - cd->start_code)
5472 PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
5473 }
5474 }
5475
5476 /* Now handle repetition for the different types of item. */
5477
5478 /* If previous was a character or negated character match, abolish the item
5479 and generate a repeat item instead. If a char item has a minimum of more
5480 than one, ensure that it is set in reqchar - it might not be if a sequence
5481 such as x{3} is the first thing in a branch because the x will have gone
5482 into firstchar instead. */
5483
5484 if (*previous == OP_CHAR || *previous == OP_CHARI
5485 || *previous == OP_NOT || *previous == OP_NOTI)
5486 {
5487 switch (*previous)
5488 {
5489 default: /* Make compiler happy. */
5490 case OP_CHAR: op_type = OP_STAR - OP_STAR; break;
5491 case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
5492 case OP_NOT: op_type = OP_NOTSTAR - OP_STAR; break;
5493 case OP_NOTI: op_type = OP_NOTSTARI - OP_STAR; break;
5494 }
5495
5496 /* Deal with UTF characters that take up more than one character. It's
5497 easier to write this out separately than try to macrify it. Use c to
5498 hold the length of the character in bytes, plus UTF_LENGTH to flag that
5499 it's a length rather than a small character. */
5500
5501 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5502 if (utf && NOT_FIRSTCHAR(code[-1]))
5503 {
5504 pcre_uchar *lastchar = code - 1;
5505 BACKCHAR(lastchar);
5506 c = (int)(code - lastchar); /* Length of UTF-8 character */
5507 memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
5508 c |= UTF_LENGTH; /* Flag c as a length */
5509 }
5510 else
5511 #endif /* SUPPORT_UTF */
5512
5513 /* Handle the case of a single character - either with no UTF support, or
5514 with UTF disabled, or for a single character UTF character. */
5515 {
5516 c = code[-1];
5517 if (*previous <= OP_CHARI && repeat_min > 1)
5518 {
5519 reqchar = c;
5520 reqcharflags = req_caseopt | cd->req_varyopt;
5521 }
5522 }
5523
5524 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
5525 }
5526
5527 /* If previous was a character type match (\d or similar), abolish it and
5528 create a suitable repeat item. The code is shared with single-character
5529 repeats by setting op_type to add a suitable offset into repeat_type. Note
5530 that the Unicode property types will be present only when SUPPORT_UCP is
5531 defined, but we don't wrap the little bits of code here because it just
5532 makes it horribly messy. */
5533
5534 else if (*previous < OP_EODN)
5535 {
5536 pcre_uchar *oldcode;
5537 int prop_type, prop_value;
5538 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
5539 c = *previous;
5540
5541 OUTPUT_SINGLE_REPEAT:
5542 if (*previous == OP_PROP || *previous == OP_NOTPROP)
5543 {
5544 prop_type = previous[1];
5545 prop_value = previous[2];
5546 }
5547 else prop_type = prop_value = -1;
5548
5549 oldcode = code;
5550 code = previous; /* Usually overwrite previous item */
5551
5552 /* If the maximum is zero then the minimum must also be zero; Perl allows
5553 this case, so we do too - by simply omitting the item altogether. */
5554
5555 if (repeat_max == 0) goto END_REPEAT;
5556
5557 /* Combine the op_type with the repeat_type */
5558
5559 repeat_type += op_type;
5560
5561 /* A minimum of zero is handled either as the special case * or ?, or as
5562 an UPTO, with the maximum given. */
5563
5564 if (repeat_min == 0)
5565 {
5566 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
5567 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
5568 else
5569 {
5570 *code++ = OP_UPTO + repeat_type;
5571 PUT2INC(code, 0, repeat_max);
5572 }
5573 }
5574
5575 /* A repeat minimum of 1 is optimized into some special cases. If the
5576 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
5577 left in place and, if the maximum is greater than 1, we use OP_UPTO with
5578 one less than the maximum. */
5579
5580 else if (repeat_min == 1)
5581 {
5582 if (repeat_max == -1)
5583 *code++ = OP_PLUS + repeat_type;
5584 else
5585 {
5586 code = oldcode; /* leave previous item in place */
5587 if (repeat_max == 1) goto END_REPEAT;
5588 *code++ = OP_UPTO + repeat_type;
5589 PUT2INC(code, 0, repeat_max - 1);
5590 }
5591 }
5592
5593 /* The case {n,n} is just an EXACT, while the general case {n,m} is
5594 handled as an EXACT followed by an UPTO. */
5595
5596 else
5597 {
5598 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
5599 PUT2INC(code, 0, repeat_min);
5600
5601 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
5602 we have to insert the character for the previous code. For a repeated
5603 Unicode property match, there are two extra bytes that define the
5604 required property. In UTF-8 mode, long characters have their length in
5605 c, with the UTF_LENGTH bit as a flag. */
5606
5607 if (repeat_max < 0)
5608 {
5609 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5610 if (utf && (c & UTF_LENGTH) != 0)
5611 {
5612 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5613 code += c & 7;
5614 }
5615 else
5616 #endif
5617 {
5618 *code++ = c;
5619 if (prop_type >= 0)
5620 {
5621 *code++ = prop_type;
5622 *code++ = prop_value;
5623 }
5624 }
5625 *code++ = OP_STAR + repeat_type;
5626 }
5627
5628 /* Else insert an UPTO if the max is greater than the min, again
5629 preceded by the character, for the previously inserted code. If the
5630 UPTO is just for 1 instance, we can use QUERY instead. */
5631
5632 else if (repeat_max != repeat_min)
5633 {
5634 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5635 if (utf && (c & UTF_LENGTH) != 0)
5636 {
5637 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5638 code += c & 7;
5639 }
5640 else
5641 #endif
5642 *code++ = c;
5643 if (prop_type >= 0)
5644 {
5645 *code++ = prop_type;
5646 *code++ = prop_value;
5647 }
5648 repeat_max -= repeat_min;
5649
5650 if (repeat_max == 1)
5651 {
5652 *code++ = OP_QUERY + repeat_type;
5653 }
5654 else
5655 {
5656 *code++ = OP_UPTO + repeat_type;
5657 PUT2INC(code, 0, repeat_max);
5658 }
5659 }
5660 }
5661
5662 /* The character or character type itself comes last in all cases. */
5663
5664 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5665 if (utf && (c & UTF_LENGTH) != 0)
5666 {
5667 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5668 code += c & 7;
5669 }
5670 else
5671 #endif
5672 *code++ = c;
5673
5674 /* For a repeated Unicode property match, there are two extra bytes that
5675 define the required property. */
5676
5677 #ifdef SUPPORT_UCP
5678 if (prop_type >= 0)
5679 {
5680 *code++ = prop_type;
5681 *code++ = prop_value;
5682 }
5683 #endif
5684 }
5685
5686 /* If previous was a character class or a back reference, we put the repeat
5687 stuff after it, but just skip the item if the repeat was {0,0}. */
5688
5689 else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
5690 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5691 *previous == OP_XCLASS ||
5692 #endif
5693 *previous == OP_REF || *previous == OP_REFI ||
5694 *previous == OP_DNREF || *previous == OP_DNREFI)
5695 {
5696 if (repeat_max == 0)
5697 {
5698 code = previous;
5699 goto END_REPEAT;
5700 }
5701
5702 if (repeat_min == 0 && repeat_max == -1)
5703 *code++ = OP_CRSTAR + repeat_type;
5704 else if (repeat_min == 1 && repeat_max == -1)
5705 *code++ = OP_CRPLUS + repeat_type;
5706 else if (repeat_min == 0 && repeat_max == 1)
5707 *code++ = OP_CRQUERY + repeat_type;
5708 else
5709 {
5710 *code++ = OP_CRRANGE + repeat_type;
5711 PUT2INC(code, 0, repeat_min);
5712 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
5713 PUT2INC(code, 0, repeat_max);
5714 }
5715 }
5716
5717 /* If previous was a bracket group, we may have to replicate it in certain
5718 cases. Note that at this point we can encounter only the "basic" bracket
5719 opcodes such as BRA and CBRA, as this is the place where they get converted
5720 into the more special varieties such as BRAPOS and SBRA. A test for >=
5721 OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5722 ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow
5723 repetition of assertions, but now it does, for Perl compatibility. */
5724
5725 else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5726 {
5727 register int i;
5728 int len = (int)(code - previous);
5729 pcre_uchar *bralink = NULL;
5730 pcre_uchar *brazeroptr = NULL;
5731
5732 /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
5733 we just ignore the repeat. */
5734
5735 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
5736 goto END_REPEAT;
5737
5738 /* There is no sense in actually repeating assertions. The only potential
5739 use of repetition is in cases when the assertion is optional. Therefore,
5740 if the minimum is greater than zero, just ignore the repeat. If the
5741 maximum is not zero or one, set it to 1. */
5742
5743 if (*previous < OP_ONCE) /* Assertion */
5744 {
5745 if (repeat_min > 0) goto END_REPEAT;
5746 if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
5747 }
5748
5749 /* The case of a zero minimum is special because of the need to stick
5750 OP_BRAZERO in front of it, and because the group appears once in the
5751 data, whereas in other cases it appears the minimum number of times. For
5752 this reason, it is simplest to treat this case separately, as otherwise
5753 the code gets far too messy. There are several special subcases when the
5754 minimum is zero. */
5755
5756 if (repeat_min == 0)
5757 {
5758 /* If the maximum is also zero, we used to just omit the group from the
5759 output altogether, like this:
5760
5761 ** if (repeat_max == 0)
5762 ** {
5763 ** code = previous;
5764 ** goto END_REPEAT;
5765 ** }
5766
5767 However, that fails when a group or a subgroup within it is referenced
5768 as a subroutine from elsewhere in the pattern, so now we stick in
5769 OP_SKIPZERO in front of it so that it is skipped on execution. As we
5770 don't have a list of which groups are referenced, we cannot do this
5771 selectively.
5772
5773 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
5774 and do no more at this point. However, we do need to adjust any
5775 OP_RECURSE calls inside the group that refer to the group itself or any
5776 internal or forward referenced group, because the offset is from the
5777 start of the whole regex. Temporarily terminate the pattern while doing
5778 this. */
5779
5780 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
5781 {
5782 *code = OP_END;
5783 adjust_recurse(previous, 1, utf, cd, save_hwm);
5784 memmove(previous + 1, previous, IN_UCHARS(len));
5785 code++;
5786 if (repeat_max == 0)
5787 {
5788 *previous++ = OP_SKIPZERO;
5789 goto END_REPEAT;
5790 }
5791 brazeroptr = previous; /* Save for possessive optimizing */
5792 *previous++ = OP_BRAZERO + repeat_type;
5793 }
5794
5795 /* If the maximum is greater than 1 and limited, we have to replicate
5796 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
5797 The first one has to be handled carefully because it's the original
5798 copy, which has to be moved up. The remainder can be handled by code
5799 that is common with the non-zero minimum case below. We have to
5800 adjust the value of repeat_max, since one less copy is required. Once
5801 again, we may have to adjust any OP_RECURSE calls inside the group. */
5802
5803 else
5804 {
5805 int offset;
5806 *code = OP_END;
5807 adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);
5808 memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
5809 code += 2 + LINK_SIZE;
5810 *previous++ = OP_BRAZERO + repeat_type;
5811 *previous++ = OP_BRA;
5812
5813 /* We chain together the bracket offset fields that have to be
5814 filled in later when the ends of the brackets are reached. */
5815
5816 offset = (bralink == NULL)? 0 : (int)(previous - bralink);
5817 bralink = previous;
5818 PUTINC(previous, 0, offset);
5819 }
5820
5821 repeat_max--;
5822 }
5823
5824 /* If the minimum is greater than zero, replicate the group as many
5825 times as necessary, and adjust the maximum to the number of subsequent
5826 copies that we need. If we set a first char from the group, and didn't
5827 set a required char, copy the latter from the former. If there are any
5828 forward reference subroutine calls in the group, there will be entries on
5829 the workspace list; replicate these with an appropriate increment. */
5830
5831 else
5832 {
5833 if (repeat_min > 1)
5834 {
5835 /* In the pre-compile phase, we don't actually do the replication. We
5836 just adjust the length as if we had. Do some paranoid checks for
5837 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
5838 integer type when available, otherwise double. */
5839
5840 if (lengthptr != NULL)
5841 {
5842 int delta = (repeat_min - 1)*length_prevgroup;
5843 if ((INT64_OR_DOUBLE)(repeat_min - 1)*
5844 (INT64_OR_DOUBLE)length_prevgroup >
5845 (INT64_OR_DOUBLE)INT_MAX ||
5846 OFLOW_MAX - *lengthptr < delta)
5847 {
5848 *errorcodeptr = ERR20;
5849 goto FAILED;
5850 }
5851 *lengthptr += delta;
5852 }
5853
5854 /* This is compiling for real. If there is a set first byte for
5855 the group, and we have not yet set a "required byte", set it. Make
5856 sure there is enough workspace for copying forward references before
5857 doing the copy. */
5858
5859 else
5860 {
5861 if (groupsetfirstchar && reqcharflags < 0)
5862 {
5863 reqchar = firstchar;
5864 reqcharflags = firstcharflags;
5865 }
5866
5867 for (i = 1; i < repeat_min; i++)
5868 {
5869 pcre_uchar *hc;
5870 pcre_uchar *this_hwm = cd->hwm;
5871 memcpy(code, previous, IN_UCHARS(len));
5872
5873 while (cd->hwm > cd->start_workspace + cd->workspace_size -
5874 WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5875 {
5876 int save_offset = save_hwm - cd->start_workspace;
5877 int this_offset = this_hwm - cd->start_workspace;
5878 *errorcodeptr = expand_workspace(cd);
5879 if (*errorcodeptr != 0) goto FAILED;
5880 save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5881 this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5882 }
5883
5884 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5885 {
5886 PUT(cd->hwm, 0, GET(hc, 0) + len);
5887 cd->hwm += LINK_SIZE;
5888 }
5889 save_hwm = this_hwm;
5890 code += len;
5891 }
5892 }
5893 }
5894
5895 if (repeat_max > 0) repeat_max -= repeat_min;
5896 }
5897
5898 /* This code is common to both the zero and non-zero minimum cases. If
5899 the maximum is limited, it replicates the group in a nested fashion,
5900 remembering the bracket starts on a stack. In the case of a zero minimum,
5901 the first one was set up above. In all cases the repeat_max now specifies
5902 the number of additional copies needed. Again, we must remember to
5903 replicate entries on the forward reference list. */
5904
5905 if (repeat_max >= 0)
5906 {
5907 /* In the pre-compile phase, we don't actually do the replication. We
5908 just adjust the length as if we had. For each repetition we must add 1
5909 to the length for BRAZERO and for all but the last repetition we must
5910 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
5911 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
5912 a 64-bit integer type when available, otherwise double. */
5913
5914 if (lengthptr != NULL && repeat_max > 0)
5915 {
5916 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
5917 2 - 2*LINK_SIZE; /* Last one doesn't nest */
5918 if ((INT64_OR_DOUBLE)repeat_max *
5919 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
5920 > (INT64_OR_DOUBLE)INT_MAX ||
5921 OFLOW_MAX - *lengthptr < delta)
5922 {
5923 *errorcodeptr = ERR20;
5924 goto FAILED;
5925 }
5926 *lengthptr += delta;
5927 }
5928
5929 /* This is compiling for real */
5930
5931 else for (i = repeat_max - 1; i >= 0; i--)
5932 {
5933 pcre_uchar *hc;
5934 pcre_uchar *this_hwm = cd->hwm;
5935
5936 *code++ = OP_BRAZERO + repeat_type;
5937
5938 /* All but the final copy start a new nesting, maintaining the
5939 chain of brackets outstanding. */
5940
5941 if (i != 0)
5942 {
5943 int offset;
5944 *code++ = OP_BRA;
5945 offset = (bralink == NULL)? 0 : (int)(code - bralink);
5946 bralink = code;
5947 PUTINC(code, 0, offset);
5948 }
5949
5950 memcpy(code, previous, IN_UCHARS(len));
5951
5952 /* Ensure there is enough workspace for forward references before
5953 copying them. */
5954
5955 while (cd->hwm > cd->start_workspace + cd->workspace_size -
5956 WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5957 {
5958 int save_offset = save_hwm - cd->start_workspace;
5959 int this_offset = this_hwm - cd->start_workspace;
5960 *errorcodeptr = expand_workspace(cd);
5961 if (*errorcodeptr != 0) goto FAILED;
5962 save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5963 this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5964 }
5965
5966 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5967 {
5968 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
5969 cd->hwm += LINK_SIZE;
5970 }
5971 save_hwm = this_hwm;
5972 code += len;
5973 }
5974
5975 /* Now chain through the pending brackets, and fill in their length
5976 fields (which are holding the chain links pro tem). */
5977
5978 while (bralink != NULL)
5979 {
5980 int oldlinkoffset;
5981 int offset = (int)(code - bralink + 1);
5982 pcre_uchar *bra = code - offset;
5983 oldlinkoffset = GET(bra, 1);
5984 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
5985 *code++ = OP_KET;
5986 PUTINC(code, 0, offset);
5987 PUT(bra, 1, offset);
5988 }
5989 }
5990
5991 /* If the maximum is unlimited, set a repeater in the final copy. For
5992 ONCE brackets, that's all we need to do. However, possessively repeated
5993 ONCE brackets can be converted into non-capturing brackets, as the
5994 behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
5995 deal with possessive ONCEs specially.
5996
5997 Otherwise, when we are doing the actual compile phase, check to see
5998 whether this group is one that could match an empty string. If so,
5999 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
6000 that runtime checking can be done. [This check is also applied to ONCE
6001 groups at runtime, but in a different way.]
6002
6003 Then, if the quantifier was possessive and the bracket is not a
6004 conditional, we convert the BRA code to the POS form, and the KET code to
6005 KETRPOS. (It turns out to be convenient at runtime to detect this kind of
6006 subpattern at both the start and at the end.) The use of special opcodes
6007 makes it possible to reduce greatly the stack usage in pcre_exec(). If
6008 the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
6009
6010 Then, if the minimum number of matches is 1 or 0, cancel the possessive
6011 flag so that the default action below, of wrapping everything inside
6012 atomic brackets, does not happen. When the minimum is greater than 1,
6013 there will be earlier copies of the group, and so we still have to wrap
6014 the whole thing. */
6015
6016 else
6017 {
6018 pcre_uchar *ketcode = code - 1 - LINK_SIZE;
6019 pcre_uchar *bracode = ketcode - GET(ketcode, 1);
6020
6021 /* Convert possessive ONCE brackets to non-capturing */
6022
6023 if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
6024 possessive_quantifier) *bracode = OP_BRA;
6025
6026 /* For non-possessive ONCE brackets, all we need to do is to
6027 set the KET. */
6028
6029 if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
6030 *ketcode = OP_KETRMAX + repeat_type;
6031
6032 /* Handle non-ONCE brackets and possessive ONCEs (which have been
6033 converted to non-capturing above). */
6034
6035 else
6036 {
6037 /* In the compile phase, check for empty string matching. */
6038
6039 if (lengthptr == NULL)
6040 {
6041 pcre_uchar *scode = bracode;
6042 do
6043 {
6044 if (could_be_empty_branch(scode, ketcode, utf, cd, NULL))
6045 {
6046 *bracode += OP_SBRA - OP_BRA;
6047 break;
6048 }
6049 scode += GET(scode, 1);
6050 }
6051 while (*scode == OP_ALT);
6052 }
6053
6054 /* Handle possessive quantifiers. */
6055
6056 if (possessive_quantifier)
6057 {
6058 /* For COND brackets, we wrap the whole thing in a possessively
6059 repeated non-capturing bracket, because we have not invented POS
6060 versions of the COND opcodes. Because we are moving code along, we
6061 must ensure that any pending recursive references are updated. */
6062
6063 if (*bracode == OP_COND || *bracode == OP_SCOND)
6064 {
6065 int nlen = (int)(code - bracode);
6066 *code = OP_END;
6067 adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm);
6068 memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
6069 code += 1 + LINK_SIZE;
6070 nlen += 1 + LINK_SIZE;
6071 *bracode = OP_BRAPOS;
6072 *code++ = OP_KETRPOS;
6073 PUTINC(code, 0, nlen);
6074 PUT(bracode, 1, nlen);
6075 }
6076
6077 /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
6078
6079 else
6080 {
6081 *bracode += 1; /* Switch to xxxPOS opcodes */
6082 *ketcode = OP_KETRPOS;
6083 }
6084
6085 /* If the minimum is zero, mark it as possessive, then unset the
6086 possessive flag when the minimum is 0 or 1. */
6087
6088 if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
6089 if (repeat_min < 2) possessive_quantifier = FALSE;
6090 }
6091
6092 /* Non-possessive quantifier */
6093
6094 else *ketcode = OP_KETRMAX + repeat_type;
6095 }
6096 }
6097 }
6098
6099 /* If previous is OP_FAIL, it was generated by an empty class [] in
6100 JavaScript mode. The other ways in which OP_FAIL can be generated, that is
6101 by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
6102 error above. We can just ignore the repeat in the JS case. */
6103
6104 else if (*previous == OP_FAIL) goto END_REPEAT;
6105
6106 /* Else there's some kind of shambles */
6107
6108 else
6109 {
6110 *errorcodeptr = ERR11;
6111 goto FAILED;
6112 }
6113
6114 /* If the character following a repeat is '+', possessive_quantifier is
6115 TRUE. For some opcodes, there are special alternative opcodes for this
6116 case. For anything else, we wrap the entire repeated item inside OP_ONCE
6117 brackets. Logically, the '+' notation is just syntactic sugar, taken from
6118 Sun's Java package, but the special opcodes can optimize it.
6119
6120 Some (but not all) possessively repeated subpatterns have already been
6121 completely handled in the code just above. For them, possessive_quantifier
6122 is always FALSE at this stage. Note that the repeated item starts at
6123 tempcode, not at previous, which might be the first part of a string whose
6124 (former) last char we repeated. */
6125
6126 if (possessive_quantifier)
6127 {
6128 int len;
6129
6130 /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
6131 However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
6132 {5,}, or {5,10}). We skip over an EXACT item; if the length of what
6133 remains is greater than zero, there's a further opcode that can be
6134 handled. If not, do nothing, leaving the EXACT alone. */
6135
6136 switch(*tempcode)
6137 {
6138 case OP_TYPEEXACT:
6139 tempcode += PRIV(OP_lengths)[*tempcode] +
6140 ((tempcode[1 + IMM2_SIZE] == OP_PROP
6141 || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
6142 break;
6143
6144 /* CHAR opcodes are used for exacts whose count is 1. */
6145
6146 case OP_CHAR:
6147 case OP_CHARI:
6148 case OP_NOT:
6149 case OP_NOTI:
6150 case OP_EXACT:
6151 case OP_EXACTI:
6152 case OP_NOTEXACT:
6153 case OP_NOTEXACTI:
6154 tempcode += PRIV(OP_lengths)[*tempcode];
6155 #ifdef SUPPORT_UTF
6156 if (utf && HAS_EXTRALEN(tempcode[-1]))
6157 tempcode += GET_EXTRALEN(tempcode[-1]);
6158 #endif
6159 break;
6160
6161 /* For the class opcodes, the repeat operator appears at the end;
6162 adjust tempcode to point to it. */
6163
6164 case OP_CLASS:
6165 case OP_NCLASS:
6166 tempcode += 1 + 32/sizeof(pcre_uchar);
6167 break;
6168
6169 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6170 case OP_XCLASS:
6171 tempcode += GET(tempcode, 1);
6172 break;
6173 #endif
6174 }
6175
6176 /* If tempcode is equal to code (which points to the end of the repeated
6177 item), it means we have skipped an EXACT item but there is no following
6178 QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
6179 all other cases, tempcode will be pointing to the repeat opcode, and will
6180 be less than code, so the value of len will be greater than 0. */
6181
6182 len = (int)(code - tempcode);
6183 if (len > 0)
6184 {
6185 unsigned int repcode = *tempcode;
6186
6187 /* There is a table for possessifying opcodes, all of which are less
6188 than OP_CALLOUT. A zero entry means there is no possessified version.
6189 */
6190
6191 if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
6192 *tempcode = opcode_possessify[repcode];
6193
6194 /* For opcodes without a special possessified version, wrap the item in
6195 ONCE brackets. Because we are moving code along, we must ensure that any
6196 pending recursive references are updated. */
6197
6198 else
6199 {
6200 *code = OP_END;
6201 adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
6202 memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6203 code += 1 + LINK_SIZE;
6204 len += 1 + LINK_SIZE;
6205 tempcode[0] = OP_ONCE;
6206 *code++ = OP_KET;
6207 PUTINC(code, 0, len);
6208 PUT(tempcode, 1, len);
6209 }
6210 }
6211
6212 #ifdef NEVER
6213 if (len > 0) switch (*tempcode)
6214 {
6215 case OP_STAR: *tempcode = OP_POSSTAR; break;
6216 case OP_PLUS: *tempcode = OP_POSPLUS; break;
6217 case OP_QUERY: *tempcode = OP_POSQUERY; break;
6218 case OP_UPTO: *tempcode = OP_POSUPTO; break;
6219
6220 case OP_STARI: *tempcode = OP_POSSTARI; break;
6221 case OP_PLUSI: *tempcode = OP_POSPLUSI; break;
6222 case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
6223 case OP_UPTOI: *tempcode = OP_POSUPTOI; break;
6224
6225 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
6226 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
6227 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
6228 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
6229
6230 case OP_NOTSTARI: *tempcode = OP_NOTPOSSTARI; break;
6231 case OP_NOTPLUSI: *tempcode = OP_NOTPOSPLUSI; break;
6232 case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
6233 case OP_NOTUPTOI: *tempcode = OP_NOTPOSUPTOI; break;
6234
6235 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
6236 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
6237 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
6238 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
6239
6240 case OP_CRSTAR: *tempcode = OP_CRPOSSTAR; break;
6241 case OP_CRPLUS: *tempcode = OP_CRPOSPLUS; break;
6242 case OP_CRQUERY: *tempcode = OP_CRPOSQUERY; break;
6243 case OP_CRRANGE: *tempcode = OP_CRPOSRANGE; break;
6244
6245 /* Because we are moving code along, we must ensure that any
6246 pending recursive references are updated. */
6247
6248 default:
6249 *code = OP_END;
6250 adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
6251 memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6252 code += 1 + LINK_SIZE;
6253 len += 1 + LINK_SIZE;
6254 tempcode[0] = OP_ONCE;
6255 *code++ = OP_KET;
6256 PUTINC(code, 0, len);
6257 PUT(tempcode, 1, len);
6258 break;
6259 }
6260 #endif
6261 }
6262
6263 /* In all cases we no longer have a previous item. We also set the
6264 "follows varying string" flag for subsequently encountered reqchars if
6265 it isn't already set and we have just passed a varying length item. */
6266
6267 END_REPEAT:
6268 previous = NULL;
6269 cd->req_varyopt |= reqvary;
6270 break;
6271
6272
6273 /* ===================================================================*/
6274 /* Start of nested parenthesized sub-expression, or comment or lookahead or
6275 lookbehind or option setting or condition or all the other extended
6276 parenthesis forms. */
6277
6278 case CHAR_LEFT_PARENTHESIS:
6279 newoptions = options;
6280 skipbytes = 0;
6281 bravalue = OP_CBRA;
6282 save_hwm = cd->hwm;
6283 reset_bracount = FALSE;
6284
6285 /* First deal with various "verbs" that can be introduced by '*'. */
6286
6287 ptr++;
6288 if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
6289 || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
6290 {
6291 int i, namelen;
6292 int arglen = 0;
6293 const char *vn = verbnames;
6294 const pcre_uchar *name = ptr + 1;
6295 const pcre_uchar *arg = NULL;
6296 previous = NULL;
6297 ptr++;
6298 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
6299 namelen = (int)(ptr - name);
6300
6301 /* It appears that Perl allows any characters whatsoever, other than
6302 a closing parenthesis, to appear in arguments, so we no longer insist on
6303 letters, digits, and underscores. */
6304
6305 if (*ptr == CHAR_COLON)
6306 {
6307 arg = ++ptr;
6308 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6309 arglen = (int)(ptr - arg);
6310 if ((unsigned int)arglen > MAX_MARK)
6311 {
6312 *errorcodeptr = ERR75;
6313 goto FAILED;
6314 }
6315 }
6316
6317 if (*ptr != CHAR_RIGHT_PARENTHESIS)
6318 {
6319 *errorcodeptr = ERR60;
6320 goto FAILED;
6321 }
6322
6323 /* Scan the table of verb names */
6324
6325 for (i = 0; i < verbcount; i++)
6326 {
6327 if (namelen == verbs[i].len &&
6328 STRNCMP_UC_C8(name, vn, namelen) == 0)
6329 {
6330 int setverb;
6331
6332 /* Check for open captures before ACCEPT and convert it to
6333 ASSERT_ACCEPT if in an assertion. */
6334
6335 if (verbs[i].op == OP_ACCEPT)
6336 {
6337 open_capitem *oc;
6338 if (arglen != 0)
6339 {
6340 *errorcodeptr = ERR59;
6341 goto FAILED;
6342 }
6343 cd->had_accept = TRUE;
6344 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6345 {
6346 *code++ = OP_CLOSE;
6347 PUT2INC(code, 0, oc->number);
6348 }
6349 setverb = *code++ =
6350 (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6351
6352 /* Do not set firstchar after *ACCEPT */
6353 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
6354 }
6355
6356 /* Handle other cases with/without an argument */
6357
6358 else if (arglen == 0)
6359 {
6360 if (verbs[i].op < 0) /* Argument is mandatory */
6361 {
6362 *errorcodeptr = ERR66;
6363 goto FAILED;
6364 }
6365 setverb = *code++ = verbs[i].op;
6366 }
6367
6368 else
6369 {
6370 if (verbs[i].op_arg < 0) /* Argument is forbidden */
6371 {
6372 *errorcodeptr = ERR59;
6373 goto FAILED;
6374 }
6375 setverb = *code++ = verbs[i].op_arg;
6376 *code++ = arglen;
6377 memcpy(code, arg, IN_UCHARS(arglen));
6378 code += arglen;
6379 *code++ = 0;
6380 }
6381
6382 switch (setverb)
6383 {
6384 case OP_THEN:
6385 case OP_THEN_ARG:
6386 cd->external_flags |= PCRE_HASTHEN;
6387 break;
6388
6389 case OP_PRUNE:
6390 case OP_PRUNE_ARG:
6391 case OP_SKIP:
6392 case OP_SKIP_ARG:
6393 cd->had_pruneorskip = TRUE;
6394 break;
6395 }
6396
6397 break; /* Found verb, exit loop */
6398 }
6399
6400 vn += verbs[i].len + 1;
6401 }
6402
6403 if (i < verbcount) continue; /* Successfully handled a verb */
6404 *errorcodeptr = ERR60; /* Verb not recognized */
6405 goto FAILED;
6406 }
6407
6408 /* Deal with the extended parentheses; all are introduced by '?', and the
6409 appearance of any of them means that this is not a capturing group. */
6410
6411 else if (*ptr == CHAR_QUESTION_MARK)
6412 {
6413 int i, set, unset, namelen;
6414 int *optset;
6415 const pcre_uchar *name;
6416 pcre_uchar *slot;
6417
6418 switch (*(++ptr))
6419 {
6420 case CHAR_NUMBER_SIGN: /* Comment; skip to ket */
6421 ptr++;
6422 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6423 if (*ptr == CHAR_NULL)
6424 {
6425 *errorcodeptr = ERR18;
6426 goto FAILED;
6427 }
6428 continue;
6429
6430
6431 /* ------------------------------------------------------------ */
6432 case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
6433 reset_bracount = TRUE;
6434 /* Fall through */
6435
6436 /* ------------------------------------------------------------ */
6437 case CHAR_COLON: /* Non-capturing bracket */
6438 bravalue = OP_BRA;
6439 ptr++;
6440 break;
6441
6442
6443 /* ------------------------------------------------------------ */
6444 case CHAR_LEFT_PARENTHESIS:
6445 bravalue = OP_COND; /* Conditional group */
6446 tempptr = ptr;
6447
6448 /* A condition can be an assertion, a number (referring to a numbered
6449 group), a name (referring to a named group), or 'R', referring to
6450 recursion. R<digits> and R&name are also permitted for recursion tests.
6451
6452 There are several syntaxes for testing a named group: (?(name)) is used
6453 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
6454
6455 There are two unfortunate ambiguities, caused by history. (a) 'R' can
6456 be the recursive thing or the name 'R' (and similarly for 'R' followed
6457 by digits), and (b) a number could be a name that consists of digits.
6458 In both cases, we look for a name first; if not found, we try the other
6459 cases.
6460
6461 For compatibility with auto-callouts, we allow a callout to be
6462 specified before a condition that is an assertion. First, check for the
6463 syntax of a callout; if found, adjust the temporary pointer that is
6464 used to check for an assertion condition. That's all that is needed! */
6465
6466 if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
6467 {
6468 for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
6469 if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
6470 tempptr += i + 1;
6471 }
6472
6473 /* For conditions that are assertions, check the syntax, and then exit
6474 the switch. This will take control down to where bracketed groups,
6475 including assertions, are processed. */
6476
6477 if (tempptr[1] == CHAR_QUESTION_MARK &&
6478 (tempptr[2] == CHAR_EQUALS_SIGN ||
6479 tempptr[2] == CHAR_EXCLAMATION_MARK ||
6480 tempptr[2] == CHAR_LESS_THAN_SIGN))
6481 break;
6482
6483 /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6484 need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6485
6486 code[1+LINK_SIZE] = OP_CREF;
6487 skipbytes = 1+IMM2_SIZE;
6488 refsign = -1;
6489
6490 /* Check for a test for recursion in a named group. */
6491
6492 if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
6493 {
6494 terminator = -1;
6495 ptr += 2;
6496 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
6497 }
6498
6499 /* Check for a test for a named group's having been set, using the Perl
6500 syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6501 syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). As names may
6502 consist entirely of digits, there is scope for ambiguity. */
6503
6504 else if (ptr[1] == CHAR_LESS_THAN_SIGN)
6505 {
6506 terminator = CHAR_GREATER_THAN_SIGN;
6507 ptr++;
6508 }
6509 else if (ptr[1] == CHAR_APOSTROPHE)
6510 {
6511 terminator = CHAR_APOSTROPHE;
6512 ptr++;
6513 }
6514 else
6515 {
6516 terminator = CHAR_NULL;
6517 if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
6518 }
6519
6520 /* When a name is one of a number of duplicates, a different opcode is
6521 used and it needs more memory. Unfortunately we cannot tell whether a
6522 name is a duplicate in the first pass, so we have to allow for more
6523 memory except when we know it is a relative numerical reference. */
6524
6525 if (refsign < 0 && lengthptr != NULL) *lengthptr += IMM2_SIZE;
6526
6527 /* We now expect to read a name (possibly all digits); anything else
6528 is an error. In the case of all digits, also get it as a number. */
6529
6530 if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)
6531 {
6532 ptr += 1; /* To get the right offset */
6533 *errorcodeptr = ERR28;
6534 goto FAILED;
6535 }
6536
6537 recno = 0;
6538 name = ++ptr;
6539 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
6540 {
6541 if (recno >= 0)
6542 recno = (IS_DIGIT(*ptr))? recno * 10 + (int)(*ptr - CHAR_0) : -1;
6543 ptr++;
6544 }
6545 namelen = (int)(ptr - name);
6546
6547 /* Check the terminator */
6548
6549 if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6550 *ptr++ != CHAR_RIGHT_PARENTHESIS)
6551 {
6552 ptr--; /* Error offset */
6553 *errorcodeptr = ERR26;
6554 goto FAILED;
6555 }
6556
6557 /* Do no further checking in the pre-compile phase. */
6558
6559 if (lengthptr != NULL) break;
6560
6561 /* In the real compile we do the work of looking for the actual
6562 reference. If the string started with "+" or "-" we require the rest to
6563 be digits, in which case recno will be set. */
6564
6565 if (refsign > 0)
6566 {
6567 if (recno <= 0)
6568 {
6569 *errorcodeptr = ERR58;
6570 goto FAILED;
6571 }
6572 recno = (refsign == CHAR_MINUS)?
6573 cd->bracount - recno + 1 : recno +cd->bracount;
6574 if (recno <= 0 || recno > cd->final_bracount)
6575 {
6576 *errorcodeptr = ERR15;
6577 goto FAILED;
6578 }
6579 PUT2(code, 2+LINK_SIZE, recno);
6580 break;
6581 }
6582
6583 /* Otherwise (did not start with "+" or "-"), start by looking for the
6584 name. */
6585
6586 slot = cd->name_table;
6587 for (i = 0; i < cd->names_found; i++)
6588 {
6589 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
6590 slot += cd->name_entry_size;
6591 }
6592
6593 /* Found the named subpattern. If the name is duplicated, add one to
6594 the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6595 appropriate data values. Otherwise, just insert the unique subpattern
6596 number. */
6597
6598 if (i < cd->names_found)
6599 {
6600 int offset = i++;
6601 int count = 1;
6602 recno = GET2(slot, 0); /* Number from first found */
6603 for (; i < cd->names_found; i++)
6604 {
6605 slot += cd->name_entry_size;
6606 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0) break;
6607 count++;
6608 }
6609 if (count > 1)
6610 {
6611 PUT2(code, 2+LINK_SIZE, offset);
6612 PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6613 skipbytes += IMM2_SIZE;
6614 code[1+LINK_SIZE]++;
6615 }
6616 else /* Not a duplicated name */
6617 {
6618 PUT2(code, 2+LINK_SIZE, recno);
6619 }
6620 }
6621
6622 /* If terminator == CHAR_NULL it means that the name followed directly
6623 after the opening parenthesis [e.g. (?(abc)...] and in this case there
6624 are some further alternatives to try. For the cases where terminator !=
6625 0 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
6626 now checked all the possibilities, so give an error. */
6627
6628 else if (terminator != CHAR_NULL)
6629 {
6630 *errorcodeptr = ERR15;
6631 goto FAILED;
6632 }
6633
6634 /* Check for (?(R) for recursion. Allow digits after R to specify a
6635 specific group number. */
6636
6637 else if (*name == CHAR_R)
6638 {
6639 recno = 0;
6640 for (i = 1; i < namelen; i++)
6641 {
6642 if (!IS_DIGIT(name[i]))
6643 {
6644 *errorcodeptr = ERR15;
6645 goto FAILED;
6646 }
6647 recno = recno * 10 + name[i] - CHAR_0;
6648 }
6649 if (recno == 0) recno = RREF_ANY;
6650 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
6651 PUT2(code, 2+LINK_SIZE, recno);
6652 }
6653
6654 /* Similarly, check for the (?(DEFINE) "condition", which is always
6655 false. */
6656
6657 else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
6658 {
6659 code[1+LINK_SIZE] = OP_DEF;
6660 skipbytes = 1;
6661 }
6662