/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1393 - (show annotations)
Fri Nov 8 16:37:21 2013 UTC (5 years, 10 months ago) by ph10
File MIME type: text/plain
File size: 307102 byte(s)
Allow quantifiers on (?!) so as to be the same as other assertions.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2013 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
/* Names that bind three generic macro names to this module's data. They are
defined ahead of the #include of pcre_internal.h that follows; presumably
macros in that header expand to them - TODO confirm in the header. */
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
57 is also used by pcretest. PCRE_DEBUG is not defined when building a production
58 library. We do not need to select pcre16_printint.c specially, because the
59 COMPILE_PCREx macro will already be appropriately set. */
60
61 #ifdef PCRE_DEBUG
62 /* pcre_printint.c should not include any headers */
63 #define PCRE_INCLUDED
64 #include "pcre_printint.c"
65 #undef PCRE_INCLUDED
66 #endif
67
68
/* Macro for setting individual bits in class bitmaps: it sets bit (b & 7) of
byte a[b / 8]. The map argument and the whole expansion are parenthesized so
that a pointer expression such as "map + off" can be passed as the map, and so
that the macro can be used safely inside a larger expression. Note that b is
evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)&7)))
72
73 /* Maximum length value to check against when making sure that the integer that
74 holds the compiled pattern length does not overflow. We make it a bit less than
75 INT_MAX to allow for adding in group terminating bytes, so that we don't have
76 to check them every time. */
77
78 #define OFLOW_MAX (INT_MAX - 20) /* 20 below INT_MAX: headroom for group terminating bytes */
79
80 /* Definitions to allow mutual recursion */
81
/* Prototypes only. add_list_to_class() returns int and compile_regex() returns
BOOL; both are file-local (static). The function bodies lie outside this part
of the file, so the meaning of the individual parameters is not described
here. */
82 static int
83 add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
84 const pcre_uint32 *, unsigned int);
85
86 static BOOL
87 compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
88 pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
89 compile_data *, int *);
90
91
92
93 /*************************************************
94 * Code parameters and static tables *
95 *************************************************/
96
97 /* This value specifies the size of stack workspace that is used during the
98 first pre-compile phase that determines how much memory is required. The regex
99 is partly compiled into this space, but the compiled parts are discarded as
100 soon as they can be, so that hopefully there will never be an overrun. The code
101 does, however, check for an overrun. The largest amount I've seen used is 218,
102 so this number is very generous.
103
104 The same workspace is used during the second, actual compile phase for
105 remembering forward references to groups so that they can be filled in at the
106 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
107 is 4 there is plenty of room for most patterns. However, the memory can get
108 filled up by repetitions of forward references, for example patterns like
109 /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
110 that the workspace is expanded using malloc() in this situation. The value
111 below is therefore a minimum, and we put a maximum on it for safety. The
112 minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
113 kicks in at the same number of forward references in all cases. */
114
115 #define COMPILE_WORK_SIZE (2048*LINK_SIZE) /* Minimum workspace size; scales with LINK_SIZE */
116 #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE) /* Safety cap: 100 times the minimum */
117
118 /* This value determines the size of the initial vector that is used for
119 remembering named groups during the pre-compile. It is allocated on the stack,
120 but if it is too small, it is expanded using malloc(), in a similar way to the
121 workspace. The value is the number of slots in the list. */
122
123 #define NAMED_GROUP_LIST_SIZE 20 /* Initial number of slots, not bytes */
124
125 /* The overrun tests check for a slightly smaller size so that they detect the
126 overrun before it actually does run off the end of the data block. */
127
128 #define WORK_SIZE_SAFETY_MARGIN (100) /* Slack kept free by the overrun tests */
129
130 /* Private flags added to firstchar and reqchar. */
131
/* REQ_CASELESS and REQ_VARY are distinct single-bit values (bits 0 and 1), so
they can be ORed together. REQ_UNSET and REQ_NONE are negative and therefore
cannot be confused with any combination of those bits. */
132 #define REQ_CASELESS (1 << 0) /* Indicates caselessness */
133 #define REQ_VARY (1 << 1) /* Reqchar followed non-literal item */
134 /* Negative values for the firstchar and reqchar flags */
135 #define REQ_UNSET (-2) /* Presumably "not yet determined" - confirm at the use sites */
136 #define REQ_NONE (-1) /* Presumably "no such character" - confirm at the use sites */
137
138 /* Repeated character flags. */
139
140 #define UTF_LENGTH 0x10000000l /* The char contains its length. The trailing 'l' is the long suffix (letter L), not a digit one. */
141
142 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
143 are simple data values; negative values are for special things like \d and so
144 on. Zero means further processing is needed (for things like \x), or the escape
145 is invalid. */
146
147 #ifndef EBCDIC
148
149 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
150 in UTF-8 mode. */
151
/* Two entries per line, in ASCII order starting at '0' and ending with the
single entry for 'z' (75 entries in all). The trailing comments below mark the
characters at which each run begins. */
152 static const short int escapes[] = {
153 0, 0, /* '0' '1' */
154 0, 0,
155 0, 0,
156 0, 0,
157 0, 0, /* '8' '9' */
158 CHAR_COLON, CHAR_SEMICOLON, /* ':' ';' */
159 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
160 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
161 CHAR_COMMERCIAL_AT, -ESC_A, /* '@' 'A' - upper case letters start here */
162 -ESC_B, -ESC_C,
163 -ESC_D, -ESC_E,
164 0, -ESC_G,
165 -ESC_H, 0,
166 0, -ESC_K,
167 0, 0,
168 -ESC_N, 0,
169 -ESC_P, -ESC_Q,
170 -ESC_R, -ESC_S,
171 0, 0,
172 -ESC_V, -ESC_W,
173 -ESC_X, 0,
174 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, /* 'Z' '[' */
175 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
176 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
177 CHAR_GRAVE_ACCENT, 7, /* '`' 'a' - the entry for 'a' is the data value 7 */
178 -ESC_b, 0,
179 -ESC_d, ESC_e,
180 ESC_f, 0,
181 -ESC_h, 0,
182 0, -ESC_k,
183 0, 0,
184 ESC_n, 0,
185 -ESC_p, 0,
186 ESC_r, -ESC_s,
187 ESC_tee, 0,
188 -ESC_v, -ESC_w,
189 0, 0, /* 'x' 'y' */
190 -ESC_z /* 'z' */
191 };
192
193 #else
194
195 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
196
/* Eight entries per line. The comment at the start of each line is the
hexadecimal code of that line's first entry, so the table covers 0x48 to
0xFF. */
197 static const short int escapes[] = {
198 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
199 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
200 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
201 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
202 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
203 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
204 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
205 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
206 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
207 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
208 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
209 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
210 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
211 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
212 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
213 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
214 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
215 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
216 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
217 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
218 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
219 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
220 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
221 };
222 #endif
223
224
225 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
226 searched linearly. Put all the names into a single string, in order to reduce
227 the number of relocations when a shared library is dynamically linked. The
228 string is built from string macros so that it works in UTF-8 mode on EBCDIC
229 platforms. */
230
/* Descriptor for one verb. The verbs[] table holds one of these for each name
in verbnames[], in the same order. */
231 typedef struct verbitem {
232 int len; /* Length of verb name */
233 int op; /* Op when no arg, or -1 if arg mandatory */
234 int op_arg; /* Op when arg present, or -1 if not allowed */
235 } verbitem;
236
/* All verb names in one string, in the same order as the entries of verbs[].
The STRING_xxx macros are defined elsewhere; presumably each STRING_xxx0 form
expands to the name followed by a zero, with the final STRING_THEN relying on
the string literal's own terminator - TODO confirm in the header. */
237 static const char verbnames[] =
238 "\0" /* Empty name is a shorthand for MARK */
239 STRING_MARK0
240 STRING_ACCEPT0
241 STRING_COMMIT0
242 STRING_F0
243 STRING_FAIL0
244 STRING_PRUNE0
245 STRING_SKIP0
246 STRING_THEN;
247
/* One entry per name in verbnames[], in the same order. The fields are the
name length, the opcode used without an argument (-1 if an argument is
mandatory) and the opcode used with an argument (-1 if none is allowed). */
248 static const verbitem verbs[] = {
249 { 0, -1, OP_MARK }, /* empty name: argument mandatory */
250 { 4, -1, OP_MARK }, /* MARK: argument mandatory */
251 { 6, OP_ACCEPT, -1 }, /* ACCEPT: no argument allowed */
252 { 6, OP_COMMIT, -1 }, /* COMMIT: no argument allowed */
253 { 1, OP_FAIL, -1 }, /* F: no argument allowed */
254 { 4, OP_FAIL, -1 }, /* FAIL: no argument allowed */
255 { 5, OP_PRUNE, OP_PRUNE_ARG }, /* PRUNE: argument optional */
256 { 4, OP_SKIP, OP_SKIP_ARG }, /* SKIP: argument optional */
257 { 4, OP_THEN, OP_THEN_ARG } /* THEN: argument optional */
258 };
259
260 static const int verbcount = sizeof(verbs)/sizeof(verbitem); /* Number of entries in verbs[] */
261
262
263 /* Tables of names of POSIX character classes and their lengths. The names are
264 now all in a single string, to reduce the number of relocations when a shared
265 library is dynamically loaded. The list of lengths is terminated by a zero
266 length entry. The first three must be alpha, lower, upper, as this is assumed
267 for handling case independence. The indices for graph, print, and punct are
268 needed, so identify them. */
269
/* Class names, four per line, in index order from 0 (alpha) to 13 (xdigit). */
270 static const char posix_names[] =
271 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
272 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
273 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
274 STRING_word0 STRING_xdigit;
275
/* One length per name above, in the same order ("word" is 4 and "xdigit" is
6; the rest are 5), followed by the terminating zero. */
276 static const pcre_uint8 posix_name_lengths[] = {
277 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
278
279 #define PC_GRAPH 8 /* Position of graph in posix_names */
280 #define PC_PRINT 9 /* Position of print in posix_names */
281 #define PC_PUNCT 10 /* Position of punct in posix_names */
282
283
284 /* Table of class bit maps for each POSIX class. Each class is formed from a
285 base map, with an optional addition or removal of another map. Then, for some
286 classes, there is some additional tweaking: for [:blank:] the vertical space
287 characters are removed, and for [:alpha:] and [:alnum:] the underscore
288 character is removed. The triples in the table consist of the base map offset,
289 second map offset or -1 if no second map, and a non-negative value for map
290 addition or a negative value for map subtraction (if there are two maps). The
291 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
292 remove vertical space characters, 2 => remove underscore. */
293
/* Three entries per class, one class per line, in the same order as the names
in posix_names. Each line holds the base map offset, the second map offset (or
-1), and the add/subtract-and-tweak value. */
294 static const int posix_class_maps[] = {
295 cbit_word, cbit_digit, -2, /* alpha */
296 cbit_lower, -1, 0, /* lower */
297 cbit_upper, -1, 0, /* upper */
298 cbit_word, -1, 2, /* alnum - word without underscore */
299 cbit_print, cbit_cntrl, 0, /* ascii */
300 cbit_space, -1, 1, /* blank - a GNU extension */
301 cbit_cntrl, -1, 0, /* cntrl */
302 cbit_digit, -1, 0, /* digit */
303 cbit_graph, -1, 0, /* graph */
304 cbit_print, -1, 0, /* print */
305 cbit_punct, -1, 0, /* punct */
306 cbit_space, -1, 0, /* space */
307 cbit_word, -1, 0, /* word - a Perl extension */
308 cbit_xdigit,-1, 0 /* xdigit */
309 };
310
311 /* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
312 Unicode property escapes. */
313
314 #ifdef SUPPORT_UCP
/* Each string_xxx array below spells out, one code unit at a time, the
zero-terminated pattern text shown in the comment on its first line. */
315 static const pcre_uchar string_PNd[] = { /* "\P{Nd}" */
316 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
317 CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
318 static const pcre_uchar string_pNd[] = { /* "\p{Nd}" */
319 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
320 CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
321 static const pcre_uchar string_PXsp[] = { /* "\P{Xsp}" */
322 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
323 CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
324 static const pcre_uchar string_pXsp[] = { /* "\p{Xsp}" */
325 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
326 CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
327 static const pcre_uchar string_PXwd[] = { /* "\P{Xwd}" */
328 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
329 CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
330 static const pcre_uchar string_pXwd[] = { /* "\p{Xwd}" */
331 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
332 CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
333
/* Replacement text for the six escapes named in the trailing comments, with
each negated (upper case) form immediately before its positive form. */
334 static const pcre_uchar *substitutes[] = {
335 string_PNd, /* \D */
336 string_pNd, /* \d */
337 string_PXsp, /* \S */ /* Xsp is Perl space, but from 8.34, Perl */
338 string_pXsp, /* \s */ /* space and POSIX space are the same. */
339 string_PXwd, /* \W */
340 string_pXwd /* \w */
341 };
342
343 /* The POSIX class substitutes must be in the order of the POSIX class names,
344 defined above, and there are both positive and negative cases. NULL means no
345 general substitute of a Unicode property escape (\p or \P). However, for some
346 POSIX classes (e.g. graph, print, punct) a special property code is compiled
347 directly. */
348
349 static const pcre_uchar string_pL[] = { /* "\p{L}" */
350 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
351 CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
352 static const pcre_uchar string_pLl[] = { /* "\p{Ll}" */
353 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
354 CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
355 static const pcre_uchar string_pLu[] = { /* "\p{Lu}" */
356 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
357 CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
358 static const pcre_uchar string_pXan[] = { /* "\p{Xan}" */
359 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
360 CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
361 static const pcre_uchar string_h[] = { /* "\h" */
362 CHAR_BACKSLASH, CHAR_h, '\0' };
363 static const pcre_uchar string_pXps[] = { /* "\p{Xps}" */
364 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
365 CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
366 static const pcre_uchar string_PL[] = { /* "\P{L}" */
367 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
368 CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
369 static const pcre_uchar string_PLl[] = { /* "\P{Ll}" */
370 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
371 CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
372 static const pcre_uchar string_PLu[] = { /* "\P{Lu}" */
373 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
374 CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
375 static const pcre_uchar string_PXan[] = { /* "\P{Xan}" */
376 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
377 CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
378 static const pcre_uchar string_H[] = { /* "\H" */
379 CHAR_BACKSLASH, CHAR_H, '\0' };
380 static const pcre_uchar string_PXps[] = { /* "\P{Xps}" */
381 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
382 CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
383
/* Fourteen positive entries followed by the fourteen negated ones, each group
in the order of the names in posix_names. */
384 static const pcre_uchar *posix_substitutes[] = {
385 string_pL, /* alpha */
386 string_pLl, /* lower */
387 string_pLu, /* upper */
388 string_pXan, /* alnum */
389 NULL, /* ascii */
390 string_h, /* blank */
391 NULL, /* cntrl */
392 string_pNd, /* digit */
393 NULL, /* graph */
394 NULL, /* print */
395 NULL, /* punct */
396 string_pXps, /* space */ /* Xps is POSIX space, but from 8.34 */
397 string_pXwd, /* word */ /* Perl and POSIX space are the same */
398 NULL, /* xdigit */
399 /* Negated cases */
400 string_PL, /* ^alpha */
401 string_PLl, /* ^lower */
402 string_PLu, /* ^upper */
403 string_PXan, /* ^alnum */
404 NULL, /* ^ascii */
405 string_H, /* ^blank */
406 NULL, /* ^cntrl */
407 string_PNd, /* ^digit */
408 NULL, /* ^graph */
409 NULL, /* ^print */
410 NULL, /* ^punct */
411 string_PXps, /* ^space */ /* Xps is POSIX space, but from 8.34 */
412 string_PXwd, /* ^word */ /* Perl and POSIX space are the same */
413 NULL /* ^xdigit */
414 };
/* Number of entries (positive plus negated) in posix_substitutes. */
415 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
416 #endif
417
/* Two-level stringification. STRING turns its argument into a string literal
exactly as written; XSTRING passes its argument through one more macro call so
that a macro such as MAX_NAME_SIZE is expanded to its value before it is
stringified. */
418 #define STRING(a) # a
419 #define XSTRING(s) STRING(s)
420
421 /* The texts of compile-time error messages. These are "char *" because they
422 are passed to the outside world. Do not ever re-use any error number, because
423 they are documented. Always add a new error instead. Messages marked DEAD below
424 are no longer used. This used to be a table of strings, but in order to reduce
425 the number of relocations needed when a shared library is loaded dynamically,
426 it is now one long string. We cannot use a table of offsets, because the
427 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
428 simply count through to the one we want - this isn't a performance issue
429 because these strings are used only when there is a compilation error.
430
431 Each substring ends with \0 to insert a null character. This includes the final
432 substring, so that the whole string ends with \0\0, which can be detected when
433 counting through. */
434
/* The messages are stored back to back, each ending with an explicit \0. The
numeric comments mark the position of every fifth message, counting from zero
at "no error". */
435 static const char error_texts[] =
436 "no error\0"
437 "\\ at end of pattern\0"
438 "\\c at end of pattern\0"
439 "unrecognized character follows \\\0"
440 "numbers out of order in {} quantifier\0"
441 /* 5 */
442 "number too big in {} quantifier\0"
443 "missing terminating ] for character class\0"
444 "invalid escape sequence in character class\0"
445 "range out of order in character class\0"
446 "nothing to repeat\0"
447 /* 10 */
448 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
449 "internal error: unexpected repeat\0"
450 "unrecognized character after (? or (?-\0"
451 "POSIX named classes are supported only within a class\0"
452 "missing )\0"
453 /* 15 */
454 "reference to non-existent subpattern\0"
455 "erroffset passed as NULL\0"
456 "unknown option bit(s) set\0"
457 "missing ) after comment\0"
458 "parentheses nested too deeply\0" /** DEAD **/
459 /* 20 */
460 "regular expression is too large\0"
461 "failed to get memory\0"
462 "unmatched parentheses\0"
463 "internal error: code overflow\0"
464 "unrecognized character after (?<\0"
465 /* 25 */
466 "lookbehind assertion is not fixed length\0"
467 "malformed number or name after (?(\0"
468 "conditional group contains more than two branches\0"
469 "assertion expected after (?(\0"
470 "(?R or (?[+-]digits must be followed by )\0"
471 /* 30 */
472 "unknown POSIX class name\0"
473 "POSIX collating elements are not supported\0"
474 "this version of PCRE is compiled without UTF support\0"
475 "spare error\0" /** DEAD **/
476 "character value in \\x{} or \\o{} is too large\0"
477 /* 35 */
478 "invalid condition (?(0)\0"
479 "\\C not allowed in lookbehind assertion\0"
480 "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
481 "number after (?C is > 255\0"
482 "closing ) for (?C expected\0"
483 /* 40 */
484 "recursive call could loop indefinitely\0"
485 "unrecognized character after (?P\0"
486 "syntax error in subpattern name (missing terminator)\0"
487 "two named subpatterns have the same name\0"
488 "invalid UTF-8 string\0"
489 /* 45 */
490 "support for \\P, \\p, and \\X has not been compiled\0"
491 "malformed \\P or \\p sequence\0"
492 "unknown property name after \\P or \\p\0"
493 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
494 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
495 /* 50 */
496 "repeated subpattern is too long\0" /** DEAD **/
497 "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
498 "internal error: overran compiling workspace\0"
499 "internal error: previously-checked referenced subpattern not found\0"
500 "DEFINE group contains more than one branch\0"
501 /* 55 */
502 "repeating a DEFINE group is not allowed\0" /** DEAD **/
503 "inconsistent NEWLINE options\0"
504 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
505 "a numbered reference must not be zero\0"
506 "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
507 /* 60 */
508 "(*VERB) not recognized or malformed\0"
509 "number is too big\0"
510 "subpattern name expected\0"
511 "digit expected after (?+\0"
512 "] is an invalid data character in JavaScript compatibility mode\0"
513 /* 65 */
514 "different names for subpatterns of the same number are not allowed\0"
515 "(*MARK) must have an argument\0"
516 "this version of PCRE is not compiled with Unicode property support\0"
517 "\\c must be followed by an ASCII character\0"
518 "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
519 /* 70 */
520 "internal error: unknown opcode in find_fixedlength()\0"
521 "\\N is not supported in a class\0"
522 "too many forward references\0"
523 "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
524 "invalid UTF-16 string\0"
525 /* 75 */
526 "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
527 "character value in \\u.... sequence is too large\0"
528 "invalid UTF-32 string\0"
529 "setting UTF is disabled by the application\0"
530 "non-hex character in \\x{} (closing brace missing?)\0"
531 /* 80 */
532 "non-octal character in \\o{} (closing brace missing?)\0"
533 "missing opening brace after \\o\0"
534 "parentheses are too deeply nested\0"
535 "invalid range in character class\0"
536 ; /* The literal's own terminator follows the last explicit \0, giving \0\0 */
537
538 /* Table to identify digits and hex digits. This is used when compiling
539 patterns. Note that the tables in chartables are dependent on the locale, and
540 may mark arbitrary characters as digits - but the PCRE compiling code expects
541 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
542 a private table here. It costs 256 bytes, but it is a lot faster than doing
543 character value tests (at least in some simple cases I timed), and in some
544 applications one wants PCRE to compile efficiently as well as match
545 efficiently.
546
547 For convenience, we use the same bit definitions as in chartables:
548
549 0x04 decimal digit
550 0x08 hexadecimal digit
551
552 Then we can use ctype_digit and ctype_xdigit in the code. */
553
554 /* Using a simple comparison for decimal numbers rather than a memory read
555 is much faster, and the resulting code is simpler (the compiler turns it
556 into a subtraction and unsigned comparison). */
557
558 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9) /* NB: x may be evaluated twice, so avoid side effects */
559
560 #ifndef EBCDIC
561
562 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
563 UTF-8 mode. */
564
/* 256 entries, eight per line; the comment on each line gives the range of
character values it covers. 0x0c (0x04 | 0x08) marks a character that is both
a decimal and a hexadecimal digit; 0x08 marks a hexadecimal-only digit. */
565 static const pcre_uint8 digitab[] =
566 {
567 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
568 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
569 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
570 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
571 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
572 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
573 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
574 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
575 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
576 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
577 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
578 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
579 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
580 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
581 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
582 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
583 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
584 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
585 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
586 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
587 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
588 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
589 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
590 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
591 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
592 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
593 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
594 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
595 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
596 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
597 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
598 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
599
600 #else
601
602 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
603
/* Same layout and bit values as the ASCII table, with the digit entries at
their EBCDIC positions. The last column of each odd-numbered line's comment is
the hexadecimal value at which that pair of lines starts. */
604 static const pcre_uint8 digitab[] =
605 {
606 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
607 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
608 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
609 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
610 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
611 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
612 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
613 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
614 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
615 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
616 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
617 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
618 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
619 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
620 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
621 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
622 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
623 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
624 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
625 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
626 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
627 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
628 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
629 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
630 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
631 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
632 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
633 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
634 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
635 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
636 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
637 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
638
/* A second 256-entry table that exists only in the EBCDIC build. NOTE(review):
the meaning of its bit values is not defined in this part of the file; the
"partial dup" remark suggests it copies part of the main character tables -
confirm against chartables. */
639 static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
640 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
641 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
642 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
643 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
644 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
645 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
646 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
647 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
648 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
649 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
650 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
651 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
652 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
653 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
654 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
655 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
656 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
657 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
658 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
659 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
660 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
661 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
662 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
663 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
664 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
665 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
666 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
667 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
668 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
669 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
670 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
671 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
672 #endif
673
674
675 /* This table is used to check whether auto-possessification is possible
676 between adjacent character-type opcodes. The left-hand (repeated) opcode is
677 used to select the row, and the right-hand opcode is used to select the column.
678 A value of 1 means that auto-possessification is OK. For example, the second
679 value in the first row means that \D+\d can be turned into \D++\d.
680
681 The Unicode property types (\P and \p) have to be present to fill out the table
682 because of what their opcode values are, but the table values should always be
683 zero because property types are handled separately in the code. The last four
684 columns apply to items that cannot be repeated, so there is no need to have
685 rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
686 *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
687
688 #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1) /* Number of left-hand (row) opcodes */
689 #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1) /* Number of right-hand (column) opcodes */
690
/* Each row is labelled on the right with its left-hand item; the header
comment labels the columns with the right-hand items. There are 17 rows of 21
values here. */
691 static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
692 /* \D \d \S \s \W \w . .+ \C \P \p \R \H \h \V \v \X \Z \z $ $M */
693 { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \D */
694 { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \d */
695 { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \S */
696 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \s */
697 { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \W */
698 { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \w */
699 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* . */
700 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* .+ */
701 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \C */
702 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \P */
703 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \p */
704 { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \R */
705 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \H */
706 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \h */
707 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \V */
708 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 }, /* \v */
709 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 } /* \X */
710 };
711
712
713 /* This table is used to check whether auto-possessification is possible
714 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
715 left-hand (repeated) opcode is used to select the row, and the right-hand
716 opcode is used to select the column. The values are as follows:
717
718 0 Always return FALSE (never auto-possessify)
719 1 Character groups are distinct (possessify if both are OP_PROP)
720 2 Check character categories in the same group (general or particular)
721 3 TRUE if the two opcodes are not the same (PROP vs NOTPROP)
722
723 4 Check left general category vs right particular category
724 5 Check right general category vs left particular category
725
726 6 Left alphanum vs right general category
727 7 Left space vs right general category
728 8 Left word vs right general category
729
730 9 Right alphanum vs left general category
731 10 Right space vs left general category
732 11 Right word vs left general category
733
734 12 Left alphanum vs right particular category
735 13 Left space vs right particular category
736 14 Left word vs right particular category
737
738 15 Right alphanum vs left particular category
739 16 Right space vs left particular category
740 17 Right word vs left particular category
741 */
742
743 static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = { /* [left (repeated) type][right type] -> action code 0-17 listed above */
744 /* ANY LAMP GC PC SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
745 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_ANY */
746 { 0, 3, 0, 0, 0, 3, 1, 1, 0, 0, 0 }, /* PT_LAMP */
747 { 0, 0, 2, 4, 0, 9, 10, 10, 11, 0, 0 }, /* PT_GC */
748 { 0, 0, 5, 2, 0, 15, 16, 16, 17, 0, 0 }, /* PT_PC */
749 { 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, /* PT_SC */
750 { 0, 3, 6, 12, 0, 3, 1, 1, 0, 0, 0 }, /* PT_ALNUM */
751 { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_SPACE */
752 { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_PXSPACE */
753 { 0, 0, 8, 14, 0, 0, 1, 1, 3, 0, 0 }, /* PT_WORD */
754 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */
755 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 } /* PT_UCNC */
756 };
757
758 /* This table is used to check whether auto-possessification is possible
759 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
760 specifies a general category and the other specifies a particular category. The
761 row is selected by the general category and the column by the particular
762 category. The value is 1 if the particular category is not part of the general
763 category. */
764
765 static const pcre_uint8 catposstab[7][30] = { /* [general category: 7 rows][particular category: 30 columns]; 1 = particular is not in general */
766 /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
767 { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* C */
768 { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* L */
769 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* M */
770 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* N */
771 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 }, /* P */
772 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 }, /* S */
773 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 } /* Z */
774 };
775
776 /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
777 a general or particular category. The properties in each row are those
778 that apply to the character set in question. Duplication means that a little
779 unnecessary work is done when checking, but this keeps things much simpler
780 because they can all use the same code. For more details see the comment where
781 this table is used.
782
783 Note: SPACE and PXSPACE used to be different because Perl excluded VT from
784 "space", but from Perl 5.18 it's included, so both categories are treated the
785 same here. */
786
787 static const pcre_uint8 posspropstab[3][4] = { /* One row per character set; each row holds four ucp_ category values */
788 { ucp_L, ucp_N, ucp_N, ucp_Nl }, /* ALNUM, 3rd and 4th values redundant */
789 { ucp_Z, ucp_Z, ucp_C, ucp_Cc }, /* SPACE and PXSPACE, 2nd value redundant */
790 { ucp_L, ucp_N, ucp_P, ucp_Po } /* WORD */
791 };
792
793 /* This table is used when converting repeating opcodes into possessified
794 versions as a result of an explicit possessive quantifier such as ++. A zero
795 value means there is no possessified version - in those cases the item in
796 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
797 because all relevant opcodes are less than that. */
798
799 static const pcre_uint8 opcode_possessify[] = { /* Entry is the possessive opcode for that position, or 0 if there is none */
800 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 15 */
801 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 - 31 */
802
803 0, /* NOTI */
804 OP_POSSTAR, 0, /* STAR, MINSTAR */
805 OP_POSPLUS, 0, /* PLUS, MINPLUS */
806 OP_POSQUERY, 0, /* QUERY, MINQUERY */
807 OP_POSUPTO, 0, /* UPTO, MINUPTO */
808 0, /* EXACT */
809 0, 0, 0, 0, /* POS{STAR,PLUS,QUERY,UPTO} */
810
811 OP_POSSTARI, 0, /* STARI, MINSTARI */
812 OP_POSPLUSI, 0, /* PLUSI, MINPLUSI */
813 OP_POSQUERYI, 0, /* QUERYI, MINQUERYI */
814 OP_POSUPTOI, 0, /* UPTOI, MINUPTOI */
815 0, /* EXACTI */
816 0, 0, 0, 0, /* POS{STARI,PLUSI,QUERYI,UPTOI} */
817
818 OP_NOTPOSSTAR, 0, /* NOTSTAR, NOTMINSTAR */
819 OP_NOTPOSPLUS, 0, /* NOTPLUS, NOTMINPLUS */
820 OP_NOTPOSQUERY, 0, /* NOTQUERY, NOTMINQUERY */
821 OP_NOTPOSUPTO, 0, /* NOTUPTO, NOTMINUPTO */
822 0, /* NOTEXACT */
823 0, 0, 0, 0, /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
824
825 OP_NOTPOSSTARI, 0, /* NOTSTARI, NOTMINSTARI */
826 OP_NOTPOSPLUSI, 0, /* NOTPLUSI, NOTMINPLUSI */
827 OP_NOTPOSQUERYI, 0, /* NOTQUERYI, NOTMINQUERYI */
828 OP_NOTPOSUPTOI, 0, /* NOTUPTOI, NOTMINUPTOI */
829 0, /* NOTEXACTI */
830 0, 0, 0, 0, /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
831
832 OP_TYPEPOSSTAR, 0, /* TYPESTAR, TYPEMINSTAR */
833 OP_TYPEPOSPLUS, 0, /* TYPEPLUS, TYPEMINPLUS */
834 OP_TYPEPOSQUERY, 0, /* TYPEQUERY, TYPEMINQUERY */
835 OP_TYPEPOSUPTO, 0, /* TYPEUPTO, TYPEMINUPTO */
836 0, /* TYPEEXACT */
837 0, 0, 0, 0, /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
838
839 OP_CRPOSSTAR, 0, /* CRSTAR, CRMINSTAR */
840 OP_CRPOSPLUS, 0, /* CRPLUS, CRMINPLUS */
841 OP_CRPOSQUERY, 0, /* CRQUERY, CRMINQUERY */
842 OP_CRPOSRANGE, 0, /* CRRANGE, CRMINRANGE */
843 0, 0, 0, 0, /* CRPOS{STAR,PLUS,QUERY,RANGE} */
844
845 0, 0, 0, /* CLASS, NCLASS, XCLASS */
846 0, 0, /* REF, REFI */
847 0, 0, /* DNREF, DNREFI */
848 0, 0 /* RECURSE, CALLOUT */
849 };
850
851
852
853 /*************************************************
854 * Find an error text *
855 *************************************************/
856
857 /* The error texts are now all in one long string, to save on relocations. As
858 some of the text is of unknown length, we can't use a table of offsets.
859 Instead, just count through the strings. This is not a performance issue
860 because it happens only when there has been a compilation error.
861
862 Argument: the error number
863 Returns: pointer to the error string
864 */
865
866 static const char *
867 find_error_text(int n)
868 {
869 const char *s = error_texts;
870 for (; n > 0; n--)
871 {
872 while (*s++ != CHAR_NULL) {};
873 if (*s == CHAR_NULL) return "Error text not found (please report)";
874 }
875 return s;
876 }
877
878
879
880 /*************************************************
881 * Expand the workspace *
882 *************************************************/
883
884 /* This function is called during the second compiling phase, if the number of
885 forward references fills the existing workspace, which is originally a block on
886 the stack. A larger block is obtained from malloc() unless the ultimate limit
887 has been reached or the increase will be rather small.
888
889 Argument: pointer to the compile data block
890 Returns: 0 if all went well, else an error number
891 */
892
893 static int
894 expand_workspace(compile_data *cd)
895 {
896 pcre_uchar *newspace;
897 int newsize = cd->workspace_size * 2;
898
899 if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
900 if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
901 newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
902 return ERR72;
903
904 newspace = (PUBL(malloc))(IN_UCHARS(newsize));
905 if (newspace == NULL) return ERR21;
906 memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
907 cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
908 if (cd->workspace_size > COMPILE_WORK_SIZE)
909 (PUBL(free))((void *)cd->start_workspace);
910 cd->start_workspace = newspace;
911 cd->workspace_size = newsize;
912 return 0;
913 }
914
915
916
917 /*************************************************
918 * Check for counted repeat *
919 *************************************************/
920
921 /* This function is called when a '{' is encountered in a place where it might
922 start a quantifier. It looks ahead to see if it really is a quantifier or not.
923 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
924 where the ddds are digits.
925
926 Arguments:
927 p pointer to the first char after '{'
928
929 Returns: TRUE or FALSE
930 */
931
932 static BOOL
933 is_counted_repeat(const pcre_uchar *p)
934 {
935 if (!IS_DIGIT(*p)) return FALSE;
936 p++;
937 while (IS_DIGIT(*p)) p++;
938 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
939
940 if (*p++ != CHAR_COMMA) return FALSE;
941 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
942
943 if (!IS_DIGIT(*p)) return FALSE;
944 p++;
945 while (IS_DIGIT(*p)) p++;
946
947 return (*p == CHAR_RIGHT_CURLY_BRACKET);
948 }
949
950
951
952 /*************************************************
953 * Handle escapes *
954 *************************************************/
955
956 /* This function is called when a \ has been encountered. It returns 0 for a
957 data character, whose value is placed in *chptr, or a positive ESC_ code for
958 a special escape sequence. A backreference to group n is returned as negative
959 n. When UTF-8 is enabled, a value greater than 255 may be placed in *chptr.
960 On entry, *ptrptr is pointing at the \. On exit, it is on the final
961 character of the escape sequence.
962
963 Arguments:
964 ptrptr points to the pattern position pointer
965 chptr points to a returned data character
966 errorcodeptr points to the errorcode variable
967 bracount number of previous extracting brackets
968 options the options bits
969 isclass TRUE if inside a character class
970
971 Returns: zero => a data character
972 positive => a special escape sequence
973 negative => a back reference
974 on error, errorcodeptr is set (ptrptr and chptr are still updated)
975 */
976
977 static int
978 check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
979 int bracount, int options, BOOL isclass)
980 {
981 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
982 BOOL utf = (options & PCRE_UTF8) != 0;
983 const pcre_uchar *ptr = *ptrptr + 1; /* First character after the \ */
984 pcre_uint32 c;
985 int escape = 0; /* Stays 0 when the result is a data character */
986 int i; /* Result of the escapes[] lookup; later reused as a loop counter */
987
988 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
989 ptr--; /* Set pointer back to the last byte */
990
991 /* If backslash is at the end of the pattern, it's an error. */
992
993 if (c == CHAR_NULL) *errorcodeptr = ERR1;
994
995 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
996 in a table. A non-zero result is something that can be returned immediately.
997 Otherwise further processing may be required. */
998
999 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1000 /* Not alphanumeric */
1001 else if (c < CHAR_0 || c > CHAR_z) {}
1002 else if ((i = escapes[c - CHAR_0]) != 0)
1003 { if (i > 0) c = (pcre_uint32)i; else escape = -i; } /* Positive entry: data character; negative entry: negated escape code */
1004
1005 #else /* EBCDIC coding */
1006 /* Not alphanumeric */
1007 else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
1008 else if ((i = escapes[c - 0x48]) != 0) { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1009 #endif
1010
1011 /* Escapes that need further processing, or are illegal. Note that i is zero
1012 whenever this branch is entered, because the table lookup above yielded 0. */
1013 else
1014 {
1015 const pcre_uchar *oldptr;
1016 BOOL braced, negated, overflow;
1017 int s; /* Accumulates a decimal group number */
1018
1019 switch (c)
1020 {
1021 /* A number of Perl escapes are not handled by PCRE. We give an explicit
1022 error. */
1023
1024 case CHAR_l:
1025 case CHAR_L:
1026 *errorcodeptr = ERR37;
1027 break;
1028
1029 case CHAR_u:
1030 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1031 {
1032 /* In JavaScript, \u must be followed by four hexadecimal numbers.
1033 Otherwise it is a lowercase u letter. */
1034 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1035 && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
1036 && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
1037 && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
1038 {
1039 c = 0;
1040 for (i = 0; i < 4; ++i)
1041 {
1042 register pcre_uint32 cc = *(++ptr);
1043 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1044 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1045 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1046 #else /* EBCDIC coding */
1047 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1048 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1049 #endif
1050 }
1051
1052 #if defined COMPILE_PCRE8
1053 if (c > (utf ? 0x10ffffU : 0xffU))
1054 #elif defined COMPILE_PCRE16
1055 if (c > (utf ? 0x10ffffU : 0xffffU))
1056 #elif defined COMPILE_PCRE32
1057 if (utf && c > 0x10ffffU)
1058 #endif
1059 {
1060 *errorcodeptr = ERR76;
1061 }
1062 else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1063 }
1064 }
1065 else
1066 *errorcodeptr = ERR37;
1067 break;
1068
1069 case CHAR_U:
1070 /* In JavaScript, \U is an uppercase U letter. */
1071 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
1072 break;
1073
1074 /* In a character class, \g is just a literal "g". Outside a character
1075 class, \g must be followed by one of a number of specific things:
1076
1077 (1) A number, either plain or braced. If positive, it is an absolute
1078 backreference. If negative, it is a relative backreference. This is a Perl
1079 5.10 feature.
1080
1081 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1082 is part of Perl's movement towards a unified syntax for back references. As
1083 this is synonymous with \k{name}, we fudge it up by pretending it really
1084 was \k.
1085
1086 (3) For Oniguruma compatibility we also support \g followed by a name or a
1087 number either in angle brackets or in single quotes. However, these are
1088 (possibly recursive) subroutine calls, _not_ backreferences. Just return
1089 the ESC_g code (cf \k). */
1090
1091 case CHAR_g:
1092 if (isclass) break;
1093 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
1094 {
1095 escape = ESC_g;
1096 break;
1097 }
1098
1099 /* Handle the Perl-compatible cases */
1100
1101 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1102 {
1103 const pcre_uchar *p;
1104 for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
1105 if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
1106 if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET) /* Braces hold something other than digits and minus: a name */
1107 {
1108 escape = ESC_k;
1109 break;
1110 }
1111 braced = TRUE;
1112 ptr++;
1113 }
1114 else braced = FALSE;
1115
1116 if (ptr[1] == CHAR_MINUS)
1117 {
1118 negated = TRUE;
1119 ptr++;
1120 }
1121 else negated = FALSE;
1122
1123 /* The integer range is limited by the machine's int representation. */
1124 s = 0;
1125 overflow = FALSE;
1126 while (IS_DIGIT(ptr[1]))
1127 {
1128 if (s > INT_MAX / 10 - 1) /* Integer overflow */
1129 {
1130 overflow = TRUE;
1131 break;
1132 }
1133 s = s * 10 + (int)(*(++ptr) - CHAR_0);
1134 }
1135 if (overflow) /* Integer overflow */
1136 {
1137 while (IS_DIGIT(ptr[1]))
1138 ptr++;
1139 *errorcodeptr = ERR61;
1140 break;
1141 }
1142
1143 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
1144 {
1145 *errorcodeptr = ERR57;
1146 break;
1147 }
1148
1149 if (s == 0)
1150 {
1151 *errorcodeptr = ERR58;
1152 break;
1153 }
1154
1155 if (negated)
1156 {
1157 if (s > bracount)
1158 {
1159 *errorcodeptr = ERR15;
1160 break;
1161 }
1162 s = bracount - (s - 1); /* Convert the relative number into an absolute group number */
1163 }
1164
1165 escape = -s;
1166 break;
1167
1168 /* The handling of escape sequences consisting of a string of digits
1169 starting with one that is not zero is not straightforward. Perl has changed
1170 over the years. Nowadays \g{} for backreferences and \o{} for octal are
1171 recommended to avoid the ambiguities in the old syntax.
1172
1173 Outside a character class, the digits are read as a decimal number. If the
1174 number is less than 8 (used to be 10), or if there are that many previous
1175 extracting left brackets, then it is a back reference. Otherwise, up to
1176 three octal digits are read to form an escaped byte. Thus \123 is likely to
1177 be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1178 the octal value is greater than 377, the least significant 8 bits are
1179 taken. \8 and \9 are treated as the literal characters 8 and 9.
1180
1181 Inside a character class, \ followed by a digit is always either a literal
1182 8 or 9 or an octal number. */
1183
1184 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1185 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1186
1187 if (!isclass)
1188 {
1189 oldptr = ptr;
1190 /* The integer range is limited by the machine's int representation. */
1191 s = (int)(c -CHAR_0);
1192 overflow = FALSE;
1193 while (IS_DIGIT(ptr[1]))
1194 {
1195 if (s > INT_MAX / 10 - 1) /* Integer overflow */
1196 {
1197 overflow = TRUE;
1198 break;
1199 }
1200 s = s * 10 + (int)(*(++ptr) - CHAR_0);
1201 }
1202 if (overflow) /* Integer overflow */
1203 {
1204 while (IS_DIGIT(ptr[1]))
1205 ptr++;
1206 *errorcodeptr = ERR61;
1207 break;
1208 }
1209 if (s < 8 || s <= bracount) /* Check for back reference */
1210 {
1211 escape = -s;
1212 break;
1213 }
1214 ptr = oldptr; /* Put the pointer back and fall through */
1215 }
1216
1217 /* Handle a digit following \ when the number is not a back reference. If
1218 the first digit is 8 or 9, Perl used to generate a binary zero byte and
1219 then treat the digit as a following literal. At least by Perl 5.18 this
1220 changed so as not to insert the binary zero. */
1221
1222 if ((c = *ptr) >= CHAR_8) break;
1223
1224 /* Fall through with a digit less than 8 */
1225
1226 /* \0 always starts an octal number, but we may drop through to here with a
1227 larger first octal digit. The original code used just to take the least
1228 significant 8 bits of octal numbers (I think this is what early Perls used
1229 to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1230 but no more than 3 octal digits. */
1231
1232 case CHAR_0:
1233 c -= CHAR_0;
1234 while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7) /* i is still 0 from the table lookup, so at most two more digits are read */
1235 c = c * 8 + *(++ptr) - CHAR_0;
1236 #ifdef COMPILE_PCRE8
1237 if (!utf && c > 0xff) *errorcodeptr = ERR51;
1238 #endif
1239 break;
1240
1241 /* \o is a relatively new Perl feature, supporting a more general way of
1242 specifying character codes in octal. The only supported form is \o{ddd}. */
1243
1244 case CHAR_o:
1245 if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
1246 {
1247 ptr += 2;
1248 c = 0;
1249 overflow = FALSE;
1250 while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
1251 {
1252 register pcre_uint32 cc = *ptr++;
1253 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1254 #ifdef COMPILE_PCRE32
1255 if (c >= 0x20000000l) { overflow = TRUE; break; }
1256 #endif
1257 c = (c << 3) + cc - CHAR_0 ;
1258 #if defined COMPILE_PCRE8
1259 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1260 #elif defined COMPILE_PCRE16
1261 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1262 #elif defined COMPILE_PCRE32
1263 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1264 #endif
1265 }
1266 if (overflow)
1267 {
1268 while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1269 *errorcodeptr = ERR34;
1270 }
1271 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1272 {
1273 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1274 }
1275 else *errorcodeptr = ERR80;
1276 }
1277 break;
1278
1279 /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
1280 numbers. Otherwise it is a lowercase x letter. */
1281
1282 case CHAR_x:
1283 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1284 {
1285 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1286 && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1287 {
1288 c = 0;
1289 for (i = 0; i < 2; ++i)
1290 {
1291 register pcre_uint32 cc = *(++ptr);
1292 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1293 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1294 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1295 #else /* EBCDIC coding */
1296 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1297 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1298 #endif
1299 }
1300 }
1301 } /* End JavaScript handling */
1302
1303 /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1304 greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1305 digits. If not, { used to be treated as a data character. However, Perl
1306 seems to read hex digits up to the first non-such, and ignore the rest, so
1307 that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1308 now gives an error. */
1309
1310 else
1311 {
1312 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1313 {
1314 ptr += 2;
1315 c = 0;
1316 overflow = FALSE;
1317 while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
1318 {
1319 register pcre_uint32 cc = *ptr++;
1320 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1321
1322 #ifdef COMPILE_PCRE32
1323 if (c >= 0x10000000l) { overflow = TRUE; break; }
1324 #endif
1325
1326 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1327 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1328 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1329 #else /* EBCDIC coding */
1330 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1331 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1332 #endif
1333
1334 #if defined COMPILE_PCRE8
1335 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1336 #elif defined COMPILE_PCRE16
1337 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1338 #elif defined COMPILE_PCRE32
1339 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1340 #endif
1341 }
1342
1343 if (overflow)
1344 {
1345 while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
1346 *errorcodeptr = ERR34;
1347 }
1348
1349 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1350 {
1351 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1352 }
1353
1354 /* If the sequence of hex digits does not end with '}', give an error.
1355 We used just to recognize this construct and fall through to the normal
1356 \x handling, but nowadays Perl gives an error, which seems much more
1357 sensible, so we do too. */
1358
1359 else *errorcodeptr = ERR79;
1360 } /* End of \x{} processing */
1361
1362 /* Read a single-byte hex-defined char (up to two hex digits after \x) */
1363
1364 else
1365 {
1366 c = 0;
1367 while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0) /* i is still 0 from the table lookup */
1368 {
1369 pcre_uint32 cc; /* Some compilers don't like */
1370 cc = *(++ptr); /* ++ in initializers */
1371 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1372 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1373 c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1374 #else /* EBCDIC coding */
1375 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
1376 c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1377 #endif
1378 }
1379 } /* End of \xdd handling */
1380 } /* End of Perl-style \x handling */
1381 break;
1382
1383 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1384 An error is given if the byte following \c is not an ASCII character. This
1385 coding is ASCII-specific, but then the whole concept of \cx is
1386 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1387
1388 case CHAR_c:
1389 c = *(++ptr);
1390 if (c == CHAR_NULL)
1391 {
1392 *errorcodeptr = ERR2;
1393 break;
1394 }
1395 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1396 if (c > 127) /* Excludes all non-ASCII in either mode */
1397 {
1398 *errorcodeptr = ERR68;
1399 break;
1400 }
1401 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1402 c ^= 0x40;
1403 #else /* EBCDIC coding */
1404 if (c >= CHAR_a && c <= CHAR_z) c += 64;
1405 c ^= 0xC0;
1406 #endif
1407 break;
1408
1409 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1410 other alphanumeric following \ is an error if PCRE_EXTRA was set;
1411 otherwise, for Perl compatibility, it is a literal. This code looks a bit
1412 odd, but there used to be some cases other than the default, and there may
1413 be again in future, so I haven't "optimized" it. */
1414
1415 default:
1416 if ((options & PCRE_EXTRA) != 0) switch(c)
1417 {
1418 default:
1419 *errorcodeptr = ERR3;
1420 break;
1421 }
1422 break;
1423 }
1424 }
1425
1426 /* Perl supports \N{name} for character names, as well as plain \N for "not
1427 newline". PCRE does not support \N{name}. However, it does support
1428 quantification such as \N{2,3}. */
1429
1430 if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1431 !is_counted_repeat(ptr+2))
1432 *errorcodeptr = ERR37;
1433
1434 /* If PCRE_UCP is set, we change the values for \d etc. */
1435
1436 if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
1437 escape += (ESC_DU - ESC_D);
1438
1439 /* Set the pointer to the final character before returning. */
1440
1441 *ptrptr = ptr;
1442 *chptr = c;
1443 return escape;
1444 }
1445
1446
1447
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Parse the property name that follows \P or \p and look it up in the table
of known Unicode property names. This is compiled only when Unicode property
support is enabled. The name is either a single character or a string in curly
brackets, which may start with ^ to request negation. On entry *ptrptr points
at the P or p; on exit it points at the last character that was examined.

Argument:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  ptypeptr       points to an unsigned int that is set to the type value
  pdataptr       points to an unsigned int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         TRUE if the type value was found, or FALSE for an invalid type
                   (ERR46 for a malformed sequence, ERR47 for an unknown name)
*/

static BOOL
get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
  unsigned int *pdataptr, int *errorcodeptr)
{
const pcre_uchar *ptr = *ptrptr;
pcre_uchar name[32];
pcre_uchar c;
int len, lo, hi;

c = *(++ptr);
if (c == CHAR_NULL) goto ERROR_RETURN;

*negptr = FALSE;

/* The short form: exactly one character names the property. */

if (c != CHAR_LEFT_CURLY_BRACKET)
  {
  name[0] = c;
  name[1] = 0;
  }

/* The long form: a name in {}, optionally preceded by ^ for negation. The
name must fit in the buffer with room left for the terminating zero. */

else
  {
  if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
    {
    *negptr = TRUE;
    ptr++;
    }

  len = 0;
  while (len < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1)
    {
    c = *(++ptr);
    if (c == CHAR_NULL) goto ERROR_RETURN;
    if (c == CHAR_RIGHT_CURLY_BRACKET) break;
    name[len++] = c;
    }

  /* Leaving the loop without seeing '}' means the name was too long. */

  if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
  name[len] = 0;
  }

*ptrptr = ptr;

/* Binary chop through the table of property names. */

lo = 0;
hi = PRIV(utt_size);

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[mid].name_offset);

  if (cmp > 0) lo = mid + 1;
  else if (cmp < 0) hi = mid;
  else
    {
    *ptypeptr = PRIV(utt)[mid].type;
    *pdataptr = PRIV(utt)[mid].value;
    return TRUE;
    }
  }

/* The name is well formed but not recognized. */

*errorcodeptr = ERR47;
*ptrptr = ptr;
return FALSE;

/* The sequence itself is malformed. */

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return FALSE;
}
#endif
1542
1543
1544
1545 /*************************************************
1546 * Read repeat counts *
1547 *************************************************/
1548
1549 /* Read an item of the form {n,m} and return the values. This is called only
1550 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1551 so the syntax is guaranteed to be correct, but we need to check the values.
1552
1553 Arguments:
1554 p pointer to first char after '{'
1555 minp pointer to int for min
1556 maxp pointer to int for max
1557 returned as -1 if no max
1558 errorcodeptr points to error code variable
1559
1560 Returns: pointer to '}' on success;
1561 current ptr on error, with errorcodeptr set non-zero
1562 */
1563
1564 static const pcre_uchar *
1565 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1566 {
1567 int min = 0;
1568 int max = -1;
1569
1570 /* Read the minimum value and do a paranoid check: a negative value indicates
1571 an integer overflow. */
1572
1573 while (IS_DIGIT(*p)) min = min * 10 + (int)(*p++ - CHAR_0);
1574 if (min < 0 || min > 65535)
1575 {
1576 *errorcodeptr = ERR5;
1577 return p;
1578 }
1579
1580 /* Read the maximum value if there is one, and again do a paranoid on its size.
1581 Also, max must not be less than min. */
1582
1583 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1584 {
1585 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1586 {
1587 max = 0;
1588 while(IS_DIGIT(*p)) max = max * 10 + (int)(*p++ - CHAR_0);
1589 if (max < 0 || max > 65535)
1590 {
1591 *errorcodeptr = ERR5;
1592 return p;
1593 }
1594 if (max < min)
1595 {
1596 *errorcodeptr = ERR4;
1597 return p;
1598 }
1599 }
1600 }
1601
1602 /* Fill in the required variables, and pass back the pointer to the terminating
1603 '}'. */
1604
1605 *minp = min;
1606 *maxp = max;
1607 return p;
1608 }
1609
1610
1611
1612 /*************************************************
1613 * Find first significant op code *
1614 *************************************************/
1615
1616 /* This is called by several functions that scan a compiled expression looking
1617 for a fixed first character, or an anchoring op code etc. It skips over things
1618 that do not influence this. For some calls, it makes sense to skip negative
1619 forward and all backward assertions, and also the \b assertion; for others it
1620 does not.
1621
1622 Arguments:
1623 code pointer to the start of the group
1624 skipassert TRUE if certain assertions are to be skipped
1625
1626 Returns: pointer to the first significant opcode
1627 */
1628
1629 static const pcre_uchar*
1630 first_significant_code(const pcre_uchar *code, BOOL skipassert)
1631 {
1632 for (;;)
1633 {
1634 switch ((int)*code)
1635 {
1636 case OP_ASSERT_NOT:
1637 case OP_ASSERTBACK:
1638 case OP_ASSERTBACK_NOT:
1639 if (!skipassert) return code;
1640 do code += GET(code, 1); while (*code == OP_ALT);
1641 code += PRIV(OP_lengths)[*code];
1642 break;
1643
1644 case OP_WORD_BOUNDARY:
1645 case OP_NOT_WORD_BOUNDARY:
1646 if (!skipassert) return code;
1647 /* Fall through */
1648
1649 case OP_CALLOUT:
1650 case OP_CREF:
1651 case OP_DNCREF:
1652 case OP_RREF:
1653 case OP_DNRREF:
1654 case OP_DEF:
1655 code += PRIV(OP_lengths)[*code];
1656 break;
1657
1658 default:
1659 return code;
1660 }
1661 }
1662 /* Control never reaches here */
1663 }
1664
1665
1666
1667 /*************************************************
1668 * Find the fixed length of a branch *
1669 *************************************************/
1670
1671 /* Scan a branch and compute the fixed length of subject that will match it,
1672 if the length is fixed. This is needed for dealing with backward assertions.
1673 In UTF8 mode, the result is in characters rather than bytes. The branch is
1674 temporarily terminated with OP_END when this function is called.
1675
1676 This function is called when a backward assertion is encountered, so that if it
1677 fails, the error message can point to the correct place in the pattern.
1678 However, we cannot do this when the assertion contains subroutine calls,
1679 because they can be forward references. We solve this by remembering this case
1680 and doing the check at the end; a flag specifies which mode we are running in.
1681
1682 Arguments:
1683 code points to the start of the pattern (the bracket)
1684 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
1685 atend TRUE if called when the pattern is complete
1686 cd the "compile data" structure
1687
1688 Returns: the fixed length,
1689 or -1 if there is no fixed length,
1690 or -2 if \C was encountered (in UTF-8 mode only)
1691 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1692 or -4 if an unknown opcode was encountered (internal error)
1693 */
1694
1695 static int
1696 find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
1697 {
1698 int length = -1;
1699
1700 register int branchlength = 0;
1701 register pcre_uchar *cc = code + 1 + LINK_SIZE;
1702
1703 /* Scan along the opcodes for this branch. If we get to the end of the
1704 branch, check the length against that of the other branches. */
1705
1706 for (;;)
1707 {
1708 int d;
1709 pcre_uchar *ce, *cs;
1710 register pcre_uchar op = *cc;
1711
1712 switch (op)
1713 {
1714 /* We only need to continue for OP_CBRA (normal capturing bracket) and
1715 OP_BRA (normal non-capturing bracket) because the other variants of these
1716 opcodes are all concerned with unlimited repeated groups, which of course
1717 are not of fixed length. */
1718
1719 case OP_CBRA:
1720 case OP_BRA:
1721 case OP_ONCE:
1722 case OP_ONCE_NC:
1723 case OP_COND:
1724 d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
1725 if (d < 0) return d;
1726 branchlength += d;
1727 do cc += GET(cc, 1); while (*cc == OP_ALT);
1728 cc += 1 + LINK_SIZE;
1729 break;
1730
1731 /* Reached end of a branch; if it's a ket it is the end of a nested call.
1732 If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1733 an ALT. If it is END it's the end of the outer call. All can be handled by
1734 the same code. Note that we must not include the OP_KETRxxx opcodes here,
1735 because they all imply an unlimited repeat. */
1736
1737 case OP_ALT:
1738 case OP_KET:
1739 case OP_END:
1740 case OP_ACCEPT:
1741 case OP_ASSERT_ACCEPT:
1742 if (length < 0) length = branchlength;
1743 else if (length != branchlength) return -1;
1744 if (*cc != OP_ALT) return length;
1745 cc += 1 + LINK_SIZE;
1746 branchlength = 0;
1747 break;
1748
1749 /* A true recursion implies not fixed length, but a subroutine call may
1750 be OK. If the subroutine is a forward reference, we can't deal with
1751 it until the end of the pattern, so return -3. */
1752
1753 case OP_RECURSE:
1754 if (!atend) return -3;
1755 cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1756 do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1757 if (cc > cs && cc < ce) return -1; /* Recursion */
1758 d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
1759 if (d < 0) return d;
1760 branchlength += d;
1761 cc += 1 + LINK_SIZE;
1762 break;
1763
1764 /* Skip over assertive subpatterns */
1765
1766 case OP_ASSERT:
1767 case OP_ASSERT_NOT:
1768 case OP_ASSERTBACK:
1769 case OP_ASSERTBACK_NOT:
1770 do cc += GET(cc, 1); while (*cc == OP_ALT);
1771 cc += PRIV(OP_lengths)[*cc];
1772 break;
1773
1774 /* Skip over things that don't match chars */
1775
1776 case OP_MARK:
1777 case OP_PRUNE_ARG:
1778 case OP_SKIP_ARG:
1779 case OP_THEN_ARG:
1780 cc += cc[1] + PRIV(OP_lengths)[*cc];
1781 break;
1782
1783 case OP_CALLOUT:
1784 case OP_CIRC:
1785 case OP_CIRCM:
1786 case OP_CLOSE:
1787 case OP_COMMIT:
1788 case OP_CREF:
1789 case OP_DEF:
1790 case OP_DNCREF:
1791 case OP_DNRREF:
1792 case OP_DOLL:
1793 case OP_DOLLM:
1794 case OP_EOD:
1795 case OP_EODN:
1796 case OP_FAIL:
1797 case OP_NOT_WORD_BOUNDARY:
1798 case OP_PRUNE:
1799 case OP_REVERSE:
1800 case OP_RREF:
1801 case OP_SET_SOM:
1802 case OP_SKIP:
1803 case OP_SOD:
1804 case OP_SOM:
1805 case OP_THEN:
1806 case OP_WORD_BOUNDARY:
1807 cc += PRIV(OP_lengths)[*cc];
1808 break;
1809
1810 /* Handle literal characters */
1811
1812 case OP_CHAR:
1813 case OP_CHARI:
1814 case OP_NOT:
1815 case OP_NOTI:
1816 branchlength++;
1817 cc += 2;
1818 #ifdef SUPPORT_UTF
1819 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1820 #endif
1821 break;
1822
1823 /* Handle exact repetitions. The count is already in characters, but we
1824 need to skip over a multibyte character in UTF8 mode. */
1825
1826 case OP_EXACT:
1827 case OP_EXACTI:
1828 case OP_NOTEXACT:
1829 case OP_NOTEXACTI:
1830 branchlength += (int)GET2(cc,1);
1831 cc += 2 + IMM2_SIZE;
1832 #ifdef SUPPORT_UTF
1833 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1834 #endif
1835 break;
1836
1837 case OP_TYPEEXACT:
1838 branchlength += GET2(cc,1);
1839 if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1840 cc += 2;
1841 cc += 1 + IMM2_SIZE + 1;
1842 break;
1843
1844 /* Handle single-char matchers */
1845
1846 case OP_PROP:
1847 case OP_NOTPROP:
1848 cc += 2;
1849 /* Fall through */
1850
1851 case OP_HSPACE:
1852 case OP_VSPACE:
1853 case OP_NOT_HSPACE:
1854 case OP_NOT_VSPACE:
1855 case OP_NOT_DIGIT:
1856 case OP_DIGIT:
1857 case OP_NOT_WHITESPACE:
1858 case OP_WHITESPACE:
1859 case OP_NOT_WORDCHAR:
1860 case OP_WORDCHAR:
1861 case OP_ANY:
1862 case OP_ALLANY:
1863 branchlength++;
1864 cc++;
1865 break;
1866
1867 /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1868 otherwise \C is coded as OP_ALLANY. */
1869
1870 case OP_ANYBYTE:
1871 return -2;
1872
1873 /* Check a class for variable quantification */
1874
1875 case OP_CLASS:
1876 case OP_NCLASS:
1877 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1878 case OP_XCLASS:
1879 /* The original code caused an unsigned overflow in 64 bit systems,
1880 so now we use a conditional statement. */
1881 if (op == OP_XCLASS)
1882 cc += GET(cc, 1);
1883 else
1884 cc += PRIV(OP_lengths)[OP_CLASS];
1885 #else
1886 cc += PRIV(OP_lengths)[OP_CLASS];
1887 #endif
1888
1889 switch (*cc)
1890 {
1891 case OP_CRSTAR:
1892 case OP_CRMINSTAR:
1893 case OP_CRPLUS:
1894 case OP_CRMINPLUS:
1895 case OP_CRQUERY:
1896 case OP_CRMINQUERY:
1897 case OP_CRPOSSTAR:
1898 case OP_CRPOSPLUS:
1899 case OP_CRPOSQUERY:
1900 return -1;
1901
1902 case OP_CRRANGE:
1903 case OP_CRMINRANGE:
1904 case OP_CRPOSRANGE:
1905 if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1906 branchlength += (int)GET2(cc,1);
1907 cc += 1 + 2 * IMM2_SIZE;
1908 break;
1909
1910 default:
1911 branchlength++;
1912 }
1913 break;
1914
1915 /* Anything else is variable length */
1916
1917 case OP_ANYNL:
1918 case OP_BRAMINZERO:
1919 case OP_BRAPOS:
1920 case OP_BRAPOSZERO:
1921 case OP_BRAZERO:
1922 case OP_CBRAPOS:
1923 case OP_EXTUNI:
1924 case OP_KETRMAX:
1925 case OP_KETRMIN:
1926 case OP_KETRPOS:
1927 case OP_MINPLUS:
1928 case OP_MINPLUSI:
1929 case OP_MINQUERY:
1930 case OP_MINQUERYI:
1931 case OP_MINSTAR:
1932 case OP_MINSTARI:
1933 case OP_MINUPTO:
1934 case OP_MINUPTOI:
1935 case OP_NOTMINPLUS:
1936 case OP_NOTMINPLUSI:
1937 case OP_NOTMINQUERY:
1938 case OP_NOTMINQUERYI:
1939 case OP_NOTMINSTAR:
1940 case OP_NOTMINSTARI:
1941 case OP_NOTMINUPTO:
1942 case OP_NOTMINUPTOI:
1943 case OP_NOTPLUS:
1944 case OP_NOTPLUSI:
1945 case OP_NOTPOSPLUS:
1946 case OP_NOTPOSPLUSI:
1947 case OP_NOTPOSQUERY:
1948 case OP_NOTPOSQUERYI:
1949 case OP_NOTPOSSTAR:
1950 case OP_NOTPOSSTARI:
1951 case OP_NOTPOSUPTO:
1952 case OP_NOTPOSUPTOI:
1953 case OP_NOTQUERY:
1954 case OP_NOTQUERYI:
1955 case OP_NOTSTAR:
1956 case OP_NOTSTARI:
1957 case OP_NOTUPTO:
1958 case OP_NOTUPTOI:
1959 case OP_PLUS:
1960 case OP_PLUSI:
1961 case OP_POSPLUS:
1962 case OP_POSPLUSI:
1963 case OP_POSQUERY:
1964 case OP_POSQUERYI:
1965 case OP_POSSTAR:
1966 case OP_POSSTARI:
1967 case OP_POSUPTO:
1968 case OP_POSUPTOI:
1969 case OP_QUERY:
1970 case OP_QUERYI:
1971 case OP_REF:
1972 case OP_REFI:
1973 case OP_DNREF:
1974 case OP_DNREFI:
1975 case OP_SBRA:
1976 case OP_SBRAPOS:
1977 case OP_SCBRA:
1978 case OP_SCBRAPOS:
1979 case OP_SCOND:
1980 case OP_SKIPZERO:
1981 case OP_STAR:
1982 case OP_STARI:
1983 case OP_TYPEMINPLUS:
1984 case OP_TYPEMINQUERY:
1985 case OP_TYPEMINSTAR:
1986 case OP_TYPEMINUPTO:
1987 case OP_TYPEPLUS:
1988 case OP_TYPEPOSPLUS:
1989 case OP_TYPEPOSQUERY:
1990 case OP_TYPEPOSSTAR:
1991 case OP_TYPEPOSUPTO:
1992 case OP_TYPEQUERY:
1993 case OP_TYPESTAR:
1994 case OP_TYPEUPTO:
1995 case OP_UPTO:
1996 case OP_UPTOI:
1997 return -1;
1998
1999 /* Catch unrecognized opcodes so that when new ones are added they
2000 are not forgotten, as has happened in the past. */
2001
2002 default:
2003 return -4;
2004 }
2005 }
2006 /* Control never gets here */
2007 }
2008
2009
2010
2011 /*************************************************
2012 * Scan compiled regex for specific bracket *
2013 *************************************************/
2014
2015 /* This little function scans through a compiled pattern until it finds a
2016 capturing bracket with the given number, or, if the number is negative, an
2017 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
2018 so that it can be called from pcre_study() when finding the minimum matching
2019 length.
2020
2021 Arguments:
2022 code points to start of expression
2023 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2024 number the required bracket number or negative to find a lookbehind
2025
2026 Returns: pointer to the opcode for the bracket, or NULL if not found
2027 */
2028
2029 const pcre_uchar *
2030 PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2031 {
2032 for (;;)
2033 {
2034 register pcre_uchar c = *code;
2035
2036 if (c == OP_END) return NULL;
2037
2038 /* XCLASS is used for classes that cannot be represented just by a bit
2039 map. This includes negated single high-valued characters. The length in
2040 the table is zero; the actual length is stored in the compiled code. */
2041
2042 if (c == OP_XCLASS) code += GET(code, 1);
2043
2044 /* Handle recursion */
2045
2046 else if (c == OP_REVERSE)
2047 {
2048 if (number < 0) return (pcre_uchar *)code;
2049 code += PRIV(OP_lengths)[c];
2050 }
2051
2052 /* Handle capturing bracket */
2053
2054 else if (c == OP_CBRA || c == OP_SCBRA ||
2055 c == OP_CBRAPOS || c == OP_SCBRAPOS)
2056 {
2057 int n = (int)GET2(code, 1+LINK_SIZE);
2058 if (n == number) return (pcre_uchar *)code;
2059 code += PRIV(OP_lengths)[c];
2060 }
2061
2062 /* Otherwise, we can get the item's length from the table, except that for
2063 repeated character types, we have to test for \p and \P, which have an extra
2064 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2065 must add in its length. */
2066
2067 else
2068 {
2069 switch(c)
2070 {
2071 case OP_TYPESTAR:
2072 case OP_TYPEMINSTAR:
2073 case OP_TYPEPLUS:
2074 case OP_TYPEMINPLUS:
2075 case OP_TYPEQUERY:
2076 case OP_TYPEMINQUERY:
2077 case OP_TYPEPOSSTAR:
2078 case OP_TYPEPOSPLUS:
2079 case OP_TYPEPOSQUERY:
2080 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2081 break;
2082
2083 case OP_TYPEUPTO:
2084 case OP_TYPEMINUPTO:
2085 case OP_TYPEEXACT:
2086 case OP_TYPEPOSUPTO:
2087 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2088 code += 2;
2089 break;
2090
2091 case OP_MARK:
2092 case OP_PRUNE_ARG:
2093 case OP_SKIP_ARG:
2094 case OP_THEN_ARG:
2095 code += code[1];
2096 break;
2097 }
2098
2099 /* Add in the fixed length from the table */
2100
2101 code += PRIV(OP_lengths)[c];
2102
2103 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2104 a multi-byte character. The length in the table is a minimum, so we have to
2105 arrange to skip the extra bytes. */
2106
2107 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2108 if (utf) switch(c)
2109 {
2110 case OP_CHAR:
2111 case OP_CHARI:
2112 case OP_EXACT:
2113 case OP_EXACTI:
2114 case OP_UPTO:
2115 case OP_UPTOI:
2116 case OP_MINUPTO:
2117 case OP_MINUPTOI:
2118 case OP_POSUPTO:
2119 case OP_POSUPTOI:
2120 case OP_STAR:
2121 case OP_STARI:
2122 case OP_MINSTAR:
2123 case OP_MINSTARI:
2124 case OP_POSSTAR:
2125 case OP_POSSTARI:
2126 case OP_PLUS:
2127 case OP_PLUSI:
2128 case OP_MINPLUS:
2129 case OP_MINPLUSI:
2130 case OP_POSPLUS:
2131 case OP_POSPLUSI:
2132 case OP_QUERY:
2133 case OP_QUERYI:
2134 case OP_MINQUERY:
2135 case OP_MINQUERYI:
2136 case OP_POSQUERY:
2137 case OP_POSQUERYI:
2138 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2139 break;
2140 }
2141 #else
2142 (void)(utf); /* Keep compiler happy by referencing function argument */
2143 #endif
2144 }
2145 }
2146 }
2147
2148
2149
2150 /*************************************************
2151 * Scan compiled regex for recursion reference *
2152 *************************************************/
2153
2154 /* This little function scans through a compiled pattern until it finds an
2155 instance of OP_RECURSE.
2156
2157 Arguments:
2158 code points to start of expression
2159 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2160
2161 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
2162 */
2163
2164 static const pcre_uchar *
2165 find_recurse(const pcre_uchar *code, BOOL utf)
2166 {
2167 for (;;)
2168 {
2169 register pcre_uchar c = *code;
2170 if (c == OP_END) return NULL;
2171 if (c == OP_RECURSE) return code;
2172
2173 /* XCLASS is used for classes that cannot be represented just by a bit
2174 map. This includes negated single high-valued characters. The length in
2175 the table is zero; the actual length is stored in the compiled code. */
2176
2177 if (c == OP_XCLASS) code += GET(code, 1);
2178
2179 /* Otherwise, we can get the item's length from the table, except that for
2180 repeated character types, we have to test for \p and \P, which have an extra
2181 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2182 must add in its length. */
2183
2184 else
2185 {
2186 switch(c)
2187 {
2188 case OP_TYPESTAR:
2189 case OP_TYPEMINSTAR:
2190 case OP_TYPEPLUS:
2191 case OP_TYPEMINPLUS:
2192 case OP_TYPEQUERY:
2193 case OP_TYPEMINQUERY:
2194 case OP_TYPEPOSSTAR:
2195 case OP_TYPEPOSPLUS:
2196 case OP_TYPEPOSQUERY:
2197 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2198 break;
2199
2200 case OP_TYPEPOSUPTO:
2201 case OP_TYPEUPTO:
2202 case OP_TYPEMINUPTO:
2203 case OP_TYPEEXACT:
2204 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2205 code += 2;
2206 break;
2207
2208 case OP_MARK:
2209 case OP_PRUNE_ARG:
2210 case OP_SKIP_ARG:
2211 case OP_THEN_ARG:
2212 code += code[1];
2213 break;
2214 }
2215
2216 /* Add in the fixed length from the table */
2217
2218 code += PRIV(OP_lengths)[c];
2219
2220 /* In UTF-8 mode, opcodes that are followed by a character may be followed
2221 by a multi-byte character. The length in the table is a minimum, so we have
2222 to arrange to skip the extra bytes. */
2223
2224 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2225 if (utf) switch(c)
2226 {
2227 case OP_CHAR:
2228 case OP_CHARI:
2229 case OP_NOT:
2230 case OP_NOTI:
2231 case OP_EXACT:
2232 case OP_EXACTI:
2233 case OP_NOTEXACT:
2234 case OP_NOTEXACTI:
2235 case OP_UPTO:
2236 case OP_UPTOI:
2237 case OP_NOTUPTO:
2238 case OP_NOTUPTOI:
2239 case OP_MINUPTO:
2240 case OP_MINUPTOI:
2241 case OP_NOTMINUPTO:
2242 case OP_NOTMINUPTOI:
2243 case OP_POSUPTO:
2244 case OP_POSUPTOI:
2245 case OP_NOTPOSUPTO:
2246 case OP_NOTPOSUPTOI:
2247 case OP_STAR:
2248 case OP_STARI:
2249 case OP_NOTSTAR:
2250 case OP_NOTSTARI:
2251 case OP_MINSTAR:
2252 case OP_MINSTARI:
2253 case OP_NOTMINSTAR:
2254 case OP_NOTMINSTARI:
2255 case OP_POSSTAR:
2256 case OP_POSSTARI:
2257 case OP_NOTPOSSTAR:
2258 case OP_NOTPOSSTARI:
2259 case OP_PLUS:
2260 case OP_PLUSI:
2261 case OP_NOTPLUS:
2262 case OP_NOTPLUSI:
2263 case OP_MINPLUS:
2264 case OP_MINPLUSI:
2265 case OP_NOTMINPLUS:
2266 case OP_NOTMINPLUSI:
2267 case OP_POSPLUS:
2268 case OP_POSPLUSI:
2269 case OP_NOTPOSPLUS:
2270 case OP_NOTPOSPLUSI:
2271 case OP_QUERY:
2272 case OP_QUERYI:
2273 case OP_NOTQUERY:
2274 case OP_NOTQUERYI:
2275 case OP_MINQUERY:
2276 case OP_MINQUERYI:
2277 case OP_NOTMINQUERY:
2278 case OP_NOTMINQUERYI:
2279 case OP_POSQUERY:
2280 case OP_POSQUERYI:
2281 case OP_NOTPOSQUERY:
2282 case OP_NOTPOSQUERYI:
2283 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2284 break;
2285 }
2286 #else
2287 (void)(utf); /* Keep compiler happy by referencing function argument */
2288 #endif
2289 }
2290 }
2291 }
2292
2293
2294
2295 /*************************************************
2296 * Scan compiled branch for non-emptiness *
2297 *************************************************/
2298
2299 /* This function scans through a branch of a compiled pattern to see whether it
2300 can match the empty string or not. It is called from could_be_empty()
2301 below and from compile_branch() when checking for an unlimited repeat of a
2302 group that can match nothing. Note that first_significant_code() skips over
2303 backward and negative forward assertions when its final argument is TRUE. If we
2304 hit an unclosed bracket, we return "empty" - this means we've struck an inner
2305 bracket whose current branch will already have been scanned.
2306
2307 Arguments:
2308 code points to start of search
2309 endcode points to where to stop
2310 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2311 cd contains pointers to tables etc.
2312 recurses chain of recurse_check to catch mutual recursion
2313
2314 Returns: TRUE if what is matched could be empty
2315 */
2316
2317 typedef struct recurse_check {
2318 struct recurse_check *prev;
2319 const pcre_uchar *group;
2320 } recurse_check;
2321
2322 static BOOL
2323 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2324 BOOL utf, compile_data *cd, recurse_check *recurses)
2325 {
2326 register pcre_uchar c;
2327 recurse_check this_recurse;
2328
2329 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2330 code < endcode;
2331 code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2332 {
2333 const pcre_uchar *ccode;
2334
2335 c = *code;
2336
2337 /* Skip over forward assertions; the other assertions are skipped by
2338 first_significant_code() with a TRUE final argument. */
2339
2340 if (c == OP_ASSERT)
2341 {
2342 do code += GET(code, 1); while (*code == OP_ALT);
2343 c = *code;
2344 continue;
2345 }
2346
2347 /* For a recursion/subroutine call, if its end has been reached, which
2348 implies a backward reference subroutine call, we can scan it. If it's a
2349 forward reference subroutine call, we can't. To detect forward reference
2350 we have to scan up the list that is kept in the workspace. This function is
2351 called only when doing the real compile, not during the pre-compile that
2352 measures the size of the compiled pattern. */
2353
2354 if (c == OP_RECURSE)
2355 {
2356 const pcre_uchar *scode = cd->start_code + GET(code, 1);
2357 BOOL empty_branch;
2358
2359 /* Test for forward reference or uncompleted reference. This is disabled
2360 when called to scan a completed pattern by setting cd->start_workspace to
2361 NULL. */
2362
2363 if (cd->start_workspace != NULL)
2364 {
2365 const pcre_uchar *tcode;
2366 for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2367 if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2368 if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
2369 }
2370
2371 /* If we are scanning a completed pattern, there are no forward references
2372 and all groups are complete. We need to detect whether this is a recursive
2373 call, as otherwise there will be an infinite loop. If it is a recursion,
2374 just skip over it. Simple recursions are easily detected. For mutual
2375 recursions we keep a chain on the stack. */
2376
2377 else
2378 {
2379 recurse_check *r = recurses;
2380 const pcre_uchar *endgroup = scode;
2381
2382 do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2383 if (code >= scode && code <= endgroup) continue; /* Simple recursion */
2384
2385 for (r = recurses; r != NULL; r = r->prev)
2386 if (r->group == scode) break;
2387 if (r != NULL) continue; /* Mutual recursion */
2388 }
2389
2390 /* Completed reference; scan the referenced group, remembering it on the
2391 stack chain to detect mutual recursions. */
2392
2393 empty_branch = FALSE;
2394 this_recurse.prev = recurses;
2395 this_recurse.group = scode;
2396
2397 do
2398 {
2399 if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
2400 {
2401 empty_branch = TRUE;
2402 break;
2403 }
2404 scode += GET(scode, 1);
2405 }
2406 while (*scode == OP_ALT);
2407
2408 if (!empty_branch) return FALSE; /* All branches are non-empty */
2409 continue;
2410 }
2411
2412 /* Groups with zero repeats can of course be empty; skip them. */
2413
2414 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2415 c == OP_BRAPOSZERO)
2416 {
2417 code += PRIV(OP_lengths)[c];
2418 do code += GET(code, 1); while (*code == OP_ALT);
2419 c = *code;
2420 continue;
2421 }
2422
2423 /* A nested group that is already marked as "could be empty" can just be
2424 skipped. */
2425
2426 if (c == OP_SBRA || c == OP_SBRAPOS ||
2427 c == OP_SCBRA || c == OP_SCBRAPOS)
2428 {
2429 do code += GET(code, 1); while (*code == OP_ALT);
2430 c = *code;
2431 continue;
2432 }
2433
2434 /* For other groups, scan the branches. */
2435
2436 if (c == OP_BRA || c == OP_BRAPOS ||
2437 c == OP_CBRA || c == OP_CBRAPOS ||
2438 c == OP_ONCE || c == OP_ONCE_NC ||
2439 c == OP_COND)
2440 {
2441 BOOL empty_branch;
2442 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
2443
2444 /* If a conditional group has only one branch, there is a second, implied,
2445 empty branch, so just skip over the conditional, because it could be empty.
2446 Otherwise, scan the individual branches of the group. */
2447
2448 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2449 code += GET(code, 1);
2450 else
2451 {
2452 empty_branch = FALSE;
2453 do
2454 {
2455 if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd, NULL))
2456 empty_branch = TRUE;
2457 code += GET(code, 1);
2458 }
2459 while (*code == OP_ALT);
2460 if (!empty_branch) return FALSE; /* All branches are non-empty */
2461 }
2462
2463 c = *code;
2464 continue;
2465 }
2466
2467 /* Handle the other opcodes */
2468
2469 switch (c)
2470 {
2471 /* Check for quantifiers after a class. XCLASS is used for classes that
2472 cannot be represented just by a bit map. This includes negated single
2473 high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2474 actual length is stored in the compiled code, so we must update "code"
2475 here. */
2476
2477 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2478 case OP_XCLASS:
2479 ccode = code += GET(code, 1);
2480 goto CHECK_CLASS_REPEAT;
2481 #endif
2482
2483 case OP_CLASS:
2484 case OP_NCLASS:
2485 ccode = code + PRIV(OP_lengths)[OP_CLASS];
2486
2487 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2488 CHECK_CLASS_REPEAT:
2489 #endif
2490
2491 switch (*ccode)
2492 {
2493 case OP_CRSTAR: /* These could be empty; continue */
2494 case OP_CRMINSTAR:
2495 case OP_CRQUERY:
2496 case OP_CRMINQUERY:
2497 case OP_CRPOSSTAR:
2498 case OP_CRPOSQUERY:
2499 break;
2500
2501 default: /* Non-repeat => class must match */
2502 case OP_CRPLUS: /* These repeats aren't empty */
2503 case OP_CRMINPLUS:
2504 case OP_CRPOSPLUS:
2505 return FALSE;
2506
2507 case OP_CRRANGE:
2508 case OP_CRMINRANGE:
2509 case OP_CRPOSRANGE:
2510 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2511 break;
2512 }
2513 break;
2514
2515 /* Opcodes that must match a character */
2516
2517 case OP_ANY:
2518 case OP_ALLANY:
2519 case OP_ANYBYTE:
2520
2521 case OP_PROP:
2522 case OP_NOTPROP:
2523 case OP_ANYNL:
2524
2525 case OP_NOT_HSPACE:
2526 case OP_HSPACE:
2527 case OP_NOT_VSPACE:
2528 case OP_VSPACE:
2529 case OP_EXTUNI:
2530
2531 case OP_NOT_DIGIT:
2532 case OP_DIGIT:
2533 case OP_NOT_WHITESPACE:
2534 case OP_WHITESPACE:
2535 case OP_NOT_WORDCHAR:
2536 case OP_WORDCHAR:
2537
2538 case OP_CHAR:
2539 case OP_CHARI:
2540 case OP_NOT:
2541 case OP_NOTI:
2542
2543 case OP_PLUS:
2544 case OP_PLUSI:
2545 case OP_MINPLUS:
2546 case OP_MINPLUSI:
2547
2548 case OP_NOTPLUS:
2549 case OP_NOTPLUSI:
2550 case OP_NOTMINPLUS:
2551 case OP_NOTMINPLUSI:
2552
2553 case OP_POSPLUS:
2554 case OP_POSPLUSI:
2555 case OP_NOTPOSPLUS:
2556 case OP_NOTPOSPLUSI:
2557
2558 case OP_EXACT:
2559 case OP_EXACTI:
2560 case OP_NOTEXACT:
2561 case OP_NOTEXACTI:
2562
2563 case OP_TYPEPLUS:
2564 case OP_TYPEMINPLUS:
2565 case OP_TYPEPOSPLUS:
2566 case OP_TYPEEXACT:
2567
2568 return FALSE;
2569
2570 /* These are going to continue, as they may be empty, but we have to
2571 fudge the length for the \p and \P cases. */
2572
2573 case OP_TYPESTAR:
2574 case OP_TYPEMINSTAR:
2575 case OP_TYPEPOSSTAR:
2576 case OP_TYPEQUERY:
2577 case OP_TYPEMINQUERY:
2578 case OP_TYPEPOSQUERY:
2579 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2580 break;
2581
2582 /* Same for these */
2583
2584 case OP_TYPEUPTO:
2585 case OP_TYPEMINUPTO:
2586 case OP_TYPEPOSUPTO:
2587 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2588 code += 2;
2589 break;
2590
2591 /* End of branch */
2592
2593 case OP_KET:
2594 case OP_KETRMAX:
2595 case OP_KETRMIN:
2596 case OP_KETRPOS:
2597 case OP_ALT:
2598 return TRUE;
2599
2600 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2601 MINUPTO, and POSUPTO and their caseless and negative versions may be
2602 followed by a multibyte character. */
2603
2604 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2605 case OP_STAR:
2606 case OP_STARI:
2607 case OP_NOTSTAR:
2608 case OP_NOTSTARI:
2609
2610 case OP_MINSTAR:
2611 case OP_MINSTARI:
2612 case OP_NOTMINSTAR:
2613 case OP_NOTMINSTARI:
2614
2615 case OP_POSSTAR:
2616 case OP_POSSTARI:
2617 case OP_NOTPOSSTAR:
2618 case OP_NOTPOSSTARI:
2619
2620 case OP_QUERY:
2621 case OP_QUERYI:
2622 case OP_NOTQUERY:
2623 case OP_NOTQUERYI:
2624
2625 case OP_MINQUERY:
2626 case OP_MINQUERYI:
2627 case OP_NOTMINQUERY:
2628 case OP_NOTMINQUERYI:
2629
2630 case OP_POSQUERY:
2631 case OP_POSQUERYI:
2632 case OP_NOTPOSQUERY:
2633 case OP_NOTPOSQUERYI:
2634
2635 if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2636 break;
2637
2638 case OP_UPTO:
2639 case OP_UPTOI:
2640 case OP_NOTUPTO:
2641 case OP_NOTUPTOI:
2642
2643 case OP_MINUPTO:
2644 case OP_MINUPTOI:
2645 case OP_NOTMINUPTO:
2646 case OP_NOTMINUPTOI:
2647
2648 case OP_POSUPTO:
2649 case OP_POSUPTOI:
2650 case OP_NOTPOSUPTO:
2651 case OP_NOTPOSUPTOI:
2652
2653 if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2654 break;
2655 #endif
2656
2657 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2658 string. */
2659
2660 case OP_MARK:
2661 case OP_PRUNE_ARG:
2662 case OP_SKIP_ARG:
2663 case OP_THEN_ARG:
2664 code += code[1];
2665 break;
2666
2667 /* None of the remaining opcodes are required to match a character. */
2668
2669 default:
2670 break;
2671 }
2672 }
2673
2674 return TRUE;
2675 }
2676
2677
2678
2679 /*************************************************
2680 * Scan compiled regex for non-emptiness *
2681 *************************************************/
2682
2683 /* This function is called to check for left recursive calls. We want to check
2684 the current branch of the current pattern to see if it could match the empty
2685 string. If it could, we must look outwards for branches at other levels,
2686 stopping when we pass beyond the bracket which is the subject of the recursion.
2687 This function is called only during the real compile, not during the
2688 pre-compile.
2689
2690 Arguments:
2691 code points to start of the recursion
2692 endcode points to where to stop (current RECURSE item)
2693 bcptr points to the chain of current (unclosed) branch starts
2694 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2695 cd pointers to tables etc
2696
2697 Returns: TRUE if what is matched could be empty
2698 */
2699
2700 static BOOL
2701 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2702 branch_chain *bcptr, BOOL utf, compile_data *cd)
2703 {
2704 while (bcptr != NULL && bcptr->current_branch >= code)
2705 {
2706 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL))
2707 return FALSE;
2708 bcptr = bcptr->outer;
2709 }
2710 return TRUE;
2711 }
2712
2713
2714
2715 /*************************************************
2716 * Base opcode of repeated opcodes *
2717 *************************************************/
2718
2719 /* Returns the base opcode for repeated single character type opcodes. If the
2720 opcode is not a repeated character type, it returns with the original value.
2721
2722 Arguments: c opcode
2723 Returns: base opcode for the type
2724 */
2725
2726 static pcre_uchar
2727 get_repeat_base(pcre_uchar c)
2728 {
2729 return (c > OP_TYPEPOSUPTO)? c :
2730 (c >= OP_TYPESTAR)? OP_TYPESTAR :
2731 (c >= OP_NOTSTARI)? OP_NOTSTARI :
2732 (c >= OP_NOTSTAR)? OP_NOTSTAR :
2733 (c >= OP_STARI)? OP_STARI :
2734 OP_STAR;
2735 }
2736
2737
2738
#ifdef SUPPORT_UCP
/*************************************************
*        Check a character and a property        *
*************************************************/

/* Called by check_auto_possessive() when a property item is adjacent to a
fixed character. For most property types the function decides whether the
character has the property and returns TRUE when that outcome equals the
"negated" flag. A property type that is not handled here gives FALSE.

Arguments:
  c            the character
  ptype        the property type
  pdata        the data for the type
  negated      TRUE if it's a negated property (\P or \p{^)

Returns:       TRUE if auto-possessifying is OK
*/

static BOOL
check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
  BOOL negated)
{
const ucd_record *prop = GET_UCD(c);
const unsigned int chartype = prop->chartype;
const unsigned int gentype = PRIV(ucp_gentype)[chartype];
const pcre_uint32 *p;
BOOL has_prop;

switch(ptype)
  {
  case PT_LAMP:
  has_prop = (chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt);
  break;

  case PT_GC:
  has_prop = (pdata == gentype);
  break;

  case PT_PC:
  has_prop = (pdata == chartype);
  break;

  case PT_SC:
  has_prop = (pdata == prop->script);
  break;

  /* These are specials */

  case PT_ALNUM:
  has_prop = (gentype == ucp_L || gentype == ucp_N);
  break;

  /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
  means that Perl space and POSIX space are now identical. PCRE was changed
  at release 8.34. */

  case PT_SPACE:    /* Perl space */
  case PT_PXSPACE:  /* POSIX space */
  switch(c)
    {
    HSPACE_CASES:
    VSPACE_CASES:
    return negated;

    default:
    has_prop = (gentype == ucp_Z);
    break;
    }
  break;

  case PT_WORD:
  has_prop = (gentype == ucp_L || gentype == ucp_N || c == CHAR_UNDERSCORE);
  break;

  /* Walk the caseless set for this character. The walk ends at the first
  entry that is greater than or equal to c; the loop has no other exit. */

  case PT_CLIST:
  for (p = PRIV(ucd_caseless_sets) + prop->caseset; ; p++)
    {
    if (c < *p) return !negated;
    if (c == *p) return negated;
    }
  break;  /* Control never reaches here */

  default:
  return FALSE;
  }

return has_prop == negated;
}
#endif  /* SUPPORT_UCP */
2820
2821
2822
2823 /*************************************************
2824 * Fill the character property list *
2825 *************************************************/
2826
2827 /* Checks whether the code points to an opcode that can take part in auto-
2828 possessification, and if so, fills a list with its properties.
2829
2830 Arguments:
2831 code points to start of expression
2832 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2833 fcc points to case-flipping table
2834 list points to output list
2835 list[0] will be filled with the opcode
2836 list[1] will be non-zero if this opcode
2837 can match an empty character string
2838 list[2..7] depends on the opcode
2839
2840 Returns: points to the start of the next opcode if *code is accepted
2841 NULL if *code is not accepted
2842 */
2843
2844 static const pcre_uchar *
2845 get_chr_property_list(const pcre_uchar *code, BOOL utf,
2846 const pcre_uint8 *fcc, pcre_uint32 *list)
2847 {
2848 pcre_uchar c = *code;
2849 const pcre_uchar *end;
2850 const pcre_uint32 *clist_src;
2851 pcre_uint32 *clist_dest;
2852 pcre_uint32 chr;
2853 pcre_uchar base;
2854
2855 list[0] = c;
2856 list[1] = FALSE;
2857 code++;
2858
2859 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
2860 {
2861 base = get_repeat_base(c);
2862 c -= (base - OP_STAR);
2863
2864 if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
2865 code += IMM2_SIZE;
2866
2867 list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
2868
2869 switch(base)
2870 {
2871 case OP_STAR:
2872 list[0] = OP_CHAR;
2873 break;
2874
2875 case OP_STARI:
2876 list[0] = OP_CHARI;
2877 break;
2878
2879 case OP_NOTSTAR:
2880 list[0] = OP_NOT;
2881 break;
2882
2883 case OP_NOTSTARI:
2884 list[0] = OP_NOTI;
2885 break;
2886
2887 case OP_TYPESTAR:
2888 list[0] = *code;
2889 code++;
2890 break;
2891 }
2892 c = list[0];
2893 }
2894
2895 switch(c)
2896 {
2897 case OP_NOT_DIGIT:
2898 case OP_DIGIT:
2899 case OP_NOT_WHITESPACE:
2900 case OP_WHITESPACE:
2901 case OP_NOT_WORDCHAR:
2902 case OP_WORDCHAR:
2903 case OP_ANY:
2904 case OP_ALLANY:
2905 case OP_ANYNL:
2906 case OP_NOT_HSPACE:
2907 case OP_HSPACE:
2908 case OP_NOT_VSPACE:
2909 case OP_VSPACE:
2910 case OP_EXTUNI:
2911 case OP_EODN:
2912 case OP_EOD:
2913 case OP_DOLL:
2914 case OP_DOLLM:
2915 return code;
2916
2917 case OP_CHAR:
2918 case OP_NOT:
2919 GETCHARINCTEST(chr, code);
2920 list[2] = chr;
2921 list[3] = NOTACHAR;
2922 return code;
2923
2924 case OP_CHARI:
2925 case OP_NOTI:
2926 list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
2927 GETCHARINCTEST(chr, code);
2928 list[2] = chr;
2929
2930 #ifdef SUPPORT_UCP
2931 if (chr < 128 || (chr < 256 && !utf))
2932 list[3] = fcc[chr];
2933 else
2934 list[3] = UCD_OTHERCASE(chr);
2935 #elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
2936 list[3] = (chr < 256) ? fcc[chr] : chr;
2937 #else
2938 list[3] = fcc[chr];
2939 #endif
2940
2941 /* The othercase might be the same value. */
2942
2943 if (chr == list[3])
2944 list[3] = NOTACHAR;
2945 else
2946 list[4] = NOTACHAR;
2947 return code;
2948
2949 #ifdef SUPPORT_UCP
2950 case OP_PROP:
2951 case OP_NOTPROP:
2952 if (code[0] != PT_CLIST)
2953 {
2954 list[2] = code[0];
2955 list[3] = code[1];
2956 return code + 2;
2957 }
2958
2959 /* Convert only if we have enough space. */
2960
2961 clist_src = PRIV(ucd_caseless_sets) + code[1];
2962 clist_dest = list + 2;
2963 code += 2;
2964
2965 do {
2966 if (clist_dest >= list + 8)
2967 {
2968 /* Early return if there is not enough space. This should never
2969 happen, since all clists are shorter than 5 character now. */
2970 list[2] = code[0];
2971 list[3] = code[1];
2972 return code;
2973 }
2974 *clist_dest++ = *clist_src;
2975 }
2976 while(*clist_src++ != NOTACHAR);
2977
2978 /* All characters are stored. The terminating NOTACHAR
2979 is copied form the clist itself. */
2980
2981 list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
2982 return code;
2983 #endif
2984
2985 case OP_NCLASS:
2986 case OP_CLASS:
2987 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2988 case OP_XCLASS:
2989 if (c == OP_XCLASS)
2990 end = code + GET(code, 0) - 1;
2991 else
2992 #endif
2993 end = code + 32 / sizeof(pcre_uchar);
2994
2995 switch(*end)
2996 {
2997 case OP_CRSTAR:
2998 case OP_CRMINSTAR:
2999 case OP_CRQUERY:
3000 case OP_CRMINQUERY:
3001 case OP_CRPOSSTAR:
3002 case OP_CRPOSQUERY:
3003 list[1] = TRUE;
3004 end++;
3005 break;
3006
3007 case OP_CRPLUS:
3008 case OP_CRMINPLUS:
3009 case OP_CRPOSPLUS:
3010 end++;
3011 break;
3012
3013 case OP_CRRANGE:
3014 case OP_CRMINRANGE:
3015 case OP_CRPOSRANGE:
3016 list[1] = (GET2(end, 1) == 0);
3017 end += 1 + 2 * IMM2_SIZE;
3018 break;
3019 }
3020 list[2] = end - code;
3021 return end;
3022 }
3023 return NULL; /* Opcode not accepted */
3024 }
3025
3026
3027
3028 /*************************************************
3029 * Scan further character sets for match *
3030 *************************************************/
3031
3032 /* Checks whether the base and the current opcode have a common character, in
3033 which case the base cannot be possessified.
3034
3035 Arguments:
3036 code points to the byte code
3037 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3038 cd static compile data
3039 base_list the data list of the base opcode
3040
3041 Returns: TRUE if the auto-possessification is possible
3042 */
3043
3044 static BOOL
3045 compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
3046 const pcre_uint32 *base_list, const pcre_uchar *base_end)
3047 {
3048 pcre_uchar c;
3049 pcre_uint32 list[8];
3050 const pcre_uint32 *chr_ptr;
3051 const pcre_uint32 *ochr_ptr;
3052 const pcre_uint32 *list_ptr;
3053 const pcre_uchar *next_code;
3054 const pcre_uint8 *class_bitset;
3055 const pcre_uint32 *set1, *set2, *set_end;
3056 pcre_uint32 chr;
3057 BOOL accepted, invert_bits;
3058
3059 /* Note: the base_list[1] contains whether the current opcode has greedy
3060 (represented by a non-zero value) quantifier. This is a different from
3061 other character type lists, which stores here that the character iterator
3062 matches to an empty string (also represented by a non-zero value). */
3063
3064 for(;;)
3065 {
3066 /* All operations move the code pointer forward.
3067 Therefore infinite recursions are not possible. */
3068
3069 c = *code;
3070
3071 /* Skip over callouts */
3072
3073 if (c == OP_CALLOUT)
3074 {
3075 code += PRIV(OP_lengths)[c];
3076 continue;
3077 }
3078
3079 if (c == OP_ALT)
3080 {
3081 do code += GET(code, 1); while (*code == OP_ALT);
3082 c = *code;
3083 }
3084
3085 switch(c)
3086 {
3087 case OP_END:
3088 case OP_KETRPOS:
3089 /* TRUE only in greedy case. The non-greedy case could be replaced by
3090 an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
3091 uses more memory, which we cannot get at this stage.) */
3092
3093 return base_list[1] != 0;
3094
3095 case OP_KET:
3096 /* If the bracket is capturing, and referenced by an OP_RECURSE, or
3097 it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3098 cannot be converted to a possessive form. */
3099
3100 if (base_list[1] == 0) return FALSE;
3101
3102 switch(*(code - GET(code, 1)))
3103 {
3104 case OP_ASSERT:
3105 case OP_ASSERT_NOT:
3106 case OP_ASSERTBACK:
3107 case OP_ASSERTBACK_NOT:
3108 case OP_ONCE:
3109 case OP_ONCE_NC:
3110 /* Atomic sub-patterns and assertions can always auto-possessify their
3111 last iterator. */
3112 return TRUE;
3113 }
3114
3115 code += PRIV(OP_lengths)[c];
3116 continue;
3117
3118 case OP_ONCE:
3119 case OP_ONCE_NC:
3120 case OP_BRA:
3121 case OP_CBRA:
3122 next_code = code + GET(code, 1);
3123 code += PRIV(OP_lengths)[c];
3124
3125 while (*next_code == OP_ALT)
3126 {
3127 if (!compare_opcodes(code, utf, cd, base_list, base_end)) return FALSE;
3128 code = next_code + 1 + LINK_SIZE;
3129 next_code += GET(next_code, 1);
3130 }
3131 continue;
3132
3133 case OP_BRAZERO:
3134 case OP_BRAMINZERO:
3135
3136 next_code = code + 1;
3137 if (*next_code != OP_BRA && *next_code != OP_CBRA
3138 && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
3139
3140 do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3141
3142 /* The bracket content will be checked by the
3143 OP_BRA/OP_CBRA case above. */
3144 next_code += 1 + LINK_SIZE;
3145 if (!compare_opcodes(next_code, utf, cd, base_list, base_end))
3146 return FALSE;
3147
3148 code += PRIV(OP_lengths)[c];
3149 continue;
3150 }
3151
3152 /* Check for a supported opcode, and load its properties. */
3153
3154 code = get_chr_property_list(code, utf, cd->fcc, list);
3155 if (code == NULL) return FALSE; /* Unsupported */
3156
3157 /* If either opcode is a small character list, set pointers for comparing
3158 characters from that list with another list, or with a property. */
3159
3160 if (base_list[0] == OP_CHAR)
3161 {
3162 chr_ptr = base_list + 2;
3163 list_ptr = list;
3164 }
3165 else if (list[0] == OP_CHAR)
3166 {
3167 chr_ptr = list + 2;
3168 list_ptr = base_list;
3169 }
3170
3171 /* Character bitsets can also be compared to certain opcodes. */
3172
3173 else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
3174 #ifdef COMPILE_PCRE8
3175 /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
3176 || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
3177 #endif
3178 )
3179 {
3180 #ifdef COMPILE_PCRE8
3181 if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
3182 #else
3183 if (base_list[0] == OP_CLASS)
3184 #endif
3185 {
3186 set1 = (pcre_uint32 *)(base_end - base_list[2]);
3187 list_ptr = list;
3188 }
3189 else
3190 {
3191 set1 = (pcre_uint32 *)(code - list[2]);
3192 list_ptr = base_list;
3193 }
3194
3195 invert_bits = FALSE;
3196 switch(list_ptr[0])
3197 {
3198 case OP_CLASS:
3199 case OP_NCLASS:
3200 set2 = (pcre_uint32 *)
3201 ((list_ptr == list ? code : base_end) - list_ptr[2]);
3202 break;
3203
3204 /* OP_XCLASS cannot be supported here, because its bitset
3205 is not necessarily complete. E.g: [a-\0x{200}] is stored
3206 as a character range, and the appropriate bits are not set. */
3207
3208 case OP_NOT_DIGIT:
3209 invert_bits = TRUE;
3210 /* Fall through */
3211 case OP_DIGIT:
3212 set2 = (pcre_uint32 *)(cd->cbits + cbit_digit);
3213 break;
3214
3215 case OP_NOT_WHITESPACE:
3216 invert_bits = TRUE;
3217 /* Fall through */
3218 case OP_WHITESPACE:
3219 set2 = (pcre_uint32 *)(cd->cbits + cbit_space);
3220 break;
3221
3222 case OP_NOT_WORDCHAR:
3223 invert_bits = TRUE;
3224 /* Fall through */
3225 case OP_WORDCHAR:
3226 set2 = (pcre_uint32 *)(cd->cbits + cbit_word);
3227 break;
3228
3229 default:
3230 return FALSE;
3231 }
3232
3233 /* Compare 4 bytes to improve speed. */
3234 set_end = set1 + (32 / 4);
3235 if (invert_bits)
3236 {
3237 do
3238 {
3239 if ((*set1++ & ~(*set2++)) != 0) return FALSE;
3240 }
3241 while (set1 < set_end);
3242 }
3243 else
3244 {
3245 do
3246 {
3247 if ((*set1++ & *set2++) != 0) return FALSE;
3248 }
3249 while (set1 < set_end);
3250 }
3251
3252 if (list[1] == 0) return TRUE;
3253 /* Might be an empty repeat. */
3254 continue;
3255 }
3256
3257 /* Some property combinations also acceptable. Unicode property opcodes are
3258 processed specially; the rest can be handled with a lookup table. */
3259
3260 else
3261 {
3262 pcre_uint32 leftop, rightop;
3263
3264 leftop = base_list[0];
3265 rightop = list[0];
3266
3267 #ifdef SUPPORT_UCP
3268 accepted = FALSE; /* Always set in non-unicode case. */
3269 if (leftop == OP_PROP || leftop == OP_NOTPROP)
3270 {
3271 if (rightop == OP_EOD)
3272 accepted = TRUE;
3273 else if (rightop == OP_PROP || rightop == OP_NOTPROP)
3274 {
3275 int n;
3276 const pcre_uint8 *p;
3277 BOOL same = leftop == rightop;
3278 BOOL lisprop = leftop == OP_PROP;
3279 BOOL risprop = rightop == OP_PROP;
3280 BOOL bothprop = lisprop && risprop;
3281
3282 /* There's a table that specifies how each combination is to be
3283 processed:
3284 0 Always return FALSE (never auto-possessify)
3285 1 Character groups are distinct (possessify if both are OP_PROP)
3286 2 Check character categories in the same group (general or particular)
3287 3 Return TRUE if the two opcodes are not the same
3288 ... see comments below
3289 */
3290
3291 n = propposstab[base_list[2]][list[2]];
3292 switch(n)
3293 {
3294 case 0: break;
3295 case 1: accepted = bothprop; break;
3296 case 2: accepted = (base_list[3] == list[3]) != same; break;
3297 case 3: accepted = !same; break;
3298
3299 case 4: /* Left general category, right particular category */
3300 accepted = risprop && catposstab[base_list[3]][list[3]] == same;
3301 break;
3302
3303 case 5: /* Right general category, left particular category */
3304 accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
3305 break;
3306
3307 /* This code is logically tricky. Think hard before fiddling with it.
3308 The posspropstab table has four entries per row. Each row relates to
3309 one of PCRE's special properties such as ALNUM or SPACE or WORD.
3310 Only WORD actually needs all four entries, but using repeats for the
3311 others means they can all use the same code below.
3312
3313 The first two entries in each row are Unicode general categories, and
3314 apply always, because all the characters they include are part of the
3315 PCRE character set. The third and fourth entries are a general and a
3316 particular category, respectively, that include one or more relevant
3317 characters. One or the other is used, depending on whether the check
3318 is for a general or a particular category. However, in both cases the
3319 category contains more characters than the specials that are defined
3320 for the property being tested against. Therefore, it cannot be used
3321 in a NOTPROP case.
3322
3323 Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3324 Underscore is covered by ucp_P or ucp_Po. */
3325
3326 case 6: /* Left alphanum vs right general category */
3327 case 7: /* Left space vs right general category */
3328 case 8: /* Left word vs right general category */
3329 p = posspropstab[n-6];
3330 accepted = risprop && lisprop ==
3331 (list[3] != p[0] &&
3332 list[3] != p[1] &&
3333 (list[3] != p[2] || !lisprop));
3334 break;
3335
3336 case 9: /* Right alphanum vs left general category */
3337 case 10: /* Right space vs left general category */
3338 case 11: /* Right word vs left general category */
3339 p = posspropstab[n-9];
3340 accepted = lisprop && risprop ==
3341 (base_list[3] != p[0] &&
3342 base_list[3] != p[1] &&
3343 (base_list[3] != p[2] || !risprop));
3344 break;
3345
3346 case 12: /* Left alphanum vs right particular category */
3347 case 13: /* Left space vs right particular category */
3348 case 14: /* Left word vs right particular category */
3349 p = posspropstab[n-12];
3350 accepted = risprop && lisprop ==
3351 (catposstab[p[0]][list[3]] &&
3352 catposstab[p[1]][list[3]] &&
3353 (list[3] != p[3] || !lisprop));
3354 break;
3355
3356 case 15: /* Right alphanum vs left particular category */
3357 case 16: /* Right space vs left particular category */
3358 case 17: /* Right word vs left particular category */
3359 p = posspropstab[n-15];
3360 accepted = lisprop && risprop ==
3361 (catposstab[p[0]][base_list[3]] &&
3362 catposstab[p[1]][base_list[3]] &&
3363 (base_list[3] != p[3] || !risprop));
3364 break;
3365 }
3366 }
3367 }
3368
3369 else
3370 #endif /* SUPPORT_UCP */
3371
3372 accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3373 rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3374 autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3375
3376 if (!accepted)
3377 return FALSE;
3378
3379 if (list[1] == 0) return TRUE;
3380 /* Might be an empty repeat. */
3381 continue;
3382 }
3383
3384 /* Control reaches here only if one of the items is a small character list.
3385 All characters are checked against the other side. */
3386
3387 do
3388 {
3389 chr = *chr_ptr;
3390
3391 switch(list_ptr[0])
3392 {
3393 case OP_CHAR:
3394 ochr_ptr = list_ptr + 2;
3395 do
3396 {
3397 if (chr == *ochr_ptr) return FALSE;
3398 ochr_ptr++;
3399 }
3400 while(*ochr_ptr != NOTACHAR);
3401 break;
3402
3403 case OP_NOT:
3404 ochr_ptr = list_ptr + 2;
3405 do
3406 {
3407 if (chr == *ochr_ptr)
3408 break;
3409 ochr_ptr++;
3410 }
3411 while(*ochr_ptr != NOTACHAR);
3412 if (*ochr_ptr == NOTACHAR) return FALSE; /* Not found */
3413 break;
3414
3415 /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
3416 set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3417
3418 case OP_DIGIT:
3419 if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
3420 break;
3421
3422 case OP_NOT_DIGIT:
3423 if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
3424 break;
3425
3426 case OP_WHITESPACE:
3427 if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
3428 break;
3429
3430 case OP_NOT_WHITESPACE:
3431 if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
3432 break;
3433
3434 case OP_WORDCHAR:
3435 if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
3436 break;
3437
3438 case OP_NOT_WORDCHAR:
3439 if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
3440 break;
3441
3442 case OP_HSPACE:
3443 switch(chr)
3444 {
3445 HSPACE_CASES: return FALSE;
3446 default: break;
3447 }
3448 break;
3449
3450 case OP_NOT_HSPACE:
3451 switch(chr)
3452 {
3453 HSPACE_CASES: break;
3454 default: return FALSE;
3455 }
3456 break;
3457
3458 case OP_ANYNL:
3459 case OP_VSPACE:
3460 switch(chr)
3461 {
3462 VSPACE_CASES: return FALSE;
3463 default: break;
3464 }
3465 break;
3466
3467 case OP_NOT_VSPACE:
3468 switch(chr)
3469 {
3470 VSPACE_CASES: break;
3471 default: return FALSE;
3472 }
3473 break;
3474
3475 case OP_DOLL:
3476 case OP_EODN:
3477 switch (chr)
3478 {
3479 case CHAR_CR:
3480 case CHAR_LF:
3481 case CHAR_VT:
3482 case CHAR_FF:
3483 case CHAR_NEL:
3484 #ifndef EBCDIC
3485 case 0x2028:
3486 case 0x2029:
3487 #endif /* Not EBCDIC */
3488 return FALSE;
3489 }
3490 break;
3491
3492 case OP_EOD: /* Can always possessify before \z */
3493 break;
3494
3495 #ifdef SUPPORT_UCP
3496 case OP_PROP:
3497 case OP_NOTPROP:
3498 if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3499 list_ptr[0] == OP_NOTPROP))
3500 return FALSE;
3501 break;
3502 #endif
3503
3504 case OP_NCLASS:
3505 if (chr > 255) return FALSE;
3506 /* Fall through */
3507
3508 case OP_CLASS:
3509 if (chr > 255) break;
3510 class_bitset = (pcre_uint8 *)
3511 ((list_ptr == list ? code : base_end) - list_ptr[2]);
3512 if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
3513 break;
3514
3515 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3516 case OP_XCLASS:
3517 if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
3518 list_ptr[2] + LINK_SIZE, utf)) return FALSE;
3519 break;
3520 #endif
3521
3522 default:
3523 return FALSE;
3524 }
3525
3526 chr_ptr++;
3527 }
3528 while(*chr_ptr != NOTACHAR);
3529
3530 /* At least one character must be matched from this opcode. */
3531
3532 if (list[1] == 0) return TRUE;
3533 }
3534
3535 return FALSE;
3536 }
3537
3538
3539
3540 /*************************************************
3541 * Scan compiled regex for auto-possession *
3542 *************************************************/
3543
3544 /* Replaces single character iterations with their possessive alternatives
3545 if appropriate. This function modifies the compiled opcode!
3546
3547 Arguments:
3548 code points to start of the byte code
3549 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3550 cd static compile data
3551
3552 Returns: nothing
3553 */
3554
3555 static void
3556 auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
3557 {
3558 register pcre_uchar c;
3559 const pcre_uchar *end;
3560 pcre_uchar *repeat_opcode;
3561 pcre_uint32 list[8];
3562
3563 for (;;)
3564 {
3565 c = *code;
3566
3567 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
3568 {
3569 c -= get_repeat_base(c) - OP_STAR;
3570 end = (c <= OP_MINUPTO) ?
3571 get_chr_property_list(code, utf, cd->fcc, list) : NULL;
3572 list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3573
3574 if (end != NULL && compare_opcodes(end, utf, cd, list, end))
3575 {
3576 switch(c)
3577 {
3578 case OP_STAR:
3579 *code += OP_POSSTAR - OP_STAR;
3580 break;
3581
3582 case OP_MINSTAR:
3583 *code += OP_POSSTAR - OP_MINSTAR;
3584 break;
3585
3586 case OP_PLUS:
3587 *code += OP_POSPLUS - OP_PLUS;
3588 break;
3589
3590 case OP_MINPLUS:
3591 *code += OP_POSPLUS - OP_MINPLUS;
3592 break;
3593
3594 case OP_QUERY:
3595 *code += OP_POSQUERY - OP_QUERY;
3596 break;
3597
3598 case OP_MINQUERY:
3599 *code += OP_POSQUERY - OP_MINQUERY;
3600 break;
3601
3602 case OP_UPTO:
3603 *code += OP_POSUPTO - OP_UPTO;
3604 break;
3605
3606 case OP_MINUPTO:
3607 *code += OP_MINUPTO - OP_UPTO;
3608 break;
3609 }
3610 }
3611 c = *code;
3612 }
3613 else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
3614 {
3615 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3616 if (c == OP_XCLASS)
3617 repeat_opcode = code + GET(code, 1);
3618 else
3619 #endif
3620 repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar));
3621
3622 c = *repeat_opcode;
3623 if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
3624 {
3625 /* end must not be NULL. */
3626 end = get_chr_property_list(code, utf, cd->fcc, list);
3627
3628 list[1] = (c & 1) == 0;
3629
3630 if (compare_opcodes(end, utf, cd, list, end))
3631 {
3632 switch (c)
3633 {
3634 case OP_CRSTAR:
3635 case OP_CRMINSTAR:
3636 *repeat_opcode = OP_CRPOSSTAR;
3637 break;
3638
3639 case OP_CRPLUS:
3640 case OP_CRMINPLUS:
3641 *repeat_opcode = OP_CRPOSPLUS;
3642 break;
3643
3644 case OP_CRQUERY:
3645 case OP_CRMINQUERY:
3646 *repeat_opcode = OP_CRPOSQUERY;
3647 break;
3648
3649 case OP_CRRANGE:
3650 case OP_CRMINRANGE:
3651 *repeat_opcode = OP_CRPOSRANGE;
3652 break;
3653 }
3654 }
3655 }
3656 c = *code;
3657 }
3658
3659 switch(c)
3660 {
3661 case OP_END:
3662 return;
3663
3664 case OP_TYPESTAR:
3665 case OP_TYPEMINSTAR:
3666 case OP_TYPEPLUS:
3667 case OP_TYPEMINPLUS:
3668 case OP_TYPEQUERY:
3669 case OP_TYPEMINQUERY:
3670 case OP_TYPEPOSSTAR:
3671 case OP_TYPEPOSPLUS:
3672 case OP_TYPEPOSQUERY:
3673 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
3674 break;
3675
3676 case OP_TYPEUPTO:
3677 case OP_TYPEMINUPTO:
3678 case OP_TYPEEXACT:
3679 case OP_TYPEPOSUPTO:
3680 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
3681 code += 2;
3682 break;
3683
3684 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3685 case OP_XCLASS:
3686 code += GET(code, 1);
3687 break;
3688 #endif
3689
3690 case OP_MARK:
3691 case OP_PRUNE_ARG:
3692 case OP_SKIP_ARG:
3693 case OP_THEN_ARG:
3694 code += code[1];
3695 break;
3696 }
3697
3698 /* Add in the fixed length from the table */
3699
3700 code += PRIV(OP_lengths)[c];
3701
3702 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
3703 a multi-byte character. The length in the table is a minimum, so we have to
3704 arrange to skip the extra bytes. */
3705
3706 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
3707 if (utf) switch(c)
3708 {
3709 case OP_CHAR:
3710 case OP_CHARI:
3711 case OP_NOT:
3712 case OP_NOTI:
3713 case OP_STAR:
3714 case OP_MINSTAR:
3715 case OP_PLUS:
3716 case OP_MINPLUS:
3717 case OP_QUERY:
3718 case OP_MINQUERY:
3719 case OP_UPTO:
3720 case OP_MINUPTO:
3721 case OP_EXACT:
3722 case OP_POSSTAR:
3723 case OP_POSPLUS:
3724 case OP_POSQUERY:
3725 case OP_POSUPTO:
3726 case OP_STARI:
3727 case OP_MINSTARI:
3728 case OP_PLUSI:
3729 case OP_MINPLUSI:
3730 case OP_QUERYI:
3731 case OP_MINQUERYI:
3732 case OP_UPTOI:
3733 case OP_MINUPTOI:
3734 case OP_EXACTI:
3735 case OP_POSSTARI:
3736 case OP_POSPLUSI:
3737 case OP_POSQUERYI:
3738 case OP_POSUPTOI:
3739 case OP_NOTSTAR:
3740 case OP_NOTMINSTAR:
3741 case OP_NOTPLUS:
3742 case OP_NOTMINPLUS:
3743 case OP_NOTQUERY:
3744 case OP_NOTMINQUERY:
3745 case OP_NOTUPTO:
3746 case OP_NOTMINUPTO:
3747 case OP_NOTEXACT:
3748 case OP_NOTPOSSTAR:
3749 case OP_NOTPOSPLUS:
3750 case OP_NOTPOSQUERY:
3751 case OP_NOTPOSUPTO:
3752 case OP_NOTSTARI:
3753 case OP_NOTMINSTARI:
3754 case OP_NOTPLUSI:
3755 case OP_NOTMINPLUSI:
3756 case OP_NOTQUERYI:
3757 case OP_NOTMINQUERYI:
3758 case OP_NOTUPTOI:
3759 case OP_NOTMINUPTOI:
3760 case OP_NOTEXACTI:
3761 case OP_NOTPOSSTARI:
3762 case OP_NOTPOSPLUSI:
3763 case OP_NOTPOSQUERYI:
3764 case OP_NOTPOSUPTOI:
3765 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
3766 break;
3767 }
3768 #else
3769 (void)(utf); /* Keep compiler happy by referencing function argument */
3770 #endif
3771 }
3772 }
3773
3774
3775
3776 /*************************************************
3777 * Check for POSIX class syntax *
3778 *************************************************/
3779
3780 /* This function is called when the sequence "[:" or "[." or "[=" is
3781 encountered in a character class. It checks whether this is followed by a
3782 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
3783 reach an unescaped ']' without the special preceding character, return FALSE.
3784
3785 Originally, this function only recognized a sequence of letters between the
3786 terminators, but it seems that Perl recognizes any sequence of characters,
3787 though of course unknown POSIX names are subsequently rejected. Perl gives an
3788 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
3789 didn't consider this to be a POSIX class. Likewise for [:1234:].
3790
3791 The problem in trying to be exactly like Perl is in the handling of escapes. We
3792 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
3793 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
3794 below handles the special case of \], but does not try to do any other escape
3795 processing. This makes it different from Perl for cases such as [:l\ower:]
3796 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
3797 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
3798 I think.
3799
3800 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
3801 It seems that the appearance of a nested POSIX class supersedes an apparent
3802 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
3803 a digit.
3804
3805 In Perl, unescaped square brackets may also appear as part of class names. For
3806 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
3807 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
3808 seem right at all. PCRE does not allow closing square brackets in POSIX class
3809 names.
3810
3811 Arguments:
3812 ptr pointer to the initial [
3813 endptr where to return the end pointer
3814
3815 Returns: TRUE or FALSE
3816 */
3817
3818 static BOOL
3819 check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
3820 {
3821 pcre_uchar terminator; /* Don't combine these lines; the Solaris cc */
3822 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
3823 for (++ptr; *ptr != CHAR_NULL; ptr++)
3824 {
3825 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3826 ptr++;
3827 else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
3828 else
3829 {
3830 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3831 {
3832 *endptr = ptr;
3833 return TRUE;
3834 }
3835 if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
3836 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3837 ptr[1] == CHAR_EQUALS_SIGN) &&
3838 check_posix_syntax(ptr, endptr))
3839 return FALSE;
3840 }
3841 }
3842 return FALSE;
3843 }
3844
3845
3846
3847
3848 /*************************************************
3849 * Check POSIX class name *
3850 *************************************************/
3851
3852 /* This function is called to check the name given in a POSIX-style class entry
3853 such as [:alnum:].
3854
3855 Arguments:
3856 ptr points to the first letter
3857 len the length of the name
3858
3859 Returns: a value representing the name, or -1 if unknown
3860 */
3861
3862 static int
3863 check_posix_name(const pcre_uchar *ptr, int len)
3864 {
3865 const char *pn = posix_names;
3866 register int yield = 0;
3867 while (posix_name_lengths[yield] != 0)
3868 {
3869 if (len == posix_name_lengths[yield] &&
3870 STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
3871 pn += posix_name_lengths[yield] + 1;
3872 yield++;
3873 }
3874 return -1;
3875 }
3876
3877
3878 /*************************************************
3879 * Adjust OP_RECURSE items in repeated group *
3880 *************************************************/
3881
3882 /* OP_RECURSE items contain an offset from the start of the regex to the group
3883 that is referenced. This means that groups can be replicated for fixed
3884 repetition simply by copying (because the recursion is allowed to refer to
3885 earlier groups that are outside the current group). However, when a group is
3886 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
3887 inserted before it, after it has been compiled. This means that any OP_RECURSE
3888 items within it that refer to the group itself or any contained groups have to
3889 have their offsets adjusted. That one of the jobs of this function. Before it
3890 is called, the partially compiled regex must be temporarily terminated with
3891 OP_END.
3892
3893 This function has been extended with the possibility of forward references for
3894 recursions and subroutine calls. It must also check the list of such references
3895 for the group we are dealing with. If it finds that one of the recursions in
3896 the current group is on this list, it adjusts the offset in the list, not the
3897 value in the reference (which is a group number).
3898
3899 Arguments:
3900 group points to the start of the group
3901 adjust the amount by which the group is to be moved
3902 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3903 cd contains pointers to tables etc.
3904 save_hwm the hwm forward reference pointer at the start of the group
3905
3906 Returns: nothing
3907 */
3908
3909 static void
3910 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
3911 pcre_uchar *save_hwm)
3912 {
3913 pcre_uchar *ptr = group;
3914
3915 while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
3916 {
3917 int offset;
3918 pcre_uchar *hc;
3919
3920 /* See if this recursion is on the forward reference list. If so, adjust the
3921 reference. */
3922
3923 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
3924 {
3925 offset = (int)GET(hc, 0);
3926 if (cd->start_code + offset == ptr + 1)
3927 {
3928 PUT(hc, 0, offset + adjust);
3929 break;
3930 }
3931 }
3932
3933 /* Otherwise, adjust the recursion offset if it's after the start of this
3934 group. */
3935
3936 if (hc >= cd->hwm)
3937 {
3938 offset = (int)GET(ptr, 1);
3939 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
3940 }
3941
3942 ptr += 1 + LINK_SIZE;
3943 }
3944 }
3945
3946
3947
3948 /*************************************************
3949 * Insert an automatic callout point *
3950 *************************************************/
3951
3952 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
3953 callout points before each pattern item.
3954
3955 Arguments:
3956 code current code pointer
3957 ptr current pattern pointer
3958 cd pointers to tables etc
3959
3960 Returns: new code pointer
3961 */
3962
3963 static pcre_uchar *
3964 auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
3965 {
3966 *code++ = OP_CALLOUT;
3967 *code++ = 255;
3968 PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
3969 PUT(code, LINK_SIZE, 0); /* Default length */
3970 return code + 2 * LINK_SIZE;
3971 }
3972
3973
3974
3975 /*************************************************
3976 * Complete a callout item *
3977 *************************************************/
3978
3979 /* A callout item contains the length of the next item in the pattern, which
3980 we can't fill in till after we have reached the relevant point. This is used
3981 for both automatic and manual callouts.
3982
3983 Arguments:
3984 previous_callout points to previous callout item
3985 ptr current pattern pointer
3986 cd pointers to tables etc
3987
3988 Returns: nothing
3989 */
3990
3991 static void
3992 complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
3993 {
3994 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
3995 PUT(previous_callout, 2 + LINK_SIZE, length);
3996 }
3997
3998
3999
4000 #ifdef SUPPORT_UCP
4001 /*************************************************
4002 * Get othercase range *
4003 *************************************************/
4004
4005 /* This function is passed the start and end of a class range, in UTF-8 mode
4006 with UCP support. It searches up the characters, looking for ranges of
4007 characters in the "other" case. Each call returns the next one, updating the
4008 start address. A character with multiple other cases is returned on its own
4009 with a special return value.
4010
4011 Arguments:
4012 cptr points to starting character value; updated
4013 d end value
4014 ocptr where to put start of othercase range
4015 odptr where to put end of othercase range
4016
4017 Yield: -1 when no more
4018 0 when a range is returned
4019 >0 the CASESET offset for char with multiple other cases
4020 in this case, ocptr contains the original
4021 */
4022
4023 static int
4024 get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
4025 pcre_uint32 *odptr)
4026 {
4027 pcre_uint32 c, othercase, next;
4028 unsigned int co;
4029
4030 /* Find the first character that has an other case. If it has multiple other
4031 cases, return its case offset value. */
4032
4033 for (c = *cptr; c <= d; c++)
4034 {
4035 if ((co = UCD_CASESET(c)) != 0)
4036 {
4037 *ocptr = c++; /* Character that has the set */
4038 *cptr = c; /* Rest of input range */
4039 return (int)co;
4040 }
4041 if ((othercase = UCD_OTHERCASE(c)) != c) break;
4042 }
4043
4044 if (c > d) return -1; /* Reached end of range */
4045
4046 *ocptr = othercase;
4047 next = othercase + 1;
4048
4049 for (++c; c <= d; c++)
4050 {
4051 if (UCD_OTHERCASE(c) != next) break;
4052 next++;
4053 }
4054
4055 *odptr = next - 1; /* End of othercase range */
4056 *cptr = c; /* Rest of input range */
4057 return 0;
4058 }
4059 #endif /* SUPPORT_UCP */
4060
4061
4062
4063 /*************************************************
4064 * Add a character or range to a class *
4065 *************************************************/
4066
4067 /* This function packages up the logic of adding a character or range of
4068 characters to a class. The character values in the arguments will be within the
4069 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4070 mutually recursive with the function immediately below.
4071
4072 Arguments:
4073 classbits the bit map for characters < 256
4074 uchardptr points to the pointer for extra data
4075 options the options word
4076 cd contains pointers to tables etc.
4077 start start of range character
4078 end end of range character
4079
4080 Returns: the number of < 256 characters added
4081 the pointer to extra data is updated
4082 */
4083
4084 static int
4085 add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4086 compile_data *cd, pcre_uint32 start, pcre_uint32 end)
4087 {
4088 pcre_uint32 c;
4089 int n8 = 0;
4090
4091 /* If caseless matching is required, scan the range and process alternate
4092 cases. In Unicode, there are 8-bit characters that have alternate cases that
4093 are greater than 255 and vice-versa. Sometimes we can just extend the original
4094 range. */
4095
4096 if ((options & PCRE_CASELESS) != 0)
4097 {
4098 #ifdef SUPPORT_UCP
4099 if ((options & PCRE_UTF8) != 0)
4100 {
4101 int rc;
4102 pcre_uint32 oc, od;
4103
4104 options &= ~PCRE_CASELESS; /* Remove for recursive calls */
4105 c = start;
4106
4107 while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
4108 {
4109 /* Handle a single character that has more than one other case. */
4110
4111 if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
4112 PRIV(ucd_caseless_sets) + rc, oc);
4113
4114 /* Do nothing if the other case range is within the original range. */
4115
4116 else if (oc >= start && od <= end) continue;
4117
4118 /* Extend the original range if there is overlap, noting that if oc < c, we
4119 can't have od > end because a subrange is always shorter than the basic
4120 range. Otherwise, use a recursive call to add the additional range. */
4121
4122 else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
4123 else if (od > end && oc <= end + 1) end = od; /* Extend upwards */
4124 else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
4125 }
4126 }
4127 else
4128 #endif /* SUPPORT_UCP */
4129
4130 /* Not UTF-mode, or no UCP */
4131
4132 for (c = start; c <= end && c < 256; c++)
4133 {
4134 SETBIT(classbits, cd->fcc[c]);
4135 n8++;
4136 }
4137 }
4138
4139 /* Now handle the original range. Adjust the final value according to the bit
4140 length - this means that the same lists of (e.g.) horizontal spaces can be used
4141 in all cases. */
4142
4143 #if defined COMPILE_PCRE8
4144 #ifdef SUPPORT_UTF
4145 if ((options & PCRE_UTF8) == 0)
4146 #endif
4147 if (end > 0xff) end = 0xff;
4148
4149 #elif defined COMPILE_PCRE16
4150 #ifdef SUPPORT_UTF
4151 if ((options & PCRE_UTF16) == 0)
4152 #endif
4153 if (end > 0xffff) end = 0xffff;
4154
4155 #endif /* COMPILE_PCRE[8|16] */
4156
4157 /* If all characters are less than 256, use the bit map. Otherwise use extra
4158 data. */
4159
4160 if (end < 0x100)
4161 {
4162 for (c = start; c <= end; c++)
4163 {
4164 n8++;
4165 SETBIT(classbits, c);
4166 }
4167 }
4168
4169 else
4170 {
4171 pcre_uchar *uchardata = *uchardptr;
4172
4173 #ifdef SUPPORT_UTF
4174 if ((options & PCRE_UTF8) != 0) /* All UTFs use the same flag bit */
4175 {
4176 if (start < end)
4177 {
4178 *uchardata++ = XCL_RANGE;
4179 uchardata += PRIV(ord2utf)(start, uchardata);
4180 uchardata += PRIV(ord2utf)(end, uchardata);
4181 }
4182 else if (start == end)
4183 {
4184 *uchardata++ = XCL_SINGLE;
4185 uchardata += PRIV(ord2utf)(start, uchardata);
4186 }
4187 }
4188 else
4189 #endif /* SUPPORT_UTF */
4190
4191 /* Without UTF support, character values are constrained by the bit length,
4192 and can only be > 256 for 16-bit and 32-bit libraries. */
4193
4194 #ifdef COMPILE_PCRE8
4195 {}
4196 #else
4197 if (start < end)
4198 {
4199 *uchardata++ = XCL_RANGE;
4200 *uchardata++ = start;
4201 *uchardata++ = end;
4202 }
4203 else if (start == end)
4204 {
4205 *uchardata++ = XCL_SINGLE;
4206 *uchardata++ = start;
4207 }
4208 #endif
4209
4210 *uchardptr = uchardata; /* Updata extra data pointer */
4211 }
4212
4213 return n8; /* Number of 8-bit characters */
4214 }
4215
4216
4217
4218
4219 /*************************************************
4220 * Add a list of characters to a class *
4221 *************************************************/
4222
4223 /* This function is used for adding a list of case-equivalent characters to a
4224 class, and also for adding a list of horizontal or vertical whitespace. If the
4225 list is in order (which it should be), ranges of characters are detected and
4226 handled appropriately. This function is mutually recursive with the function
4227 above.
4228
4229 Arguments:
4230 classbits the bit map for characters < 256
4231 uchardptr points to the pointer for extra data
4232 options the options word
4233 cd contains pointers to tables etc.
4234 p points to row of 32-bit values, terminated by NOTACHAR
4235 except character to omit; this is used when adding lists of
4236 case-equivalent characters to avoid including the one we
4237 already know about
4238
4239 Returns: the number of < 256 characters added
4240 the pointer to extra data is updated
4241 */
4242
4243 static int
4244 add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4245 compile_data *cd, const pcre_uint32 *p, unsigned int except)
4246 {
4247 int n8 = 0;
4248 while (p[0] < NOTACHAR)
4249 {
4250 int n = 0;
4251 if (p[0] != except)
4252 {
4253 while(p[n+1] == p[0] + n + 1) n++;
4254 n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
4255 }
4256 p += n + 1;
4257 }
4258 return n8;
4259 }
4260
4261
4262
4263 /*************************************************
4264 * Add characters not in a list to a class *
4265 *************************************************/
4266
4267 /* This function is used for adding the complement of a list of horizontal or
4268 vertical whitespace to a class. The list must be in order.
4269
4270 Arguments:
4271 classbits the bit map for characters < 256
4272 uchardptr points to the pointer for extra data
4273 options the options word
4274 cd contains pointers to tables etc.
4275 p points to row of 32-bit values, terminated by NOTACHAR
4276
4277 Returns: the number of < 256 characters added
4278 the pointer to extra data is updated
4279 */
4280
4281 static int
4282 add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
4283 int options, compile_data *cd, const pcre_uint32 *p)
4284 {
4285 BOOL utf = (options & PCRE_UTF8) != 0;
4286 int n8 = 0;
4287 if (p[0] > 0)
4288 n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
4289 while (p[0] < NOTACHAR)
4290 {
4291 while (p[1] == p[0] + 1) p++;
4292 n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
4293 (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
4294 p++;
4295 }
4296 return n8;
4297 }
4298
4299
4300
4301 /*************************************************
4302 * Compile one branch *
4303 *************************************************/
4304
4305 /* Scan the pattern, compiling it into a vector. If the options are
4306 changed during the branch, the pointer is used to change the external options
4307 bits. This function is used during the pre-compile phase when we are trying
4308 to find out the amount of memory needed, as well as during the real compile
4309 phase. The value of lengthptr distinguishes the two phases.
4310
4311 Arguments:
4312 optionsptr pointer to the option bits
4313 codeptr points to the pointer to the current code point
4314 ptrptr points to the current pattern pointer
4315 errorcodeptr points to error code variable
4316 firstcharptr place to put the first required character
4317 firstcharflagsptr place to put the first character flags, or a negative number
4318 reqcharptr place to put the last required character
4319 reqcharflagsptr place to put the last required character flags, or a negative number
4320 bcptr points to current branch chain
4321 cond_depth conditional nesting depth
4322 cd contains pointers to tables etc.
4323 lengthptr NULL during the real compile phase
4324 points to length accumulator during pre-compile phase
4325
4326 Returns: TRUE on success
4327 FALSE, with *errorcodeptr set non-zero on error
4328 */
4329
4330 static BOOL
4331 compile_branch(int *optionsptr, pcre_uchar **codeptr,
4332 const pcre_uchar **ptrptr, int *errorcodeptr,
4333 pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
4334 pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
4335 branch_chain *bcptr, int cond_depth,
4336 compile_data *cd, int *lengthptr)
4337 {
4338 int repeat_type, op_type;
4339 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
4340 int bravalue = 0;
4341 int greedy_default, greedy_non_default;
4342 pcre_uint32 firstchar, reqchar;
4343 pcre_int32 firstcharflags, reqcharflags;
4344 pcre_uint32 zeroreqchar, zerofirstchar;
4345 pcre_int32 zeroreqcharflags, zerofirstcharflags;
4346 pcre_int32 req_caseopt, reqvary, tempreqvary;
4347 int options = *optionsptr; /* May change dynamically */
4348 int after_manual_callout = 0;
4349 int length_prevgroup = 0;
4350 register pcre_uint32 c;
4351 int escape;
4352 register pcre_uchar *code = *codeptr;
4353 pcre_uchar *last_code = code;
4354 pcre_uchar *orig_code = code;
4355 pcre_uchar *tempcode;
4356 BOOL inescq = FALSE;
4357 BOOL groupsetfirstchar = FALSE;
4358 const pcre_uchar *ptr = *ptrptr;
4359 const pcre_uchar *tempptr;
4360 const pcre_uchar *nestptr = NULL;
4361 pcre_uchar *previous = NULL;
4362 pcre_uchar *previous_callout = NULL;
4363 pcre_uchar *save_hwm = NULL;
4364 pcre_uint8 classbits[32];
4365
4366 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
4367 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
4368 dynamically as we process the pattern. */
4369
4370 #ifdef SUPPORT_UTF
4371 /* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
4372 BOOL utf = (options & PCRE_UTF8) != 0;
4373 #ifndef COMPILE_PCRE32
4374 pcre_uchar utf_chars[6];
4375 #endif
4376 #else
4377 BOOL utf = FALSE;
4378 #endif
4379
4380 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
4381 class_uchardata always so that it can be passed to add_to_class() always,
4382 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
4383 alternative calls for the different cases. */
4384
4385 pcre_uchar *class_uchardata;
4386 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4387 BOOL xclass;
4388 pcre_uchar *class_uchardata_base;
4389 #endif
4390
4391 #ifdef PCRE_DEBUG
4392 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
4393 #endif
4394
4395 /* Set up the default and non-default settings for greediness */
4396
4397 greedy_default = ((options & PCRE_UNGREEDY) != 0);
4398 greedy_non_default = greedy_default ^ 1;
4399
4400 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
4401 matching encountered yet". It gets changed to REQ_NONE if we hit something that
4402 matches a non-fixed char first char; reqchar just remains unset if we never
4403 find one.
4404
4405 When we hit a repeat whose minimum is zero, we may have to adjust these values
4406 to take the zero repeat into account. This is implemented by setting them to
4407 zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
4408 item types that can be repeated set these backoff variables appropriately. */
4409
4410 firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
4411 firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
4412
4413 /* The variable req_caseopt contains either the REQ_CASELESS value
4414 or zero, according to the current setting of the caseless flag. The
4415 REQ_CASELESS leaves the lower 28 bits empty. It is added into the
4416 firstchar or reqchar variables to record the case status of the
4417 value. This is used only for ASCII characters. */
4418
4419 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
4420
4421 /* Switch on next character until the end of the branch */
4422
4423 for (;; ptr++)
4424 {
4425 BOOL negate_class;
4426 BOOL should_flip_negation;
4427 BOOL possessive_quantifier;
4428 BOOL is_quantifier;
4429 BOOL is_recurse;
4430 BOOL reset_bracount;
4431 int class_has_8bitchar;
4432 int class_one_char;
4433 int newoptions;
4434 int recno;
4435 int refsign;
4436 int skipbytes;
4437 pcre_uint32 subreqchar, subfirstchar;
4438 pcre_int32 subreqcharflags, subfirstcharflags;
4439 int terminator;
4440 unsigned int mclength;
4441 unsigned int tempbracount;
4442 pcre_uint32 ec;
4443 pcre_uchar mcbuffer[8];
4444
4445 /* Get next character in the pattern */
4446
4447 c = *ptr;
4448
4449 /* If we are at the end of a nested substitution, revert to the outer level
4450 string. Nesting only happens one level deep. */
4451
4452 if (c == CHAR_NULL && nestptr != NULL)
4453 {
4454 ptr = nestptr;
4455 nestptr = NULL;
4456 c = *ptr;
4457 }
4458
4459 /* If we are in the pre-compile phase, accumulate the length used for the
4460 previous cycle of this loop. */
4461
4462 if (lengthptr != NULL)
4463 {
4464 #ifdef PCRE_DEBUG
4465 if (code > cd->hwm) cd->hwm = code; /* High water info */
4466 #endif
4467 if (code > cd->start_workspace + cd->workspace_size -
4468 WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
4469 {
4470 *errorcodeptr = ERR52;
4471 goto FAILED;
4472 }
4473
4474 /* There is at least one situation where code goes backwards: this is the
4475 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
4476 the class is simply eliminated. However, it is created first, so we have to
4477 allow memory for it. Therefore, don't ever reduce the length at this point.
4478 */
4479
4480 if (code < last_code) code = last_code;
4481
4482 /* Paranoid check for integer overflow */
4483
4484 if (OFLOW_MAX - *lengthptr < code - last_code)
4485 {
4486 *errorcodeptr = ERR20;
4487 goto FAILED;
4488 }
4489
4490 *lengthptr += (int)(code - last_code);
4491 DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
4492 (int)(code - last_code), c, c));
4493
4494 /* If "previous" is set and it is not at the start of the work space, move
4495 it back to there, in order to avoid filling up the work space. Otherwise,
4496 if "previous" is NULL, reset the current code pointer to the start. */
4497
4498 if (previous != NULL)
4499 {
4500 if (previous > orig_code)
4501 {
4502 memmove(orig_code, previous, IN_UCHARS(code - previous));
4503 code -= previous - orig_code;
4504 previous = orig_code;
4505 }
4506 }
4507 else code = orig_code;
4508
4509 /* Remember where this code item starts so we can pick up the length
4510 next time round. */
4511
4512 last_code = code;
4513 }
4514
4515 /* In the real compile phase, just check the workspace used by the forward
4516 reference list. */
4517
4518 else if (cd->hwm > cd->start_workspace + cd->workspace_size -
4519 WORK_SIZE_SAFETY_MARGIN)
4520 {
4521 *errorcodeptr = ERR52;
4522 goto FAILED;
4523 }
4524
4525 /* If in \Q...\E, check for the end; if not, we have a literal */
4526
4527 if (inescq && c != CHAR_NULL)
4528 {
4529 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4530 {
4531 inescq = FALSE;
4532 ptr++;
4533 continue;
4534 }
4535 else
4536 {
4537 if (previous_callout != NULL)
4538 {
4539 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
4540 complete_callout(previous_callout, ptr, cd);
4541 previous_callout = NULL;
4542 }
4543 if ((options & PCRE_AUTO_CALLOUT) != 0)
4544 {
4545 previous_callout = code;
4546 code = auto_callout(code, ptr, cd);
4547 }
4548 goto NORMAL_CHAR;
4549 }
4550 }
4551
4552 is_quantifier =
4553 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4554 (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4555
4556 /* Fill in length of a previous callout, except when the next thing is a
4557 quantifier or when processing a property substitution string in UCP mode. */
4558
4559 if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4560 after_manual_callout-- <= 0)
4561 {
4562 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
4563 complete_callout(previous_callout, ptr, cd);
4564 previous_callout = NULL;
4565 }
4566
4567 /* In extended mode, skip white space and comments. */
4568
4569 if ((options & PCRE_EXTENDED) != 0)
4570 {
4571 if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;
4572 if (c == CHAR_NUMBER_SIGN)
4573 {
4574 ptr++;
4575 while (*ptr != CHAR_NULL)
4576 {
4577 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
4578 ptr++;
4579 #ifdef SUPPORT_UTF
4580 if (utf) FORWARDCHAR(ptr);
4581 #endif
4582 }
4583 if (*ptr != CHAR_NULL) continue;
4584
4585 /* Else fall through to handle end of string */
4586 c = 0;
4587 }
4588 }
4589
4590 /* No auto callout for quantifiers, or while processing property strings that
4591 are substituted for \w etc in UCP mode. */
4592
4593 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
4594 {
4595 previous_callout = code;
4596 code = auto_callout(code, ptr, cd);
4597 }
4598
4599 switch(c)
4600 {
4601 /* ===================================================================*/
4602 case 0: /* The branch terminates at string end */
4603 case CHAR_VERTICAL_LINE: /* or | or ) */
4604 case CHAR_RIGHT_PARENTHESIS:
4605 *firstcharptr = firstchar;
4606 *firstcharflagsptr = firstcharflags;
4607 *reqcharptr = reqchar;
4608 *reqcharflagsptr = reqcharflags;
4609 *codeptr = code;
4610 *ptrptr = ptr;
4611 if (lengthptr != NULL)
4612 {
4613 if (OFLOW_MAX - *lengthptr < code - last_code)
4614 {
4615 *errorcodeptr = ERR20;
4616 goto FAILED;
4617 }
4618 *lengthptr += (int)(code - last_code); /* To include callout length */
4619 DPRINTF((">> end branch\n"));
4620 }
4621 return TRUE;
4622
4623
4624 /* ===================================================================*/
4625 /* Handle single-character metacharacters. In multiline mode, ^ disables
4626 the setting of any following char as a first character. */
4627
4628 case CHAR_CIRCUMFLEX_ACCENT:
4629 previous = NULL;
4630 if ((options & PCRE_MULTILINE) != 0)
4631 {
4632 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4633 *code++ = OP_CIRCM;
4634 }
4635 else *code++ = OP_CIRC;
4636 break;
4637
4638 case CHAR_DOLLAR_SIGN:
4639 previous = NULL;
4640 *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
4641 break;
4642
4643 /* There can never be a first char if '.' is first, whatever happens about
4644 repeats. The value of reqchar doesn't change either. */
4645
4646 case CHAR_DOT:
4647 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4648 zerofirstchar = firstchar;
4649 zerofirstcharflags = firstcharflags;
4650 zeroreqchar = reqchar;
4651 zeroreqcharflags = reqcharflags;
4652 previous = code;
4653 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
4654 break;
4655
4656
4657 /* ===================================================================*/
4658 /* Character classes. If the included characters are all < 256, we build a
4659 32-byte bitmap of the permitted characters, except in the special case
4660 where there is only one such character. For negated classes, we build the
4661 map as usual, then invert it at the end. However, we use a different opcode
4662 so that data characters > 255 can be handled correctly.
4663
4664 If the class contains characters outside the 0-255 range, a different
4665 opcode is compiled. It may optionally have a bit map for characters < 256,
4666 but those above are explicitly listed afterwards. A flag byte tells
4667 whether the bitmap is present, and whether this is a negated class or not.
4668
4669 In JavaScript compatibility mode, an isolated ']' causes an error. In
4670 default (Perl) mode, it is treated as a data character. */
4671
4672 case CHAR_RIGHT_SQUARE_BRACKET:
4673 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4674 {
4675 *errorcodeptr = ERR64;
4676 goto FAILED;
4677 }
4678 goto NORMAL_CHAR;
4679
4680 case CHAR_LEFT_SQUARE_BRACKET:
4681 previous = code;
4682
4683 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
4684 they are encountered at the top level, so we'll do that too. */
4685
4686 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4687 ptr[1] == CHAR_EQUALS_SIGN) &&
4688 check_posix_syntax(ptr, &tempptr))
4689 {
4690 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
4691 goto FAILED;
4692 }
4693
4694 /* If the first character is '^', set the negation flag and skip it. Also,
4695 if the first few characters (either before or after ^) are \Q\E or \E we
4696 skip them too. This makes for compatibility with Perl. */
4697
4698 negate_class = FALSE;
4699 for (;;)
4700 {
4701 c = *(++ptr);
4702 if (c == CHAR_BACKSLASH)
4703 {
4704 if (ptr[1] == CHAR_E)
4705 ptr++;
4706 else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4707 ptr += 3;
4708 else
4709 break;
4710 }
4711 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4712 negate_class = TRUE;
4713 else break;
4714 }
4715
4716 /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
4717 an initial ']' is taken as a data character -- the code below handles
4718 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
4719 [^] must match any character, so generate OP_ALLANY. */
4720
4721 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4722 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4723 {
4724 *code++ = negate_class? OP_ALLANY : OP_FAIL;
4725 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4726 zerofirstchar = firstchar;
4727 zerofirstcharflags = firstcharflags;
4728 break;
4729 }
4730
4731 /* If a class contains a negative special such as \S, we need to flip the
4732 negation flag at the end, so that support for characters > 255 works
4733 correctly (they are all included in the class). */
4734
4735 should_flip_negation = FALSE;
4736
4737 /* For optimization purposes, we track some properties of the class:
4738 class_has_8bitchar will be non-zero if the class contains at least one <
4739 256 character; class_one_char will be 1 if the class contains just one
4740 character. */
4741
4742 class_has_8bitchar = 0;
4743 class_one_char = 0;
4744
4745 /* Initialize the 32-char bit map to all zeros. We build the map in a
4746 temporary bit of memory, in case the class contains fewer than two
4747 8-bit characters because in that case the compiled code doesn't use the bit
4748 map. */
4749
4750 memset(classbits, 0, 32 * sizeof(pcre_uint8));
4751
4752 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4753 xclass = FALSE;
4754 class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
4755 class_uchardata_base = class_uchardata; /* Save the start */
4756 #endif
4757
4758 /* Process characters until ] is reached. By writing this as a "do" it
4759 means that an initial ] is taken as a data character. At the start of the
4760 loop, c contains the first byte of the character. */
4761
4762 if (c != CHAR_NULL) do
4763 {
4764 const pcre_uchar *oldptr;
4765
4766 #ifdef SUPPORT_UTF
4767 if (utf && HAS_EXTRALEN(c))
4768 { /* Braces are required because the */
4769 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
4770 }
4771 #endif
4772
4773 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4774 /* In the pre-compile phase, accumulate the length of any extra
4775 data and reset the pointer. This is so that very large classes that
4776 contain a zillion > 255 characters no longer overwrite the work space
4777 (which is on the stack). We have to remember that there was XCLASS data,
4778 however. */
4779
4780 if (lengthptr != NULL && class_uchardata > class_uchardata_base)
4781 {
4782 xclass = TRUE;
4783 *lengthptr += class_uchardata - class_uchardata_base;
4784 class_uchardata = class_uchardata_base;
4785 }
4786 #endif
4787
4788 /* Inside \Q...\E everything is literal except \E */
4789
4790 if (inescq)
4791 {
4792 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
4793 {
4794 inescq = FALSE; /* Reset literal state */
4795 ptr++; /* Skip the 'E' */
4796 continue; /* Carry on with next */
4797 }
4798 goto CHECK_RANGE; /* Could be range if \E follows */
4799 }
4800
4801 /* Handle POSIX class names. Perl allows a negation extension of the
4802 form [:^name:]. A square bracket that doesn't match the syntax is
4803 treated as a literal. We also recognize the POSIX constructions
4804 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
4805 5.6 and 5.8 do. */
4806
4807 if (c == CHAR_LEFT_SQUARE_BRACKET &&
4808 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4809 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
4810 {
4811 BOOL local_negate = FALSE;
4812 int posix_class, taboffset, tabopt;
4813 register const pcre_uint8 *cbits = cd->cbits;
4814 pcre_uint8 pbits[32];
4815
4816 if (ptr[1] != CHAR_COLON)
4817 {
4818 *errorcodeptr = ERR31;
4819 goto FAILED;
4820 }
4821
4822 ptr += 2;
4823 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
4824 {
4825 local_negate = TRUE;
4826 should_flip_negation = TRUE; /* Note negative special */
4827 ptr++;
4828 }
4829
4830 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
4831 if (posix_class < 0)
4832 {
4833 *errorcodeptr = ERR30;
4834 goto FAILED;
4835 }
4836
4837 /* If matching is caseless, upper and lower are converted to
4838 alpha. This relies on the fact that the class table starts with
4839 alpha, lower, upper as the first 3 entries. */
4840
4841 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
4842 posix_class = 0;
4843
4844 /* When PCRE_UCP is set, some of the POSIX classes are converted to
4845 different escape sequences that use Unicode properties \p or \P. Others
4846 that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
4847 directly. */
4848
4849 #ifdef SUPPORT_UCP
4850 if ((options & PCRE_UCP) != 0)
4851 {
4852 unsigned int ptype = 0;
4853 int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
4854
4855 /* The posix_substitutes table specifies which POSIX classes can be
4856 converted to \p or \P items. */
4857
4858 if (posix_substitutes[pc] != NULL)
4859 {
4860 nestptr = tempptr + 1;
4861 ptr = posix_substitutes[pc] - 1;
4862 continue;
4863 }
4864
4865 /* There are three other classes that generate special property calls
4866 that are recognized only in an XCLASS. */
4867
4868 else switch(posix_class)
4869 {
4870 case PC_GRAPH:
4871 ptype = PT_PXGRAPH;
4872 /* Fall through */
4873 case PC_PRINT:
4874 if (ptype == 0) ptype = PT_PXPRINT;
4875 /* Fall through */
4876 case PC_PUNCT:
4877 if (ptype == 0) ptype = PT_PXPUNCT;
4878 *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
4879 *class_uchardata++ = ptype;
4880 *class_uchardata++ = 0;
4881 ptr = tempptr + 1;
4882 continue;
4883
4884 /* For all other POSIX classes, no special action is taken in UCP
4885 mode. Fall through to the non_UCP case. */
4886
4887 default:
4888 break;
4889 }
4890 }
4891 #endif
4892 /* In the non-UCP case, or when UCP makes no difference, we build the
4893 bit map for the POSIX class in a chunk of local store because we may be
4894 adding and subtracting from it, and we don't want to subtract bits that
4895 may be in the main map already. At the end we or the result into the
4896 bit map that is being built. */
4897
4898 posix_class *= 3;
4899
4900 /* Copy in the first table (always present) */
4901
4902 memcpy(pbits, cbits + posix_class_maps[posix_class],
4903 32 * sizeof(pcre_uint8));
4904
4905 /* If there is a second table, add or remove it as required. */
4906
4907 taboffset = posix_class_maps[posix_class + 1];
4908 tabopt = posix_class_maps[posix_class + 2];
4909
4910 if (taboffset >= 0)
4911 {
4912 if (tabopt >= 0)
4913 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
4914 else
4915 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
4916 }
4917
4918 /* Now see if we need to remove any special characters. An option
4919 value of 1 removes vertical space and 2 removes underscore. */
4920
4921 if (tabopt < 0) tabopt = -tabopt;
4922 if (tabopt == 1) pbits[1] &= ~0x3c;
4923 else if (tabopt == 2) pbits[11] &= 0x7f;
4924
4925 /* Add the POSIX table or its complement into the main table that is
4926 being built and we are done. */
4927
4928 if (local_negate)
4929 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
4930 else
4931 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
4932
4933 ptr = tempptr + 1;
4934 /* Every class contains at least one < 256 character. */
4935 class_has_8bitchar = 1;
4936 /* Every class contains at least two characters. */
4937 class_one_char = 2;
4938 continue; /* End of POSIX syntax handling */
4939 }
4940
4941 /* Backslash may introduce a single character, or it may introduce one
4942 of the specials, which just set a flag. The sequence \b is a special
4943 case. Inside a class (and only there) it is treated as backspace. We
4944 assume that other escapes have more than one character in them, so
4945 speculatively set both class_has_8bitchar and class_one_char bigger
4946 than one. Unrecognized escapes fall through and are either treated
4947 as literal characters (by default), or are faulted if
4948 PCRE_EXTRA is set. */
4949
4950 if (c == CHAR_BACKSLASH)
4951 {
4952 escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
4953 TRUE);
4954 if (*errorcodeptr != 0) goto FAILED;
4955 if (escape == 0) c = ec;
4956 else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
4957 else if (escape == ESC_N) /* \N is not supported in a class */
4958 {
4959 *errorcodeptr = ERR71;
4960 goto FAILED;
4961 }
4962 else if (escape == ESC_Q) /* Handle start of quoted string */
4963 {
4964 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4965 {
4966 ptr += 2; /* avoid empty string */
4967 }
4968 else inescq = TRUE;
4969 continue;
4970 }
4971 else if (escape == ESC_E) continue; /* Ignore orphan \E */
4972
4973 else
4974 {
4975 register const pcre_uint8 *cbits = cd->cbits;
4976 /* Every class contains at least two < 256 characters. */
4977 class_has_8bitchar++;
4978 /* Every class contains at least two characters. */
4979 class_one_char += 2;
4980
4981 switch (escape)
4982 {
4983 #ifdef SUPPORT_UCP
4984 case ESC_du: /* These are the values given for \d etc */
4985 case ESC_DU: /* when PCRE_UCP is set. We replace the */
4986 case ESC_wu: /* escape sequence with an appropriate \p */
4987 case ESC_WU: /* or \P to test Unicode properties instead */
4988 case ESC_su: /* of the default ASCII testing. */
4989 case ESC_SU:
4990 nestptr = ptr;
4991 ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */
4992 class_has_8bitchar--; /* Undo! */
4993 continue;
4994 #endif
4995 case ESC_d:
4996 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
4997 continue;
4998
4999 case ESC_D:
5000 should_flip_negation = TRUE;
5001 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
5002 continue;
5003
5004 case ESC_w:
5005 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
5006 continue;
5007
5008 case ESC_W:
5009 should_flip_negation = TRUE;
5010 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
5011 continue;
5012
5013 /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5014 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5015 previously set by something earlier in the character class.
5016 Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5017 we could just adjust the appropriate bit. From PCRE 8.34 we no
5018 longer treat \s and \S specially. */
5019
5020 case ESC_s:
5021 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
5022 continue;
5023
5024 case ESC_S:
5025 should_flip_negation = TRUE;
5026 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
5027 continue;
5028
5029 /* The rest apply in both UCP and non-UCP cases. */
5030
5031 case ESC_h:
5032 (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5033 PRIV(hspace_list), NOTACHAR);
5034 continue;
5035
5036 case ESC_H:
5037 (void)add_not_list_to_class(classbits, &class_uchardata, options,
5038 cd, PRIV(hspace_list));
5039 continue;
5040
5041 case ESC_v:
5042 (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5043 PRIV(vspace_list), NOTACHAR);
5044 continue;
5045
5046 case ESC_V:
5047 (void)add_not_list_to_class(classbits, &class_uchardata, options,
5048 cd, PRIV(vspace_list));
5049 continue;
5050
5051 #ifdef SUPPORT_UCP
5052 case ESC_p:
5053 case ESC_P:
5054 {
5055 BOOL negated;
5056 unsigned int ptype = 0, pdata = 0;
5057 if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
5058 goto FAILED;
5059 *class_uchardata++ = ((escape == ESC_p) != negated)?
5060 XCL_PROP : XCL_NOTPROP;
5061 *class_uchardata++ = ptype;
5062 *class_uchardata++ = pdata;
5063 class_has_8bitchar--; /* Undo! */
5064 continue;
5065 }
5066 #endif
5067 /* Unrecognized escapes are faulted if PCRE is running in its
5068 strict mode. By default, for compatibility with Perl, they are
5069 treated as literals. */
5070
5071 default:
5072 if ((options & PCRE_EXTRA) != 0)
5073 {
5074 *errorcodeptr = ERR7;
5075 goto FAILED;
5076 }
5077 class_has_8bitchar--; /* Undo the speculative increase. */
5078 class_one_char -= 2; /* Undo the speculative increase. */
5079 c = *ptr; /* Get the final character and fall through */
5080 break;
5081 }
5082 }
5083
5084 /* Fall through if the escape just defined a single character (c >= 0).
5085 This may be greater than 256. */
5086
5087 escape = 0;
5088
5089 } /* End of backslash handling */
5090
5091 /* A character may be followed by '-' to form a range. However, Perl does
5092 not permit ']' to be the end of the range. A '-' character at the end is
5093 treated as a literal. Perl ignores orphaned \E sequences entirely. The
5094 code for handling \Q and \E is messy. */
5095
5096 CHECK_RANGE:
5097 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5098 {
5099 inescq = FALSE;
5100 ptr += 2;
5101 }
5102 oldptr = ptr;
5103
5104 /* Remember if \r or \n were explicitly used */
5105
5106 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5107
5108 /* Check for range */
5109
5110 if (!inescq && ptr[1] == CHAR_MINUS)
5111 {
5112 pcre_uint32 d;
5113 ptr += 2;
5114 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
5115
5116 /* If we hit \Q (not followed by \E) at this point, go into escaped
5117 mode. */
5118
5119 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
5120 {
5121 ptr += 2;
5122 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
5123 { ptr += 2; continue; }
5124 inescq = TRUE;
5125 break;
5126 }
5127
5128 /* Minus (hyphen) at the end of a class is treated as a literal, so put
5129 back the pointer and jump to handle the character that preceded it. */
5130
5131 if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
5132 {
5133 ptr = oldptr;
5134 goto CLASS_SINGLE_CHARACTER;
5135 }
5136
5137 /* Otherwise, we have a potential range; pick up the next character */
5138
5139 #ifdef SUPPORT_UTF
5140 if (utf)
5141 { /* Braces are required because the */
5142 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
5143 }
5144 else
5145 #endif
5146 d = *ptr; /* Not UTF-8 mode */
5147
5148 /* The second part of a range can be a single-character escape
5149 sequence, but not any of the other escapes. Perl treats a hyphen as a
5150 literal in such circumstances. However, in Perl's warning mode, a
5151 warning is given, so PCRE now faults it as it is almost certainly a
5152 mistake on the user's part. */
5153
5154 if (!inescq)
5155 {
5156 if (d == CHAR_BACKSLASH)
5157 {
5158 int descape;
5159 descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
5160 if (*errorcodeptr != 0) goto FAILED;
5161
5162 /* 0 means a character was put into d; \b is backspace; any other
5163 special causes an error. */
5164
5165 if (descape != 0)
5166 {
5167 if (descape == ESC_b) d = CHAR_BS; else
5168 {
5169 *errorcodeptr = ERR83;
5170 goto FAILED;
5171 }
5172 }
5173 }
5174
5175 /* A hyphen followed by a POSIX class is treated in the same way. */
5176
5177 else if (d == CHAR_LEFT_SQUARE_BRACKET &&
5178 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
5179 ptr[1] == CHAR_EQUALS_SIGN) &&
5180 check_posix_syntax(ptr, &tempptr))
5181 {
5182 *errorcodeptr = ERR83;
5183 goto FAILED;
5184 }
5185 }
5186
5187 /* Check that the two values are in the correct order. Optimize
5188 one-character ranges. */
5189
5190 if (d < c)
5191 {
5192 *errorcodeptr = ERR8;
5193 goto FAILED;
5194 }
5195 if (d == c) goto CLASS_SINGLE_CHARACTER; /* A few lines below */
5196
5197 /* We have found a character range, so single character optimizations
5198 cannot be done anymore. Any value greater than 1 indicates that there
5199 is more than one character. */
5200
5201 class_one_char = 2;
5202
5203 /* Remember an explicit \r or \n, and add the range to the class. */
5204
5205 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5206
5207 class_has_8bitchar +=
5208 add_to_class(classbits, &class_uchardata, options, cd, c, d);
5209
5210 continue; /* Go get the next char in the class */
5211 }
5212
5213 /* Handle a single character - we can get here for a normal non-escape
5214 char, or after \ that introduces a single character or for an apparent
5215 range that isn't. Only the value 1 matters for class_one_char, so don't
5216 increase it if it is already 2 or more ... just in case there's a class
5217 with a zillion characters in it. */
5218
5219 CLASS_SINGLE_CHARACTER:
5220 if (class_one_char < 2) class_one_char++;
5221
5222 /* If class_one_char is 1, we have the first single character in the
5223 class, and there have been no prior ranges, or XCLASS items generated by
5224 escapes. If this is the final character in the class, we can optimize by
5225 turning the item into a 1-character OP_CHAR[I] if it's positive, or
5226 OP_NOT[I] if it's negative. In the positive case, it can cause firstchar
5227 to be set. Otherwise, there can be no first char if this item is first,
5228 whatever repeat count may follow. In the case of reqchar, save the
5229 previous value for reinstating. */
5230
5231 if (class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
5232 {
5233 ptr++;
5234 zeroreqchar = reqchar;
5235 zeroreqcharflags = reqcharflags;
5236
5237 if (negate_class)
5238 {
5239 #ifdef SUPPORT_UCP
5240 int d;
5241 #endif
5242 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5243 zerofirstchar = firstchar;
5244 zerofirstcharflags = firstcharflags;
5245
5246 /* For caseless UTF-8 mode when UCP support is available, check
5247 whether this character has more than one other case. If so, generate
5248 a special OP_NOTPROP item instead of OP_NOTI. */
5249
5250 #ifdef SUPPORT_UCP
5251 if (utf && (options & PCRE_CASELESS) != 0 &&
5252 (d = UCD_CASESET(c)) != 0)
5253 {
5254 *code++ = OP_NOTPROP;
5255 *code++ = PT_CLIST;
5256 *code++ = d;
5257 }
5258 else
5259 #endif
5260 /* Char has only one other case, or UCP not available */
5261
5262 {
5263 *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
5264 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5265 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5266 code += PRIV(ord2utf)(c, code);
5267 else
5268 #endif
5269 *code++ = c;
5270 }
5271
5272 /* We are finished with this character class */
5273
5274 goto END_CLASS;
5275 }
5276
5277 /* For a single, positive character, get the value into mcbuffer, and
5278 then we can handle this with the normal one-character code. */
5279
5280 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5281 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5282 mclength = PRIV(ord2utf)(c, mcbuffer);
5283 else
5284 #endif
5285 {
5286 mcbuffer[0] = c;
5287 mclength = 1;
5288 }
5289 goto ONE_CHAR;
5290 } /* End of 1-char optimization */
5291
5292 /* There is more than one character in the class, or an XCLASS item
5293 has been generated. Add this character to the class. */
5294
5295 class_has_8bitchar +=
5296 add_to_class(classbits, &class_uchardata, options, cd, c, c);
5297 }
5298
5299 /* Loop until ']' reached. This "while" is the end of the "do" far above.
5300 If we are at the end of an internal nested string, revert to the outer
5301 string. */
5302
5303 while (((c = *(++ptr)) != CHAR_NULL ||
5304 (nestptr != NULL &&
5305 (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
5306 (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
5307
5308 /* Check for missing terminating ']' */
5309
5310 if (c == CHAR_NULL)
5311 {
5312 *errorcodeptr = ERR6;
5313 goto FAILED;
5314 }
5315
5316 /* We will need an XCLASS if data has been placed in class_uchardata. In
5317 the second phase this is a sufficient test. However, in the pre-compile
5318 phase, class_uchardata gets emptied to prevent workspace overflow, so
5319 only if the very last character in the class needs XCLASS will it contain
5320 anything at this point. For this reason, xclass gets set TRUE above when
5321 class_uchardata is emptied, and that's why this code is the way it is here
5322 instead of just doing a test on class_uchardata below. */
5323
5324 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5325 if (class_uchardata > class_uchardata_base) xclass = TRUE;
5326 #endif
5327
5328 /* If this is the first thing in the branch, there can be no first char
5329 setting, whatever the repeat count. Any reqchar setting must remain
5330 unchanged after any kind of repeat. */
5331
5332 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5333 zerofirstchar = firstchar;
5334 zerofirstcharflags = firstcharflags;
5335 zeroreqchar = reqchar;
5336 zeroreqcharflags = reqcharflags;
5337
5338 /* If there are characters with values > 255, we have to compile an
5339 extended class, with its own opcode, unless there was a negated special
5340 such as \S in the class, and PCRE_UCP is not set, because in that case all
5341 characters > 255 are in the class, so any that were explicitly given as
5342 well can be ignored. If (when there are explicit characters > 255 that must
5343 be listed) there are no characters < 256, we can omit the bitmap in the
5344 actual compiled code. */
5345
5346 #ifdef SUPPORT_UTF
5347 if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
5348 #elif !defined COMPILE_PCRE8
5349 if (xclass && !should_flip_negation)
5350 #endif
5351 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5352 {
5353 *class_uchardata++ = XCL_END; /* Marks the end of extra data */
5354 *code++ = OP_XCLASS;
5355 code += LINK_SIZE;
5356 *code = negate_class? XCL_NOT:0;
5357
5358 /* If the map is required, move up the extra data to make room for it;
5359 otherwise just move the code pointer to the end of the extra data. */
5360
5361 if (class_has_8bitchar > 0)
5362 {
5363 *code++ |= XCL_MAP;
5364 memmove(code + (32 / sizeof(pcre_uchar)), code,
5365 IN_UCHARS(class_uchardata - code));
5366 memcpy(code, classbits, 32);
5367 code = class_uchardata + (32 / sizeof(pcre_uchar));
5368 }
5369 else code = class_uchardata;
5370
5371 /* Now fill in the complete length of the item */
5372
5373 PUT(previous, 1, (int)(code - previous));
5374 break; /* End of class handling */
5375 }
5376 #endif
5377
5378 /* If there are no characters > 255, or they are all to be included or
5379 excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5380 whole class was negated and whether there were negative specials such as \S
5381 (non-UCP) in the class. Then copy the 32-byte map into the code vector,
5382 negating it if necessary. */
5383
5384 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5385 if (lengthptr == NULL) /* Save time in the pre-compile phase */
5386 {
5387 if (negate_class)
5388 for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5389 memcpy(code, classbits, 32);
5390 }
5391 code += 32 / sizeof(pcre_uchar);
5392
5393 END_CLASS:
5394 break;
5395
5396
5397 /* ===================================================================*/
5398 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
5399 has been tested above. */
5400
5401 case CHAR_LEFT_CURLY_BRACKET:
5402 if (!is_quantifier) goto NORMAL_CHAR;
5403 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
5404 if (*errorcodeptr != 0) goto FAILED;
5405 goto REPEAT;
5406
5407 case CHAR_ASTERISK:
5408 repeat_min = 0;
5409 repeat_max = -1;
5410 goto REPEAT;
5411
5412 case CHAR_PLUS:
5413 repeat_min = 1;
5414 repeat_max = -1;
5415 goto REPEAT;
5416
5417 case CHAR_QUESTION_MARK:
5418 repeat_min = 0;
5419 repeat_max = 1;
5420
5421 REPEAT:
5422 if (previous == NULL)
5423 {
5424 *errorcodeptr = ERR9;
5425 goto FAILED;
5426 }
5427
5428 if (repeat_min == 0)
5429 {
5430 firstchar = zerofirstchar; /* Adjust for zero repeat */
5431 firstcharflags = zerofirstcharflags;
5432 reqchar = zeroreqchar; /* Ditto */
5433 reqcharflags = zeroreqcharflags;
5434 }
5435
5436 /* Remember whether this is a variable length repeat */
5437
5438 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
5439
5440 op_type = 0; /* Default single-char op codes */
5441 possessive_quantifier = FALSE; /* Default not possessive quantifier */
5442
5443 /* Save start of previous item, in case we have to move it up in order to
5444 insert something before it. */
5445
5446 tempcode = previous;
5447
5448 /* If the next character is '+', we have a possessive quantifier. This
5449 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
5450 If the next character is '?' this is a minimizing repeat, by default,
5451 but if PCRE_UNGREEDY is set, it works the other way round. We change the
5452 repeat type to the non-default. */
5453
5454 if (ptr[1] == CHAR_PLUS)
5455 {
5456 repeat_type = 0; /* Force greedy */
5457 possessive_quantifier = TRUE;
5458 ptr++;
5459 }
5460 else if (ptr[1] == CHAR_QUESTION_MARK)
5461 {
5462 repeat_type = greedy_non_default;
5463 ptr++;
5464 }
5465 else repeat_type = greedy_default;
5466
5467 /* If previous was a recursion call, wrap it in atomic brackets so that
5468 previous becomes the atomic group. All recursions were so wrapped in the
5469 past, but it no longer happens for non-repeated recursions. In fact, the
5470 repeated ones could be re-implemented independently so as not to need this,
5471 but for the moment we rely on the code for repeating groups. */
5472
5473 if (*previous == OP_RECURSE)
5474 {
5475 memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
5476 *previous = OP_ONCE;
5477 PUT(previous, 1, 2 + 2*LINK_SIZE);
5478 previous[2 + 2*LINK_SIZE] = OP_KET;
5479 PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
5480 code += 2 + 2 * LINK_SIZE;
5481 length_prevgroup = 3 + 3*LINK_SIZE;
5482
5483 /* When actually compiling, we need to check whether this was a forward
5484 reference, and if so, adjust the offset. */
5485
5486 if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
5487 {
5488 int offset = GET(cd->hwm, -LINK_SIZE);
5489 if (offset == previous + 1 - cd->start_code)
5490 PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
5491 }
5492 }
5493
5494 /* Now handle repetition for the different types of item. */
5495
5496 /* If previous was a character or negated character match, abolish the item
5497 and generate a repeat item instead. If a char item has a minimum of more
5498 than one, ensure that it is set in reqchar - it might not be if a sequence
5499 such as x{3} is the first thing in a branch because the x will have gone
5500 into firstchar instead. */
5501
5502 if (*previous == OP_CHAR || *previous == OP_CHARI
5503 || *previous == OP_NOT || *previous == OP_NOTI)
5504 {
5505 switch (*previous)
5506 {
5507 default: /* Make compiler happy. */
5508 case OP_CHAR: op_type = OP_STAR - OP_STAR; break;
5509 case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
5510 case OP_NOT: op_type = OP_NOTSTAR - OP_STAR; break;
5511 case OP_NOTI: op_type = OP_NOTSTARI - OP_STAR; break;
5512 }
5513
5514 /* Deal with UTF characters that take up more than one character. It's
5515 easier to write this out separately than try to macrify it. Use c to
5516 hold the length of the character in bytes, plus UTF_LENGTH to flag that
5517 it's a length rather than a small character. */
5518
5519 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5520 if (utf && NOT_FIRSTCHAR(code[-1]))
5521 {
5522 pcre_uchar *lastchar = code - 1;
5523 BACKCHAR(lastchar);
5524 c = (int)(code - lastchar); /* Length of UTF-8 character */
5525 memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
5526 c |= UTF_LENGTH; /* Flag c as a length */
5527 }
5528 else
5529 #endif /* SUPPORT_UTF */
5530
5531 /* Handle the case of a single character - either with no UTF support, or
5532 with UTF disabled, or for a single character UTF character. */
5533 {
5534 c = code[-1];
5535 if (*previous <= OP_CHARI && repeat_min > 1)
5536 {
5537 reqchar = c;
5538 reqcharflags = req_caseopt | cd->req_varyopt;
5539 }
5540 }
5541
5542 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
5543 }
5544
5545 /* If previous was a character type match (\d or similar), abolish it and
5546 create a suitable repeat item. The code is shared with single-character
5547 repeats by setting op_type to add a suitable offset into repeat_type. Note
5548 that the Unicode property types will be present only when SUPPORT_UCP is
5549 defined, but we don't wrap the little bits of code here because it just
5550 makes it horribly messy. */
5551
5552 else if (*previous < OP_EODN)
5553 {
5554 pcre_uchar *oldcode;
5555 int prop_type, prop_value;
5556 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
5557 c = *previous;
5558
5559 OUTPUT_SINGLE_REPEAT:
5560 if (*previous == OP_PROP || *previous == OP_NOTPROP)
5561 {
5562 prop_type = previous[1];
5563 prop_value = previous[2];
5564 }
5565 else prop_type = prop_value = -1;
5566
5567 oldcode = code;
5568 code = previous; /* Usually overwrite previous item */
5569
5570 /* If the maximum is zero then the minimum must also be zero; Perl allows
5571 this case, so we do too - by simply omitting the item altogether. */
5572
5573 if (repeat_max == 0) goto END_REPEAT;
5574
5575 /* Combine the op_type with the repeat_type */
5576
5577 repeat_type += op_type;
5578
5579 /* A minimum of zero is handled either as the special case * or ?, or as
5580 an UPTO, with the maximum given. */
5581
5582 if (repeat_min == 0)
5583 {
5584 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
5585 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
5586 else
5587 {
5588 *code++ = OP_UPTO + repeat_type;
5589 PUT2INC(code, 0, repeat_max);
5590 }
5591 }
5592
5593 /* A repeat minimum of 1 is optimized into some special cases. If the
5594 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
5595 left in place and, if the maximum is greater than 1, we use OP_UPTO with
5596 one less than the maximum. */
5597
5598 else if (repeat_min == 1)
5599 {
5600 if (repeat_max == -1)
5601 *code++ = OP_PLUS + repeat_type;
5602 else
5603 {
5604 code = oldcode; /* leave previous item in place */
5605 if (repeat_max == 1) goto END_REPEAT;
5606 *code++ = OP_UPTO + repeat_type;
5607 PUT2INC(code, 0, repeat_max - 1);
5608 }
5609 }
5610
5611 /* The case {n,n} is just an EXACT, while the general case {n,m} is
5612 handled as an EXACT followed by an UPTO. */
5613
5614 else
5615 {
5616 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
5617 PUT2INC(code, 0, repeat_min);
5618
5619 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
5620 we have to insert the character for the previous code. For a repeated
5621 Unicode property match, there are two extra bytes that define the
5622 required property. In UTF-8 mode, long characters have their length in
5623 c, with the UTF_LENGTH bit as a flag. */
5624
5625 if (repeat_max < 0)
5626 {
5627 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5628 if (utf && (c & UTF_LENGTH) != 0)
5629 {
5630 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5631 code += c & 7;
5632 }
5633 else
5634 #endif
5635 {
5636 *code++ = c;
5637 if (prop_type >= 0)
5638 {
5639 *code++ = prop_type;
5640 *code++ = prop_value;
5641 }
5642 }
5643 *code++ = OP_STAR + repeat_type;
5644 }
5645
5646 /* Else insert an UPTO if the max is greater than the min, again
5647 preceded by the character, for the previously inserted code. If the
5648 UPTO is just for 1 instance, we can use QUERY instead. */
5649
5650 else if (repeat_max != repeat_min)
5651 {
5652 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5653 if (utf && (c & UTF_LENGTH) != 0)
5654 {
5655 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5656 code += c & 7;
5657 }
5658 else
5659 #endif
5660 *code++ = c;
5661 if (prop_type >= 0)
5662 {
5663 *code++ = prop_type;
5664 *code++ = prop_value;
5665 }
5666 repeat_max -= repeat_min;
5667
5668 if (repeat_max == 1)
5669 {
5670 *code++ = OP_QUERY + repeat_type;
5671 }
5672 else
5673 {
5674 *code++ = OP_UPTO + repeat_type;
5675 PUT2INC(code, 0, repeat_max);
5676 }
5677 }
5678 }
5679
5680 /* The character or character type itself comes last in all cases. */
5681
5682 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5683 if (utf && (c & UTF_LENGTH) != 0)
5684 {
5685 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5686 code += c & 7;
5687 }
5688 else
5689 #endif
5690 *code++ = c;
5691
5692 /* For a repeated Unicode property match, there are two extra bytes that
5693 define the required property. */
5694
5695 #ifdef SUPPORT_UCP
5696 if (prop_type >= 0)
5697 {
5698 *code++ = prop_type;
5699 *code++ = prop_value;
5700 }
5701 #endif
5702 }
5703
5704 /* If previous was a character class or a back reference, we put the repeat
5705 stuff after it, but just skip the item if the repeat was {0,0}. */
5706
5707 else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
5708 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5709 *previous == OP_XCLASS ||
5710 #endif
5711 *previous == OP_REF || *previous == OP_REFI ||
5712 *previous == OP_DNREF || *previous == OP_DNREFI)
5713 {
5714 if (repeat_max == 0)
5715 {
5716 code = previous;
5717 goto END_REPEAT;
5718 }
5719
5720 if (repeat_min == 0 && repeat_max == -1)
5721 *code++ = OP_CRSTAR + repeat_type;
5722 else if (repeat_min == 1 && repeat_max == -1)
5723 *code++ = OP_CRPLUS + repeat_type;
5724 else if (repeat_min == 0 && repeat_max == 1)
5725 *code++ = OP_CRQUERY + repeat_type;
5726 else
5727 {
5728 *code++ = OP_CRRANGE + repeat_type;
5729 PUT2INC(code, 0, repeat_min);
5730 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
5731 PUT2INC(code, 0, repeat_max);
5732 }
5733 }
5734
5735 /* If previous was a bracket group, we may have to replicate it in certain
5736 cases. Note that at this point we can encounter only the "basic" bracket
5737 opcodes such as BRA and CBRA, as this is the place where they get converted
5738 into the more special varieties such as BRAPOS and SBRA. A test for >=
5739 OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5740 ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND.
5741 Originally, PCRE did not allow repetition of assertions, but now it does,
5742 for Perl compatibility. */
5743
5744 else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5745 {
5746 register int i;
5747 int len = (int)(code - previous);
5748 pcre_uchar *bralink = NULL;
5749 pcre_uchar *brazeroptr = NULL;
5750
5751 /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
5752 we just ignore the repeat. */
5753
5754 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
5755 goto END_REPEAT;
5756
5757 /* There is no sense in actually repeating assertions. The only potential
5758 use of repetition is in cases when the assertion is optional. Therefore,
5759 if the minimum is greater than zero, just ignore the repeat. If the
5760 maximum is not zero or one, set it to 1. */
5761
5762 if (*previous < OP_ONCE) /* Assertion */
5763 {
5764 if (repeat_min > 0) goto END_REPEAT;
5765 if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
5766 }
5767
5768 /* The case of a zero minimum is special because of the need to stick
5769 OP_BRAZERO in front of it, and because the group appears once in the
5770 data, whereas in other cases it appears the minimum number of times. For
5771 this reason, it is simplest to treat this case separately, as otherwise
5772 the code gets far too messy. There are several special subcases when the
5773 minimum is zero. */
5774
5775 if (repeat_min == 0)
5776 {
5777 /* If the maximum is also zero, we used to just omit the group from the
5778 output altogether, like this:
5779
5780 ** if (repeat_max == 0)
5781 ** {
5782 ** code = previous;
5783 ** goto END_REPEAT;
5784 ** }
5785
5786 However, that fails when a group or a subgroup within it is referenced
5787 as a subroutine from elsewhere in the pattern, so now we stick in
5788 OP_SKIPZERO in front of it so that it is skipped on execution. As we
5789 don't have a list of which groups are referenced, we cannot do this
5790 selectively.
5791
5792 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
5793 and do no more at this point. However, we do need to adjust any
5794 OP_RECURSE calls inside the group that refer to the group itself or any
5795 internal or forward referenced group, because the offset is from the
5796 start of the whole regex. Temporarily terminate the pattern while doing
5797 this. */
5798
5799 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
5800 {
5801 *code = OP_END;
5802 adjust_recurse(previous, 1, utf, cd, save_hwm);
5803 memmove(previous + 1, previous, IN_UCHARS(len));
5804 code++;
5805 if (repeat_max == 0)
5806 {
5807 *previous++ = OP_SKIPZERO;
5808 goto END_REPEAT;
5809 }
5810 brazeroptr = previous; /* Save for possessive optimizing */
5811 *previous++ = OP_BRAZERO + repeat_type;
5812 }
5813
5814 /* If the maximum is greater than 1 and limited, we have to replicate
5815 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
5816 The first one has to be handled carefully because it's the original
5817 copy, which has to be moved up. The remainder can be handled by code
5818 that is common with the non-zero minimum case below. We have to
5819 adjust the value of repeat_max, since one less copy is required. Once
5820 again, we may have to adjust any OP_RECURSE calls inside the group. */
5821
5822 else
5823 {
5824 int offset;
5825 *code = OP_END;
5826 adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);
5827 memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
5828 code += 2 + LINK_SIZE;
5829 *previous++ = OP_BRAZERO + repeat_type;
5830 *previous++ = OP_BRA;
5831
5832 /* We chain together the bracket offset fields that have to be
5833 filled in later when the ends of the brackets are reached. */
5834
5835 offset = (bralink == NULL)? 0 : (int)(previous - bralink);
5836 bralink = previous;
5837 PUTINC(previous, 0, offset);
5838 }
5839
5840 repeat_max--;
5841 }
5842
5843 /* If the minimum is greater than zero, replicate the group as many
5844 times as necessary, and adjust the maximum to the number of subsequent
5845 copies that we need. If we set a first char from the group, and didn't
5846 set a required char, copy the latter from the former. If there are any
5847 forward reference subroutine calls in the group, there will be entries on
5848 the workspace list; replicate these with an appropriate increment. */
5849
5850 else
5851 {
5852 if (repeat_min > 1)
5853 {
5854 /* In the pre-compile phase, we don't actually do the replication. We
5855 just adjust the length as if we had. Do some paranoid checks for
5856 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
5857 integer type when available, otherwise double. */
5858
5859 if (lengthptr != NULL)
5860 {
5861 int delta = (repeat_min - 1)*length_prevgroup;
5862 if ((INT64_OR_DOUBLE)(repeat_min - 1)*
5863 (INT64_OR_DOUBLE)length_prevgroup >
5864 (INT64_OR_DOUBLE)INT_MAX ||
5865 OFLOW_MAX - *lengthptr < delta)
5866 {
5867 *errorcodeptr = ERR20;
5868 goto FAILED;
5869 }
5870 *lengthptr += delta;
5871 }
5872
5873 /* This is compiling for real. If there is a set first byte for
5874 the group, and we have not yet set a "required byte", set it. Make
5875 sure there is enough workspace for copying forward references before
5876 doing the copy. */
5877
5878 else
5879 {
5880 if (groupsetfirstchar && reqcharflags < 0)
5881 {
5882 reqchar = firstchar;
5883 reqcharflags = firstcharflags;
5884 }
5885
5886 for (i = 1; i < repeat_min; i++)
5887 {
5888 pcre_uchar *hc;
5889 pcre_uchar *this_hwm = cd->hwm;
5890 memcpy(code, previous, IN_UCHARS(len));
5891
5892 while (cd->hwm > cd->start_workspace + cd->workspace_size -
5893 WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5894 {
5895 int save_offset = save_hwm - cd->start_workspace;
5896 int this_offset = this_hwm - cd->start_workspace;
5897 *errorcodeptr = expand_workspace(cd);
5898 if (*errorcodeptr != 0) goto FAILED;
5899 save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5900 this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5901 }
5902
5903 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5904 {
5905 PUT(cd->hwm, 0, GET(hc, 0) + len);
5906 cd->hwm += LINK_SIZE;
5907 }
5908 save_hwm = this_hwm;
5909 code += len;
5910 }
5911 }
5912 }
5913
5914 if (repeat_max > 0) repeat_max -= repeat_min;
5915 }
5916
5917 /* This code is common to both the zero and non-zero minimum cases. If
5918 the maximum is limited, it replicates the group in a nested fashion,
5919 remembering the bracket starts on a stack. In the case of a zero minimum,
5920 the first one was set up above. In all cases the repeat_max now specifies
5921 the number of additional copies needed. Again, we must remember to
5922 replicate entries on the forward reference list. */
5923
5924 if (repeat_max >= 0)
5925 {
5926 /* In the pre-compile phase, we don't actually do the replication. We
5927 just adjust the length as if we had. For each repetition we must add 1
5928 to the length for BRAZERO and for all but the last repetition we must
5929 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
5930 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
5931 a 64-bit integer type when available, otherwise double. */
5932
5933 if (lengthptr != NULL && repeat_max > 0)
5934 {
5935 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
5936 2 - 2*LINK_SIZE; /* Last one doesn't nest */
5937 if ((INT64_OR_DOUBLE)repeat_max *
5938 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
5939 > (INT64_OR_DOUBLE)INT_MAX ||
5940 OFLOW_MAX - *lengthptr < delta)
5941 {
5942 *errorcodeptr = ERR20;
5943 goto FAILED;
5944 }
5945 *lengthptr += delta;
5946 }
5947
5948 /* This is compiling for real */
5949
5950 else for (i = repeat_max - 1; i >= 0; i--)
5951 {
5952 pcre_uchar *hc;
5953 pcre_uchar *this_hwm = cd->hwm;
5954
5955 *code++ = OP_BRAZERO + repeat_type;
5956
5957 /* All but the final copy start a new nesting, maintaining the
5958 chain of brackets outstanding. */
5959
5960 if (i != 0)
5961 {
5962 int offset;
5963 *code++ = OP_BRA;
5964 offset = (bralink == NULL)? 0 : (int)(code - bralink);
5965 bralink = code;
5966 PUTINC(code, 0, offset);
5967 }
5968
5969 memcpy(code, previous, IN_UCHARS(len));
5970
5971 /* Ensure there is enough workspace for forward references before
5972 copying them. */
5973
5974 while (cd->hwm > cd->start_workspace + cd->workspace_size -
5975 WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5976 {
5977 int save_offset = save_hwm - cd->start_workspace;
5978 int this_offset = this_hwm - cd->start_workspace;
5979 *errorcodeptr = expand_workspace(cd);
5980 if (*errorcodeptr != 0) goto FAILED;
5981 save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5982 this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5983 }
5984
5985 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5986 {
5987 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
5988 cd->hwm += LINK_SIZE;
5989 }
5990 save_hwm = this_hwm;
5991 code += len;
5992 }
5993
5994 /* Now chain through the pending brackets, and fill in their length
5995 fields (which are holding the chain links pro tem). */
5996
5997 while (bralink != NULL)
5998 {
5999 int oldlinkoffset;
6000 int offset = (int)(code - bralink + 1);
6001 pcre_uchar *bra = code - offset;
6002 oldlinkoffset = GET(bra, 1);
6003 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
6004 *code++ = OP_KET;
6005 PUTINC(code, 0, offset);
6006 PUT(bra, 1, offset);
6007 }
6008 }
6009
6010 /* If the maximum is unlimited, set a repeater in the final copy. For
6011 ONCE brackets, that's all we need to do. However, possessively repeated
6012 ONCE brackets can be converted into non-capturing brackets, as the
6013 behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
6014 deal with possessive ONCEs specially.
6015
6016 Otherwise, when we are doing the actual compile phase, check to see
6017 whether this group is one that could match an empty string. If so,
6018 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
6019 that runtime checking can be done. [This check is also applied to ONCE
6020 groups at runtime, but in a different way.]
6021
6022 Then, if the quantifier was possessive and the bracket is not a
6023 conditional, we convert the BRA code to the POS form, and the KET code to
6024 KETRPOS. (It turns out to be convenient at runtime to detect this kind of
6025 subpattern at both the start and at the end.) The use of special opcodes
6026 makes it possible to reduce greatly the stack usage in pcre_exec(). If
6027 the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
6028
6029 Then, if the minimum number of matches is 1 or 0, cancel the possessive
6030 flag so that the default action below, of wrapping everything inside
6031 atomic brackets, does not happen. When the minimum is greater than 1,
6032 there will be earlier copies of the group, and so we still have to wrap
6033 the whole thing. */
6034
6035 else
6036 {
6037 pcre_uchar *ketcode = code - 1 - LINK_SIZE;
6038 pcre_uchar *bracode = ketcode - GET(ketcode, 1);
6039
6040 /* Convert possessive ONCE brackets to non-capturing */
6041
6042 if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
6043 possessive_quantifier) *bracode = OP_BRA;
6044
6045 /* For non-possessive ONCE brackets, all we need to do is to
6046 set the KET. */
6047
6048 if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
6049 *ketcode = OP_KETRMAX + repeat_type;
6050
6051 /* Handle non-ONCE brackets and possessive ONCEs (which have been
6052 converted to non-capturing above). */
6053
6054 else
6055 {
6056 /* In the compile phase, check for empty string matching. */
6057
6058 if (lengthptr == NULL)
6059 {
6060 pcre_uchar *scode = bracode;
6061 do
6062 {
6063 if (could_be_empty_branch(scode, ketcode, utf, cd, NULL))
6064 {
6065 *bracode += OP_SBRA - OP_BRA;
6066 break;
6067 }
6068 scode += GET(scode, 1);
6069 }
6070 while (*scode == OP_ALT);
6071 }
6072
6073 /* Handle possessive quantifiers. */
6074
6075 if (possessive_quantifier)
6076 {
6077 /* For COND brackets, we wrap the whole thing in a possessively
6078 repeated non-capturing bracket, because we have not invented POS
6079 versions of the COND opcodes. Because we are moving code along, we
6080 must ensure that any pending recursive references are updated. */
6081
6082 if (*bracode == OP_COND || *bracode == OP_SCOND)
6083 {
6084 int nlen = (int)(code - bracode);
6085 *code = OP_END;
6086 adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm);
6087 memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
6088 code += 1 + LINK_SIZE;
6089 nlen += 1 + LINK_SIZE;
6090 *bracode = OP_BRAPOS;
6091 *code++ = OP_KETRPOS;
6092 PUTINC(code, 0, nlen);
6093 PUT(bracode, 1, nlen);
6094 }
6095
6096 /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
6097
6098 else
6099 {
6100 *bracode += 1; /* Switch to xxxPOS opcodes */
6101 *ketcode = OP_KETRPOS;
6102 }
6103
6104 /* If the minimum is zero, mark it as possessive, then unset the
6105 possessive flag when the minimum is 0 or 1. */
6106
6107 if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
6108 if (repeat_min < 2) possessive_quantifier = FALSE;
6109 }
6110
6111 /* Non-possessive quantifier */
6112
6113 else *ketcode = OP_KETRMAX + repeat_type;
6114 }
6115 }
6116 }
6117
6118 /* If previous is OP_FAIL, it was generated by an empty class [] in
6119 JavaScript mode. The other ways in which OP_FAIL can be generated, that is
6120 by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
6121 error above. We can just ignore the repeat in JS case. */
6122
6123 else if (*previous == OP_FAIL) goto END_REPEAT;
6124
6125 /* Else there's some kind of shambles */
6126
6127 else
6128 {
6129 *errorcodeptr = ERR11;
6130 goto FAILED;
6131 }
6132
6133 /* If the character following a repeat is '+', possessive_quantifier is
6134 TRUE. For some opcodes, there are special alternative opcodes for this
6135 case. For anything else, we wrap the entire repeated item inside OP_ONCE
6136 brackets. Logically, the '+' notation is just syntactic sugar, taken from
6137 Sun's Java package, but the special opcodes can optimize it.
6138
6139 Some (but not all) possessively repeated subpatterns have already been
6140 completely handled in the code just above. For them, possessive_quantifier
6141 is always FALSE at this stage. Note that the repeated item starts at
6142 tempcode, not at previous, which might be the first part of a string whose
6143 (former) last char we repeated. */
6144
6145 if (possessive_quantifier)
6146 {
6147 int len;
6148
6149 /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
6150 However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
6151 {5,}, or {5,10}). We skip over an EXACT item; if the length of what
6152 remains is greater than zero, there's a further opcode that can be
6153 handled. If not, do nothing, leaving the EXACT alone. */
6154
6155 switch(*tempcode)
6156 {
6157 case OP_TYPEEXACT:
6158 tempcode += PRIV(OP_lengths)[*tempcode] +
6159 ((tempcode[1 + IMM2_SIZE] == OP_PROP
6160 || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
6161 break;
6162
6163 /* CHAR opcodes are used for exacts whose count is 1. */
6164
6165 case OP_CHAR:
6166 case OP_CHARI:
6167 case OP_NOT:
6168 case OP_NOTI:
6169 case OP_EXACT:
6170 case OP_EXACTI:
6171 case OP_NOTEXACT:
6172 case OP_NOTEXACTI:
6173 tempcode += PRIV(OP_lengths)[*tempcode];
6174 #ifdef SUPPORT_UTF
6175 if (utf && HAS_EXTRALEN(tempcode[-1]))
6176 tempcode += GET_EXTRALEN(tempcode[-1]);
6177 #endif
6178 break;
6179
6180 /* For the class opcodes, the repeat operator appears at the end;
6181 adjust tempcode to point to it. */
6182
6183 case OP_CLASS:
6184 case OP_NCLASS:
6185 tempcode += 1 + 32/sizeof(pcre_uchar);
6186 break;
6187
6188 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6189 case OP_XCLASS:
6190 tempcode += GET(tempcode, 1);
6191 break;
6192 #endif
6193 }
6194
6195 /* If tempcode is equal to code (which points to the end of the repeated
6196 item), it means we have skipped an EXACT item but there is no following
6197 QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
6198 all other cases, tempcode will be pointing to the repeat opcode, and will
6199 be less than code, so the value of len will be greater than 0. */
6200
6201 len = (int)(code - tempcode);
6202 if (len > 0)
6203 {
6204 unsigned int repcode = *tempcode;
6205
6206 /* There is a table for possessifying opcodes, all of which are less
6207 than OP_CALLOUT. A zero entry means there is no possessified version.
6208 */
6209
6210 if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
6211 *tempcode = opcode_possessify[repcode];
6212
6213 /* For opcodes without a special possessified version, wrap the item in
6214 ONCE brackets. Because we are moving code along, we must ensure that any
6215 pending recursive references are updated. */
6216
6217 else
6218 {
6219 *code = OP_END;
6220 adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
6221 memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6222 code += 1 + LINK_SIZE;
6223 len += 1 + LINK_SIZE;
6224 tempcode[0] = OP_ONCE;
6225 *code++ = OP_KET;
6226 PUTINC(code, 0, len);
6227 PUT(tempcode, 1, len);
6228 }
6229 }
6230
6231 #ifdef NEVER
6232 if (len > 0) switch (*tempcode)
6233 {
6234 case OP_STAR: *tempcode = OP_POSSTAR; break;
6235 case OP_PLUS: *tempcode = OP_POSPLUS; break;
6236 case OP_QUERY: *tempcode = OP_POSQUERY; break;
6237 case OP_UPTO: *tempcode = OP_POSUPTO; break;
6238
6239 case OP_STARI: *tempcode = OP_POSSTARI; break;
6240 case OP_PLUSI: *tempcode = OP_POSPLUSI; break;
6241 case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
6242 case OP_UPTOI: *tempcode = OP_POSUPTOI; break;
6243
6244 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
6245 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
6246 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
6247 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
6248
6249 case OP_NOTSTARI: *tempcode = OP_NOTPOSSTARI; break;
6250 case OP_NOTPLUSI: *tempcode = OP_NOTPOSPLUSI; break;
6251 case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
6252 case OP_NOTUPTOI: *tempcode = OP_NOTPOSUPTOI; break;
6253
6254 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
6255 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
6256 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
6257 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
6258
6259 case OP_CRSTAR: *tempcode = OP_CRPOSSTAR; break;
6260 case OP_CRPLUS: *tempcode = OP_CRPOSPLUS; break;
6261 case OP_CRQUERY: *tempcode = OP_CRPOSQUERY; break;
6262 case OP_CRRANGE: *tempcode = OP_CRPOSRANGE; break;
6263
6264 /* Because we are moving code along, we must ensure that any
6265 pending recursive references are updated. */
6266
6267 default:
6268 *code = OP_END;
6269 adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
6270 memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6271 code += 1 + LINK_SIZE;
6272 len += 1 + LINK_SIZE;
6273 tempcode[0] = OP_ONCE;
6274 *code++ = OP_KET;
6275 PUTINC(code, 0, len);
6276 PUT(tempcode, 1, len);
6277 break;
6278 }
6279 #endif
6280 }
6281
6282 /* In all cases we no longer have a previous item. We also set the
6283 "follows varying string" flag for subsequently encountered reqchars if
6284 it isn't already set and we have just passed a varying length item. */
6285
6286 END_REPEAT:
6287 previous = NULL;
6288 cd->req_varyopt |= reqvary;
6289 break;
6290
6291
6292 /* ===================================================================*/
6293 /* Start of nested parenthesized sub-expression, or comment or lookahead or
6294 lookbehind or option setting or condition or all the other extended
6295 parenthesis forms. */
6296
6297 case CHAR_LEFT_PARENTHESIS:
6298 newoptions = options;
6299 skipbytes = 0;
6300 bravalue = OP_CBRA;
6301 save_hwm = cd->hwm;
6302 reset_bracount = FALSE;
6303
6304 /* First deal with various "verbs" that can be introduced by '*'. */
6305
6306 ptr++;
6307 if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
6308 || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
6309 {
6310 int i, namelen;
6311 int arglen = 0;
6312 const char *vn = verbnames;
6313 const pcre_uchar *name = ptr + 1;
6314 const pcre_uchar *arg = NULL;
6315 previous = NULL;
6316 ptr++;
6317 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
6318 namelen = (int)(ptr - name);
6319
6320 /* It appears that Perl allows any characters whatsoever, other than
6321 a closing parenthesis, to appear in arguments, so we no longer insist on
6322 letters, digits, and underscores. */
6323
6324 if (*ptr == CHAR_COLON)
6325 {
6326 arg = ++ptr;
6327 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6328 arglen = (int)(ptr - arg);
6329 if ((unsigned int)arglen > MAX_MARK)
6330 {
6331 *errorcodeptr = ERR75;
6332 goto FAILED;
6333 }
6334 }
6335
6336 if (*ptr != CHAR_RIGHT_PARENTHESIS)
6337 {
6338 *errorcodeptr = ERR60;
6339 goto FAILED;
6340 }
6341
6342 /* Scan the table of verb names */
6343
6344 for (i = 0; i < verbcount; i++)
6345 {
6346 if (namelen == verbs[i].len &&
6347 STRNCMP_UC_C8(name, vn, namelen) == 0)
6348 {
6349 int setverb;
6350
6351 /* Check for open captures before ACCEPT and convert it to
6352 ASSERT_ACCEPT if in an assertion. */
6353
6354 if (verbs[i].op == OP_ACCEPT)
6355 {
6356 open_capitem *oc;
6357 if (arglen != 0)
6358 {
6359 *errorcodeptr = ERR59;
6360 goto FAILED;
6361 }
6362 cd->had_accept = TRUE;
6363 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6364 {
6365 *code++ = OP_CLOSE;
6366 PUT2INC(code, 0, oc->number);
6367 }
6368 setverb = *code++ =
6369 (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6370
6371 /* Do not set firstchar after *ACCEPT */
6372 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
6373 }
6374
6375 /* Handle other cases with/without an argument */
6376
6377 else if (arglen == 0)
6378 {
6379 if (verbs[i].op < 0) /* Argument is mandatory */
6380 {
6381 *errorcodeptr = ERR66;
6382 goto FAILED;
6383 }
6384 setverb = *code++ = verbs[i].op;
6385 }
6386
6387 else
6388 {
6389 if (verbs[i].op_arg < 0) /* Argument is forbidden */
6390 {
6391 *errorcodeptr = ERR59;
6392 goto FAILED;
6393 }
6394 setverb = *code++ = verbs[i].op_arg;
6395 *code++ = arglen;
6396 memcpy(code, arg, IN_UCHARS(arglen));
6397 code += arglen;
6398 *code++ = 0;
6399 }
6400
6401 switch (setverb)
6402 {
6403 case OP_THEN:
6404 case OP_THEN_ARG:
6405 cd->external_flags |= PCRE_HASTHEN;
6406 break;
6407
6408 case OP_PRUNE:
6409 case OP_PRUNE_ARG:
6410 case OP_SKIP:
6411 case OP_SKIP_ARG:
6412 cd->had_pruneorskip = TRUE;
6413 break;
6414 }
6415
6416 break; /* Found verb, exit loop */
6417 }
6418
6419 vn += verbs[i].len + 1;
6420 }
6421
6422 if (i < verbcount) continue; /* Successfully handled a verb */
6423 *errorcodeptr = ERR60; /* Verb not recognized */
6424 goto FAILED;
6425 }
6426
6427 /* Deal with the extended parentheses; all are introduced by '?', and the
6428 appearance of any of them means that this is not a capturing group. */
6429
6430 else if (*ptr == CHAR_QUESTION_MARK)
6431 {
6432 int i, set, unset, namelen;
6433 int *optset;
6434 const pcre_uchar *name;
6435 pcre_uchar *slot;
6436
6437 switch (*(++ptr))
6438 {
6439 case CHAR_NUMBER_SIGN: /* Comment; skip to ket */
6440 ptr++;
6441 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6442 if (*ptr == CHAR_NULL)
6443 {
6444 *errorcodeptr = ERR18;
6445 goto FAILED;
6446 }
6447 continue;
6448
6449
6450 /* ------------------------------------------------------------ */
6451 case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
6452 reset_bracount = TRUE;
6453 /* Fall through */
6454
6455 /* ------------------------------------------------------------ */
6456 case CHAR_COLON: /* Non-capturing bracket */
6457 bravalue = OP_BRA;
6458 ptr++;
6459 break;
6460
6461
6462 /* ------------------------------------------------------------ */
6463 case CHAR_LEFT_PARENTHESIS:
6464 bravalue = OP_COND; /* Conditional group */
6465 tempptr = ptr;
6466
6467 /* A condition can be an assertion, a number (referring to a numbered
6468 group), a name (referring to a named group), or 'R', referring to
6469 recursion. R<digits> and R&name are also permitted for recursion tests.
6470
6471 There are several syntaxes for testing a named group: (?(name)) is used
6472 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
6473
6474 There are two unfortunate ambiguities, caused by history. (a) 'R' can
6475 be the recursive thing or the name 'R' (and similarly for 'R' followed
6476 by digits), and (b) a number could be a name that consists of digits.
6477 In both cases, we look for a name first; if not found, we try the other
6478 cases.
6479
6480 For compatibility with auto-callouts, we allow a callout to be
6481 specified before a condition that is an assertion. First, check for the
6482 syntax of a callout; if found, adjust the temporary pointer that is
6483 used to check for an assertion condition. That's all that is needed! */
6484
6485 if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
6486 {
6487 for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
6488 if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
6489 tempptr += i + 1;
6490 }
6491
6492 /* For conditions that are assertions, check the syntax, and then exit
6493 the switch. This will take control down to where bracketed groups,
6494 including assertions, are processed. */
6495
6496 if (tempptr[1] == CHAR_QUESTION_MARK &&
6497 (tempptr[2] == CHAR_EQUALS_SIGN ||
6498 tempptr[2] == CHAR_EXCLAMATION_MARK ||
6499 tempptr[2] == CHAR_LESS_THAN_SIGN))
6500 break;
6501
6502 /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6503 need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6504
6505 code[1+LINK_SIZE] = OP_CREF;
6506 skipbytes = 1+IMM2_SIZE;
6507 refsign = -1;
6508
6509 /* Check for a test for recursion in a named group. */
6510
6511 if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
6512 {
6513 terminator = -1;
6514 ptr += 2;
6515 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
6516 }
6517
6518 /* Check for a test for a named group's having been set, using the Perl
6519 syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6520 syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). As names may
6521 consist entirely of digits, there is scope for ambiguity. */
6522
6523 else if (ptr[1] == CHAR_LESS_THAN_SIGN)
6524 {
6525 terminator = CHAR_GREATER_THAN_SIGN;
6526 ptr++;
6527 }
6528 else if (ptr[1] == CHAR_APOSTROPHE)
6529 {
6530 terminator = CHAR_APOSTROPHE;
6531 ptr++;
6532 }
6533 else
6534 {
6535 terminator = CHAR_NULL;
6536 if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
6537 }
6538
6539 /* When a name is one of a number of duplicates, a different opcode is
6540 used and it needs more memory. Unfortunately we cannot tell whether a
6541 name is a duplicate in the first pass, so we have to allow for more
6542 memory except when we know it is a relative numerical reference. */
6543
6544 if (refsign < 0 && lengthptr != NULL) *lengthptr += IMM2_SIZE;
6545
6546 /* We now expect to read a name (possibly all digits); anything else
6547 is an error. In the case of all digits, also get it as a number. */
6548
6549 if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)
6550 {
6551 ptr += 1; /* To get the right offset */
6552 *errorcodeptr = ERR28;
6553 goto FAILED;
6554 }
6555
6556 recno = 0;
6557 name = ++ptr;
6558 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
6559 {
6560 if (recno >= 0)
6561 recno = (IS_DIGIT(*ptr))? recno * 10 + (int)(*ptr - CHAR_0) : -1;
6562 ptr++;
6563 }
6564 namelen = (int)(ptr - name);
6565
6566 /* Check the terminator */
6567
6568 if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6569 *ptr++ != CHAR_RIGHT_PARENTHESIS)
6570 {
6571 ptr--; /* Error offset */
6572 *errorcodeptr = ERR26;
6573 goto FAILED;
6574 }
6575
6576 /* Do no further checking in the pre-compile phase. */
6577
6578 if (lengthptr != NULL) break;
6579
6580 /* In the real compile we do the work of looking for the actual
6581 reference. If the string started with "+" or "-" we require the rest to
6582 be digits, in which case recno will be set. */
6583
6584 if (refsign > 0)
6585 {
6586 if (recno <= 0)
6587 {
6588 *errorcodeptr = ERR58;
6589 goto FAILED;
6590 }
6591 recno = (refsign == CHAR_MINUS)?
6592 cd->bracount - recno + 1 : recno +cd->bracount;
6593 if (recno <= 0 || recno > cd->final_bracount)
6594 {
6595 *errorcodeptr = ERR15;
6596 goto FAILED;
6597 }
6598 PUT2(code, 2+LINK_SIZE, recno);
6599 break;
6600 }
6601
6602 /* Otherwise (did not start with "+" or "-"), start by looking for the
6603 name. */
6604
6605 slot = cd->name_table;
6606 for (i = 0; i < cd->names_found; i++)
6607 {
6608 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
6609 slot += cd->name_entry_size;
6610 }
6611
6612 /* Found the named subpattern. If the name is duplicated, add one to
6613 the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6614 appropriate data values. Otherwise, just insert the unique subpattern
6615 number. */
6616
6617 if (i < cd->names_found)
6618 {
6619 int offset = i++;
6620 int count = 1;
6621 recno = GET2(slot, 0); /* Number from first found */
6622 for (; i < cd->names_found; i++)
6623 {
6624 slot += cd->name_entry_size;
6625 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0) break;
6626 count++;
6627 }
6628 if (count > 1)
6629 {
6630 PUT2(code, 2+LINK_SIZE, offset);
6631 PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6632 skipbytes += IMM2_SIZE;
6633 code[1+LINK_SIZE]++;
6634 }
6635 else /* Not a duplicated name */
6636 {
6637 PUT2(code, 2+LINK_SIZE, recno);
6638 }
6639 }
6640
6641 /* If terminator == CHAR_NULL it means that the name followed directly
6642 after the opening parenthesis [e.g. (?(abc)...] and in this case there
6643 are some further alternatives to try. For the cases where terminator !=
6644 0 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
6645 now checked all the possibilities, so give an error. */
6646
6647 else if (terminator != CHAR_NULL)
6648 {
6649 *errorcodeptr = ERR15;
6650 goto FAILED;
6651 }
6652
6653 /* Check for (?(R) for recursion. Allow digits after R to specify a
6654 specific group number. */
6655
6656 else if (*name == CHAR_R)
6657 {
6658 recno = 0;
6659 for (i = 1; i < namelen; i++)
6660 {
6661