/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1472 - (show annotations)
Mon Apr 21 16:11:50 2014 UTC (5 years, 5 months ago) by ph10
File MIME type: text/plain
File size: 311054 byte(s)
Fix C++ compiler warnings and silly quantifier size check.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2014 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
57 is also used by pcretest. PCRE_DEBUG is not defined when building a production
58 library. We do not need to select pcre16_printint.c specially, because the
59 COMPILE_PCREx macro will already be appropriately set. */
60
61 #ifdef PCRE_DEBUG
62 /* pcre_printint.c should not include any headers */
63 #define PCRE_INCLUDED
64 #include "pcre_printint.c"
65 #undef PCRE_INCLUDED
66 #endif
67
68
69 /* Macro for setting individual bits in class bitmaps. */
70
71 #define SETBIT(a,b) a[(b)/8] |= (1 << ((b)&7))
72
73 /* Maximum length value to check against when making sure that the integer that
74 holds the compiled pattern length does not overflow. We make it a bit less than
75 INT_MAX to allow for adding in group terminating bytes, so that we don't have
76 to check them every time. */
77
78 #define OFLOW_MAX (INT_MAX - 20)
79
80 /* Definitions to allow mutual recursion */
81
82 static int
83 add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
84 const pcre_uint32 *, unsigned int);
85
86 static BOOL
87 compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
88 pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
89 compile_data *, int *);
90
91
92
93 /*************************************************
94 * Code parameters and static tables *
95 *************************************************/
96
97 /* This value specifies the size of stack workspace that is used during the
98 first pre-compile phase that determines how much memory is required. The regex
99 is partly compiled into this space, but the compiled parts are discarded as
100 soon as they can be, so that hopefully there will never be an overrun. The code
101 does, however, check for an overrun. The largest amount I've seen used is 218,
102 so this number is very generous.
103
104 The same workspace is used during the second, actual compile phase for
105 remembering forward references to groups so that they can be filled in at the
106 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
107 is 4 there is plenty of room for most patterns. However, the memory can get
108 filled up by repetitions of forward references, for example patterns like
109 /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
110 that the workspace is expanded using malloc() in this situation. The value
111 below is therefore a minimum, and we put a maximum on it for safety. The
112 minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
113 kicks in at the same number of forward references in all cases. */
114
115 #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
116 #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
117
118 /* This value determines the size of the initial vector that is used for
119 remembering named groups during the pre-compile. It is allocated on the stack,
120 but if it is too small, it is expanded using malloc(), in a similar way to the
121 workspace. The value is the number of slots in the list. */
122
123 #define NAMED_GROUP_LIST_SIZE 20
124
125 /* The overrun tests check for a slightly smaller size so that they detect the
126 overrun before it actually does run off the end of the data block. */
127
128 #define WORK_SIZE_SAFETY_MARGIN (100)
129
130 /* Private flags added to firstchar and reqchar. */
131
132 #define REQ_CASELESS (1 << 0) /* Indicates caselessness */
133 #define REQ_VARY (1 << 1) /* Reqchar followed non-literal item */
134 /* Negative values for the firstchar and reqchar flags */
135 #define REQ_UNSET (-2)
136 #define REQ_NONE (-1)
137
138 /* Repeated character flags. */
139
140 #define UTF_LENGTH 0x10000000l /* The char contains its length. */
141
142 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
143 are simple data values; negative values are for special things like \d and so
144 on. Zero means further processing is needed (for things like \x), or the escape
145 is invalid. */
146
147 #ifndef EBCDIC
148
149 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
150 in UTF-8 mode. */
151
152 static const short int escapes[] = {
153 0, 0,
154 0, 0,
155 0, 0,
156 0, 0,
157 0, 0,
158 CHAR_COLON, CHAR_SEMICOLON,
159 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
160 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
161 CHAR_COMMERCIAL_AT, -ESC_A,
162 -ESC_B, -ESC_C,
163 -ESC_D, -ESC_E,
164 0, -ESC_G,
165 -ESC_H, 0,
166 0, -ESC_K,
167 0, 0,
168 -ESC_N, 0,
169 -ESC_P, -ESC_Q,
170 -ESC_R, -ESC_S,
171 0, 0,
172 -ESC_V, -ESC_W,
173 -ESC_X, 0,
174 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
175 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
176 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
177 CHAR_GRAVE_ACCENT, 7,
178 -ESC_b, 0,
179 -ESC_d, ESC_e,
180 ESC_f, 0,
181 -ESC_h, 0,
182 0, -ESC_k,
183 0, 0,
184 ESC_n, 0,
185 -ESC_p, 0,
186 ESC_r, -ESC_s,
187 ESC_tee, 0,
188 -ESC_v, -ESC_w,
189 0, 0,
190 -ESC_z
191 };
192
193 #else
194
195 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
196
197 static const short int escapes[] = {
198 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
199 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
200 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
201 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
202 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
203 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
204 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
205 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
206 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
207 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
208 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
209 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
210 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
211 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
212 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
213 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
214 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
215 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
216 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
217 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
218 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
219 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
220 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
221 };
222 #endif
223
224
225 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
226 searched linearly. Put all the names into a single string, in order to reduce
227 the number of relocations when a shared library is dynamically linked. The
228 string is built from string macros so that it works in UTF-8 mode on EBCDIC
229 platforms. */
230
231 typedef struct verbitem {
232 int len; /* Length of verb name */
233 int op; /* Op when no arg, or -1 if arg mandatory */
234 int op_arg; /* Op when arg present, or -1 if not allowed */
235 } verbitem;
236
237 static const char verbnames[] =
238 "\0" /* Empty name is a shorthand for MARK */
239 STRING_MARK0
240 STRING_ACCEPT0
241 STRING_COMMIT0
242 STRING_F0
243 STRING_FAIL0
244 STRING_PRUNE0
245 STRING_SKIP0
246 STRING_THEN;
247
248 static const verbitem verbs[] = {
249 { 0, -1, OP_MARK },
250 { 4, -1, OP_MARK },
251 { 6, OP_ACCEPT, -1 },
252 { 6, OP_COMMIT, -1 },
253 { 1, OP_FAIL, -1 },
254 { 4, OP_FAIL, -1 },
255 { 5, OP_PRUNE, OP_PRUNE_ARG },
256 { 4, OP_SKIP, OP_SKIP_ARG },
257 { 4, OP_THEN, OP_THEN_ARG }
258 };
259
260 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
261
262
263 /* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
264 another regex library. */
265
266 static const pcre_uchar sub_start_of_word[] = {
267 CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
268 CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
269
270 static const pcre_uchar sub_end_of_word[] = {
271 CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
272 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
273 CHAR_RIGHT_PARENTHESIS, '\0' };
274
275
276 /* Tables of names of POSIX character classes and their lengths. The names are
277 now all in a single string, to reduce the number of relocations when a shared
278 library is dynamically loaded. The list of lengths is terminated by a zero
279 length entry. The first three must be alpha, lower, upper, as this is assumed
280 for handling case independence. The indices for graph, print, and punct are
281 needed, so identify them. */
282
283 static const char posix_names[] =
284 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
285 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
286 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
287 STRING_word0 STRING_xdigit;
288
289 static const pcre_uint8 posix_name_lengths[] = {
290 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
291
292 #define PC_GRAPH 8
293 #define PC_PRINT 9
294 #define PC_PUNCT 10
295
296
297 /* Table of class bit maps for each POSIX class. Each class is formed from a
298 base map, with an optional addition or removal of another map. Then, for some
299 classes, there is some additional tweaking: for [:blank:] the vertical space
300 characters are removed, and for [:alpha:] and [:alnum:] the underscore
301 character is removed. The triples in the table consist of the base map offset,
302 second map offset or -1 if no second map, and a non-negative value for map
303 addition or a negative value for map subtraction (if there are two maps). The
304 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
305 remove vertical space characters, 2 => remove underscore. */
306
307 static const int posix_class_maps[] = {
308 cbit_word, cbit_digit, -2, /* alpha */
309 cbit_lower, -1, 0, /* lower */
310 cbit_upper, -1, 0, /* upper */
311 cbit_word, -1, 2, /* alnum - word without underscore */
312 cbit_print, cbit_cntrl, 0, /* ascii */
313 cbit_space, -1, 1, /* blank - a GNU extension */
314 cbit_cntrl, -1, 0, /* cntrl */
315 cbit_digit, -1, 0, /* digit */
316 cbit_graph, -1, 0, /* graph */
317 cbit_print, -1, 0, /* print */
318 cbit_punct, -1, 0, /* punct */
319 cbit_space, -1, 0, /* space */
320 cbit_word, -1, 0, /* word - a Perl extension */
321 cbit_xdigit,-1, 0 /* xdigit */
322 };
323
324 /* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
325 Unicode property escapes. */
326
327 #ifdef SUPPORT_UCP
328 static const pcre_uchar string_PNd[] = {
329 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
330 CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
331 static const pcre_uchar string_pNd[] = {
332 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
333 CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
334 static const pcre_uchar string_PXsp[] = {
335 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
336 CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
337 static const pcre_uchar string_pXsp[] = {
338 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
339 CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
340 static const pcre_uchar string_PXwd[] = {
341 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
342 CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
343 static const pcre_uchar string_pXwd[] = {
344 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
345 CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
346
347 static const pcre_uchar *substitutes[] = {
348 string_PNd, /* \D */
349 string_pNd, /* \d */
350 string_PXsp, /* \S */ /* Xsp is Perl space, but from 8.34, Perl */
351 string_pXsp, /* \s */ /* space and POSIX space are the same. */
352 string_PXwd, /* \W */
353 string_pXwd /* \w */
354 };
355
356 /* The POSIX class substitutes must be in the order of the POSIX class names,
357 defined above, and there are both positive and negative cases. NULL means no
358 general substitute of a Unicode property escape (\p or \P). However, for some
359 POSIX classes (e.g. graph, print, punct) a special property code is compiled
360 directly. */
361
362 static const pcre_uchar string_pL[] = {
363 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
364 CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
365 static const pcre_uchar string_pLl[] = {
366 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
367 CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
368 static const pcre_uchar string_pLu[] = {
369 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
370 CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
371 static const pcre_uchar string_pXan[] = {
372 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
373 CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
374 static const pcre_uchar string_h[] = {
375 CHAR_BACKSLASH, CHAR_h, '\0' };
376 static const pcre_uchar string_pXps[] = {
377 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
378 CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
379 static const pcre_uchar string_PL[] = {
380 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
381 CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
382 static const pcre_uchar string_PLl[] = {
383 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
384 CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
385 static const pcre_uchar string_PLu[] = {
386 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
387 CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
388 static const pcre_uchar string_PXan[] = {
389 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
390 CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
391 static const pcre_uchar string_H[] = {
392 CHAR_BACKSLASH, CHAR_H, '\0' };
393 static const pcre_uchar string_PXps[] = {
394 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
395 CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
396
397 static const pcre_uchar *posix_substitutes[] = {
398 string_pL, /* alpha */
399 string_pLl, /* lower */
400 string_pLu, /* upper */
401 string_pXan, /* alnum */
402 NULL, /* ascii */
403 string_h, /* blank */
404 NULL, /* cntrl */
405 string_pNd, /* digit */
406 NULL, /* graph */
407 NULL, /* print */
408 NULL, /* punct */
409 string_pXps, /* space */ /* Xps is POSIX space, but from 8.34 */
410 string_pXwd, /* word */ /* Perl and POSIX space are the same */
411 NULL, /* xdigit */
412 /* Negated cases */
413 string_PL, /* ^alpha */
414 string_PLl, /* ^lower */
415 string_PLu, /* ^upper */
416 string_PXan, /* ^alnum */
417 NULL, /* ^ascii */
418 string_H, /* ^blank */
419 NULL, /* ^cntrl */
420 string_PNd, /* ^digit */
421 NULL, /* ^graph */
422 NULL, /* ^print */
423 NULL, /* ^punct */
424 string_PXps, /* ^space */ /* Xps is POSIX space, but from 8.34 */
425 string_PXwd, /* ^word */ /* Perl and POSIX space are the same */
426 NULL /* ^xdigit */
427 };
428 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
429 #endif
430
431 #define STRING(a) # a
432 #define XSTRING(s) STRING(s)
433
434 /* The texts of compile-time error messages. These are "char *" because they
435 are passed to the outside world. Do not ever re-use any error number, because
436 they are documented. Always add a new error instead. Messages marked DEAD below
437 are no longer used. This used to be a table of strings, but in order to reduce
438 the number of relocations needed when a shared library is loaded dynamically,
439 it is now one long string. We cannot use a table of offsets, because the
440 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
441 simply count through to the one we want - this isn't a performance issue
442 because these strings are used only when there is a compilation error.
443
444 Each substring ends with \0 to insert a null character. This includes the final
445 substring, so that the whole string ends with \0\0, which can be detected when
446 counting through. */
447
448 static const char error_texts[] =
449 "no error\0"
450 "\\ at end of pattern\0"
451 "\\c at end of pattern\0"
452 "unrecognized character follows \\\0"
453 "numbers out of order in {} quantifier\0"
454 /* 5 */
455 "number too big in {} quantifier\0"
456 "missing terminating ] for character class\0"
457 "invalid escape sequence in character class\0"
458 "range out of order in character class\0"
459 "nothing to repeat\0"
460 /* 10 */
461 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
462 "internal error: unexpected repeat\0"
463 "unrecognized character after (? or (?-\0"
464 "POSIX named classes are supported only within a class\0"
465 "missing )\0"
466 /* 15 */
467 "reference to non-existent subpattern\0"
468 "erroffset passed as NULL\0"
469 "unknown option bit(s) set\0"
470 "missing ) after comment\0"
471 "parentheses nested too deeply\0" /** DEAD **/
472 /* 20 */
473 "regular expression is too large\0"
474 "failed to get memory\0"
475 "unmatched parentheses\0"
476 "internal error: code overflow\0"
477 "unrecognized character after (?<\0"
478 /* 25 */
479 "lookbehind assertion is not fixed length\0"
480 "malformed number or name after (?(\0"
481 "conditional group contains more than two branches\0"
482 "assertion expected after (?(\0"
483 "(?R or (?[+-]digits must be followed by )\0"
484 /* 30 */
485 "unknown POSIX class name\0"
486 "POSIX collating elements are not supported\0"
487 "this version of PCRE is compiled without UTF support\0"
488 "spare error\0" /** DEAD **/
489 "character value in \\x{} or \\o{} is too large\0"
490 /* 35 */
491 "invalid condition (?(0)\0"
492 "\\C not allowed in lookbehind assertion\0"
493 "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
494 "number after (?C is > 255\0"
495 "closing ) for (?C expected\0"
496 /* 40 */
497 "recursive call could loop indefinitely\0"
498 "unrecognized character after (?P\0"
499 "syntax error in subpattern name (missing terminator)\0"
500 "two named subpatterns have the same name\0"
501 "invalid UTF-8 string\0"
502 /* 45 */
503 "support for \\P, \\p, and \\X has not been compiled\0"
504 "malformed \\P or \\p sequence\0"
505 "unknown property name after \\P or \\p\0"
506 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
507 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
508 /* 50 */
509 "repeated subpattern is too long\0" /** DEAD **/
510 "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
511 "internal error: overran compiling workspace\0"
512 "internal error: previously-checked referenced subpattern not found\0"
513 "DEFINE group contains more than one branch\0"
514 /* 55 */
515 "repeating a DEFINE group is not allowed\0" /** DEAD **/
516 "inconsistent NEWLINE options\0"
517 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
518 "a numbered reference must not be zero\0"
519 "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
520 /* 60 */
521 "(*VERB) not recognized or malformed\0"
522 "number is too big\0"
523 "subpattern name expected\0"
524 "digit expected after (?+\0"
525 "] is an invalid data character in JavaScript compatibility mode\0"
526 /* 65 */
527 "different names for subpatterns of the same number are not allowed\0"
528 "(*MARK) must have an argument\0"
529 "this version of PCRE is not compiled with Unicode property support\0"
530 "\\c must be followed by an ASCII character\0"
531 "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
532 /* 70 */
533 "internal error: unknown opcode in find_fixedlength()\0"
534 "\\N is not supported in a class\0"
535 "too many forward references\0"
536 "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
537 "invalid UTF-16 string\0"
538 /* 75 */
539 "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
540 "character value in \\u.... sequence is too large\0"
541 "invalid UTF-32 string\0"
542 "setting UTF is disabled by the application\0"
543 "non-hex character in \\x{} (closing brace missing?)\0"
544 /* 80 */
545 "non-octal character in \\o{} (closing brace missing?)\0"
546 "missing opening brace after \\o\0"
547 "parentheses are too deeply nested\0"
548 "invalid range in character class\0"
549 "group name must start with a non-digit\0"
550 /* 85 */
551 "parentheses are too deeply nested (stack check)\0"
552 ;
553
554 /* Table to identify digits and hex digits. This is used when compiling
555 patterns. Note that the tables in chartables are dependent on the locale, and
556 may mark arbitrary characters as digits - but the PCRE compiling code expects
557 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
558 a private table here. It costs 256 bytes, but it is a lot faster than doing
559 character value tests (at least in some simple cases I timed), and in some
560 applications one wants PCRE to compile efficiently as well as match
561 efficiently.
562
563 For convenience, we use the same bit definitions as in chartables:
564
565 0x04 decimal digit
566 0x08 hexadecimal digit
567
568 Then we can use ctype_digit and ctype_xdigit in the code. */
569
570 /* Using a simple comparison for decimal numbers rather than a memory read
571 is much faster, and the resulting code is simpler (the compiler turns it
572 into a subtraction and unsigned comparison). */
573
574 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
575
576 #ifndef EBCDIC
577
578 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
579 UTF-8 mode. */
580
581 static const pcre_uint8 digitab[] =
582 {
583 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
584 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
585 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
586 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
587 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
588 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
589 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
590 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
591 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
592 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
593 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
594 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
595 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
596 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
597 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
598 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
599 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
600 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
603 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
604 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
605 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
606 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
607 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
608 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
609 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
610 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
611 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
612 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
613 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
614 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
615
616 #else
617
618 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
619
620 static const pcre_uint8 digitab[] =
621 {
622 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
623 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
624 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
625 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
626 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
627 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
628 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
629 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
630 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
631 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
632 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
633 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
634 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
635 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
636 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
637 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
638 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
639 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
640 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
641 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
642 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
643 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
644 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
645 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
646 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
647 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
648 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
649 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
650 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
651 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
652 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
653 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
654
655 static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
656 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
657 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
658 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
659 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
660 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
661 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
662 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
663 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
664 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
665 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
666 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
667 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
668 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
669 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
670 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
671 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
672 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
673 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
674 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
675 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
676 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
677 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
678 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
679 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
680 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
681 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
682 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
683 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
684 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
685 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
686 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
687 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
688 #endif
689
690
691 /* This table is used to check whether auto-possessification is possible
692 between adjacent character-type opcodes. The left-hand (repeated) opcode is
693 used to select the row, and the right-hand opcode is use to select the column.
694 A value of 1 means that auto-possessification is OK. For example, the second
695 value in the first row means that \D+\d can be turned into \D++\d.
696
697 The Unicode property types (\P and \p) have to be present to fill out the table
698 because of what their opcode values are, but the table values should always be
699 zero because property types are handled separately in the code. The last four
700 columns apply to items that cannot be repeated, so there is no need to have
701 rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
702 *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
703
704 #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
705 #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
706
707 static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
708 /* \D \d \S \s \W \w . .+ \C \P \p \R \H \h \V \v \X \Z \z $ $M */
709 { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \D */
710 { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \d */
711 { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \S */
712 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \s */
713 { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \W */
714 { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \w */
715 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* . */
716 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* .+ */
717 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \C */
718 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \P */
719 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \p */
720 { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \R */
721 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \H */
722 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \h */
723 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \V */
724 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 }, /* \v */
725 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 } /* \X */
726 };
727
728
729 /* This table is used to check whether auto-possessification is possible
730 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
731 left-hand (repeated) opcode is used to select the row, and the right-hand
732 opcode is used to select the column. The values are as follows:
733
734 0 Always return FALSE (never auto-possessify)
735 1 Character groups are distinct (possessify if both are OP_PROP)
736 2 Check character categories in the same group (general or particular)
737 3 TRUE if the two opcodes are not the same (PROP vs NOTPROP)
738
739 4 Check left general category vs right particular category
740 5 Check right general category vs left particular category
741
742 6 Left alphanum vs right general category
743 7 Left space vs right general category
744 8 Left word vs right general category
745
746 9 Right alphanum vs left general category
747 10 Right space vs left general category
748 11 Right word vs left general category
749
750 12 Left alphanum vs right particular category
751 13 Left space vs right particular category
752 14 Left word vs right particular category
753
754 15 Right alphanum vs left particular category
755 16 Right space vs left particular category
756 17 Right word vs left particular category
757 */
758
759 static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
760 /* ANY LAMP GC PC SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
761 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_ANY */
762 { 0, 3, 0, 0, 0, 3, 1, 1, 0, 0, 0 }, /* PT_LAMP */
763 { 0, 0, 2, 4, 0, 9, 10, 10, 11, 0, 0 }, /* PT_GC */
764 { 0, 0, 5, 2, 0, 15, 16, 16, 17, 0, 0 }, /* PT_PC */
765 { 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, /* PT_SC */
766 { 0, 3, 6, 12, 0, 3, 1, 1, 0, 0, 0 }, /* PT_ALNUM */
767 { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_SPACE */
768 { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_PXSPACE */
769 { 0, 0, 8, 14, 0, 0, 1, 1, 3, 0, 0 }, /* PT_WORD */
770 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */
771 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 } /* PT_UCNC */
772 };
773
774 /* This table is used to check whether auto-possessification is possible
775 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
776 specifies a general category and the other specifies a particular category. The
777 row is selected by the general category and the column by the particular
778 category. The value is 1 if the particular category is not part of the general
779 category. */
780
781 static const pcre_uint8 catposstab[7][30] = {
782 /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
783 { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* C */
784 { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* L */
785 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* M */
786 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* N */
787 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 }, /* P */
788 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 }, /* S */
789 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 } /* Z */
790 };
791
792 /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
793 a general or particular category. The properties in each row are those
794 that apply to the character set in question. Duplication means that a little
795 unnecessary work is done when checking, but this keeps things much simpler
796 because they can all use the same code. For more details see the comment where
797 this table is used.
798
799 Note: SPACE and PXSPACE used to be different because Perl excluded VT from
800 "space", but from Perl 5.18 it's included, so both categories are treated the
801 same here. */
802
803 static const pcre_uint8 posspropstab[3][4] = {
804 { ucp_L, ucp_N, ucp_N, ucp_Nl }, /* ALNUM, 3rd and 4th values redundant */
805 { ucp_Z, ucp_Z, ucp_C, ucp_Cc }, /* SPACE and PXSPACE, 2nd value redundant */
806 { ucp_L, ucp_N, ucp_P, ucp_Po } /* WORD */
807 };
808
809 /* This table is used when converting repeating opcodes into possessified
810 versions as a result of an explicit possessive quantifier such as ++. A zero
811 value means there is no possessified version - in those cases the item in
812 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
813 because all relevant opcodes are less than that. */
814
815 static const pcre_uint8 opcode_possessify[] = {
816 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 15 */
817 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 - 31 */
818
819 0, /* NOTI */
820 OP_POSSTAR, 0, /* STAR, MINSTAR */
821 OP_POSPLUS, 0, /* PLUS, MINPLUS */
822 OP_POSQUERY, 0, /* QUERY, MINQUERY */
823 OP_POSUPTO, 0, /* UPTO, MINUPTO */
824 0, /* EXACT */
825 0, 0, 0, 0, /* POS{STAR,PLUS,QUERY,UPTO} */
826
827 OP_POSSTARI, 0, /* STARI, MINSTARI */
828 OP_POSPLUSI, 0, /* PLUSI, MINPLUSI */
829 OP_POSQUERYI, 0, /* QUERYI, MINQUERYI */
830 OP_POSUPTOI, 0, /* UPTOI, MINUPTOI */
831 0, /* EXACTI */
832 0, 0, 0, 0, /* POS{STARI,PLUSI,QUERYI,UPTOI} */
833
834 OP_NOTPOSSTAR, 0, /* NOTSTAR, NOTMINSTAR */
835 OP_NOTPOSPLUS, 0, /* NOTPLUS, NOTMINPLUS */
836 OP_NOTPOSQUERY, 0, /* NOTQUERY, NOTMINQUERY */
837 OP_NOTPOSUPTO, 0, /* NOTUPTO, NOTMINUPTO */
838 0, /* NOTEXACT */
839 0, 0, 0, 0, /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
840
841 OP_NOTPOSSTARI, 0, /* NOTSTARI, NOTMINSTARI */
842 OP_NOTPOSPLUSI, 0, /* NOTPLUSI, NOTMINPLUSI */
843 OP_NOTPOSQUERYI, 0, /* NOTQUERYI, NOTMINQUERYI */
844 OP_NOTPOSUPTOI, 0, /* NOTUPTOI, NOTMINUPTOI */
845 0, /* NOTEXACTI */
846 0, 0, 0, 0, /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
847
848 OP_TYPEPOSSTAR, 0, /* TYPESTAR, TYPEMINSTAR */
849 OP_TYPEPOSPLUS, 0, /* TYPEPLUS, TYPEMINPLUS */
850 OP_TYPEPOSQUERY, 0, /* TYPEQUERY, TYPEMINQUERY */
851 OP_TYPEPOSUPTO, 0, /* TYPEUPTO, TYPEMINUPTO */
852 0, /* TYPEEXACT */
853 0, 0, 0, 0, /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
854
855 OP_CRPOSSTAR, 0, /* CRSTAR, CRMINSTAR */
856 OP_CRPOSPLUS, 0, /* CRPLUS, CRMINPLUS */
857 OP_CRPOSQUERY, 0, /* CRQUERY, CRMINQUERY */
858 OP_CRPOSRANGE, 0, /* CRRANGE, CRMINRANGE */
859 0, 0, 0, 0, /* CRPOS{STAR,PLUS,QUERY,RANGE} */
860
861 0, 0, 0, /* CLASS, NCLASS, XCLASS */
862 0, 0, /* REF, REFI */
863 0, 0, /* DNREF, DNREFI */
864 0, 0 /* RECURSE, CALLOUT */
865 };
866
867
868
869 /*************************************************
870 * Find an error text *
871 *************************************************/
872
873 /* The error texts are now all in one long string, to save on relocations. As
874 some of the text is of unknown length, we can't use a table of offsets.
875 Instead, just count through the strings. This is not a performance issue
876 because it happens only when there has been a compilation error.
877
878 Argument: the error number
879 Returns: pointer to the error string
880 */
881
882 static const char *
883 find_error_text(int n)
884 {
885 const char *s = error_texts;
886 for (; n > 0; n--)
887 {
888 while (*s++ != CHAR_NULL) {};
889 if (*s == CHAR_NULL) return "Error text not found (please report)";
890 }
891 return s;
892 }
893
894
895
896 /*************************************************
897 * Expand the workspace *
898 *************************************************/
899
900 /* This function is called during the second compiling phase, if the number of
901 forward references fills the existing workspace, which is originally a block on
902 the stack. A larger block is obtained from malloc() unless the ultimate limit
903 has been reached or the increase will be rather small.
904
905 Argument: pointer to the compile data block
906 Returns: 0 if all went well, else an error number
907 */
908
909 static int
910 expand_workspace(compile_data *cd)
911 {
912 pcre_uchar *newspace;
913 int newsize = cd->workspace_size * 2;
914
915 if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
916 if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
917 newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
918 return ERR72;
919
920 newspace = (PUBL(malloc))(IN_UCHARS(newsize));
921 if (newspace == NULL) return ERR21;
922 memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
923 cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
924 if (cd->workspace_size > COMPILE_WORK_SIZE)
925 (PUBL(free))((void *)cd->start_workspace);
926 cd->start_workspace = newspace;
927 cd->workspace_size = newsize;
928 return 0;
929 }
930
931
932
933 /*************************************************
934 * Check for counted repeat *
935 *************************************************/
936
937 /* This function is called when a '{' is encountered in a place where it might
938 start a quantifier. It looks ahead to see if it really is a quantifier or not.
939 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
940 where the ddds are digits.
941
942 Arguments:
943 p pointer to the first char after '{'
944
945 Returns: TRUE or FALSE
946 */
947
948 static BOOL
949 is_counted_repeat(const pcre_uchar *p)
950 {
951 if (!IS_DIGIT(*p)) return FALSE;
952 p++;
953 while (IS_DIGIT(*p)) p++;
954 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
955
956 if (*p++ != CHAR_COMMA) return FALSE;
957 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
958
959 if (!IS_DIGIT(*p)) return FALSE;
960 p++;
961 while (IS_DIGIT(*p)) p++;
962
963 return (*p == CHAR_RIGHT_CURLY_BRACKET);
964 }
965
966
967
968 /*************************************************
969 * Handle escapes *
970 *************************************************/
971
972 /* This function is called when a \ has been encountered. It either returns a
973 positive value for a simple escape such as \n, or 0 for a data character which
974 will be placed in chptr. A backreference to group n is returned as negative n.
975 When UTF-8 is enabled, a positive value greater than 255 may be returned in
976 chptr. On entry, ptr is pointing at the \. On exit, it is on the final
977 character of the escape sequence.
978
979 Arguments:
980 ptrptr points to the pattern position pointer
981 chptr points to a returned data character
982 errorcodeptr points to the errorcode variable
983 bracount number of previous extracting brackets
984 options the options bits
985 isclass TRUE if inside a character class
986
987 Returns: zero => a data character
988 positive => a special escape sequence
989 negative => a back reference
990 on error, errorcodeptr is set
991 */
992
993 static int
994 check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
995 int bracount, int options, BOOL isclass)
996 {
997 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
998 BOOL utf = (options & PCRE_UTF8) != 0;
999 const pcre_uchar *ptr = *ptrptr + 1;
1000 pcre_uint32 c;
1001 int escape = 0;
1002 int i;
1003
1004 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
1005 ptr--; /* Set pointer back to the last byte */
1006
1007 /* If backslash is at the end of the pattern, it's an error. */
1008
1009 if (c == CHAR_NULL) *errorcodeptr = ERR1;
1010
1011 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
1012 in a table. A non-zero result is something that can be returned immediately.
1013 Otherwise further processing may be required. */
1014
1015 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1016 /* Not alphanumeric */
1017 else if (c < CHAR_0 || c > CHAR_z) {}
1018 else if ((i = escapes[c - CHAR_0]) != 0)
1019 { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1020
1021 #else /* EBCDIC coding */
1022 /* Not alphanumeric */
1023 else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
1024 else if ((i = escapes[c - 0x48]) != 0) { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1025 #endif
1026
1027 /* Escapes that need further processing, or are illegal. */
1028
1029 else
1030 {
1031 const pcre_uchar *oldptr;
1032 BOOL braced, negated, overflow;
1033 int s;
1034
1035 switch (c)
1036 {
1037 /* A number of Perl escapes are not handled by PCRE. We give an explicit
1038 error. */
1039
1040 case CHAR_l:
1041 case CHAR_L:
1042 *errorcodeptr = ERR37;
1043 break;
1044
1045 case CHAR_u:
1046 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1047 {
1048 /* In JavaScript, \u must be followed by four hexadecimal numbers.
1049 Otherwise it is a lowercase u letter. */
1050 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1051 && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
1052 && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
1053 && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
1054 {
1055 c = 0;
1056 for (i = 0; i < 4; ++i)
1057 {
1058 register pcre_uint32 cc = *(++ptr);
1059 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1060 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1061 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1062 #else /* EBCDIC coding */
1063 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1064 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1065 #endif
1066 }
1067
1068 #if defined COMPILE_PCRE8
1069 if (c > (utf ? 0x10ffffU : 0xffU))
1070 #elif defined COMPILE_PCRE16
1071 if (c > (utf ? 0x10ffffU : 0xffffU))
1072 #elif defined COMPILE_PCRE32
1073 if (utf && c > 0x10ffffU)
1074 #endif
1075 {
1076 *errorcodeptr = ERR76;
1077 }
1078 else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1079 }
1080 }
1081 else
1082 *errorcodeptr = ERR37;
1083 break;
1084
1085 case CHAR_U:
1086 /* In JavaScript, \U is an uppercase U letter. */
1087 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
1088 break;
1089
1090 /* In a character class, \g is just a literal "g". Outside a character
1091 class, \g must be followed by one of a number of specific things:
1092
1093 (1) A number, either plain or braced. If positive, it is an absolute
1094 backreference. If negative, it is a relative backreference. This is a Perl
1095 5.10 feature.
1096
1097 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1098 is part of Perl's movement towards a unified syntax for back references. As
1099 this is synonymous with \k{name}, we fudge it up by pretending it really
1100 was \k.
1101
1102 (3) For Oniguruma compatibility we also support \g followed by a name or a
1103 number either in angle brackets or in single quotes. However, these are
1104 (possibly recursive) subroutine calls, _not_ backreferences. Just return
1105 the ESC_g code (cf \k). */
1106
1107 case CHAR_g:
1108 if (isclass) break;
1109 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
1110 {
1111 escape = ESC_g;
1112 break;
1113 }
1114
1115 /* Handle the Perl-compatible cases */
1116
1117 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1118 {
1119 const pcre_uchar *p;
1120 for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
1121 if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
1122 if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
1123 {
1124 escape = ESC_k;
1125 break;
1126 }
1127 braced = TRUE;
1128 ptr++;
1129 }
1130 else braced = FALSE;
1131
1132 if (ptr[1] == CHAR_MINUS)
1133 {
1134 negated = TRUE;
1135 ptr++;
1136 }
1137 else negated = FALSE;
1138
1139 /* The integer range is limited by the machine's int representation. */
1140 s = 0;
1141 overflow = FALSE;
1142 while (IS_DIGIT(ptr[1]))
1143 {
1144 if (s > INT_MAX / 10 - 1) /* Integer overflow */
1145 {
1146 overflow = TRUE;
1147 break;
1148 }
1149 s = s * 10 + (int)(*(++ptr) - CHAR_0);
1150 }
1151 if (overflow) /* Integer overflow */
1152 {
1153 while (IS_DIGIT(ptr[1]))
1154 ptr++;
1155 *errorcodeptr = ERR61;
1156 break;
1157 }
1158
1159 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
1160 {
1161 *errorcodeptr = ERR57;
1162 break;
1163 }
1164
1165 if (s == 0)
1166 {
1167 *errorcodeptr = ERR58;
1168 break;
1169 }
1170
1171 if (negated)
1172 {
1173 if (s > bracount)
1174 {
1175 *errorcodeptr = ERR15;
1176 break;
1177 }
1178 s = bracount - (s - 1);
1179 }
1180
1181 escape = -s;
1182 break;
1183
1184 /* The handling of escape sequences consisting of a string of digits
1185 starting with one that is not zero is not straightforward. Perl has changed
1186 over the years. Nowadays \g{} for backreferences and \o{} for octal are
1187 recommended to avoid the ambiguities in the old syntax.
1188
1189 Outside a character class, the digits are read as a decimal number. If the
1190 number is less than 8 (used to be 10), or if there are that many previous
1191 extracting left brackets, then it is a back reference. Otherwise, up to
1192 three octal digits are read to form an escaped byte. Thus \123 is likely to
1193 be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1194 the octal value is greater than 377, the least significant 8 bits are
1195 taken. \8 and \9 are treated as the literal characters 8 and 9.
1196
1197 Inside a character class, \ followed by a digit is always either a literal
1198 8 or 9 or an octal number. */
1199
1200 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1201 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1202
1203 if (!isclass)
1204 {
1205 oldptr = ptr;
1206 /* The integer range is limited by the machine's int representation. */
1207 s = (int)(c -CHAR_0);
1208 overflow = FALSE;
1209 while (IS_DIGIT(ptr[1]))
1210 {
1211 if (s > INT_MAX / 10 - 1) /* Integer overflow */
1212 {
1213 overflow = TRUE;
1214 break;
1215 }
1216 s = s * 10 + (int)(*(++ptr) - CHAR_0);
1217 }
1218 if (overflow) /* Integer overflow */
1219 {
1220 while (IS_DIGIT(ptr[1]))
1221 ptr++;
1222 *errorcodeptr = ERR61;
1223 break;
1224 }
1225 if (s < 8 || s <= bracount) /* Check for back reference */
1226 {
1227 escape = -s;
1228 break;
1229 }
1230 ptr = oldptr; /* Put the pointer back and fall through */
1231 }
1232
1233 /* Handle a digit following \ when the number is not a back reference. If
1234 the first digit is 8 or 9, Perl used to generate a binary zero byte and
1235 then treat the digit as a following literal. At least by Perl 5.18 this
1236 changed so as not to insert the binary zero. */
1237
1238 if ((c = *ptr) >= CHAR_8) break;
1239
1240 /* Fall through with a digit less than 8 */
1241
1242 /* \0 always starts an octal number, but we may drop through to here with a
1243 larger first octal digit. The original code used just to take the least
1244 significant 8 bits of octal numbers (I think this is what early Perls used
1245 to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1246 but no more than 3 octal digits. */
1247
1248 case CHAR_0:
1249 c -= CHAR_0;
1250 while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1251 c = c * 8 + *(++ptr) - CHAR_0;
1252 #ifdef COMPILE_PCRE8
1253 if (!utf && c > 0xff) *errorcodeptr = ERR51;
1254 #endif
1255 break;
1256
1257 /* \o is a relatively new Perl feature, supporting a more general way of
1258 specifying character codes in octal. The only supported form is \o{ddd}. */
1259
1260 case CHAR_o:
1261 if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
1262 {
1263 ptr += 2;
1264 c = 0;
1265 overflow = FALSE;
1266 while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
1267 {
1268 register pcre_uint32 cc = *ptr++;
1269 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1270 #ifdef COMPILE_PCRE32
1271 if (c >= 0x20000000l) { overflow = TRUE; break; }
1272 #endif
1273 c = (c << 3) + cc - CHAR_0 ;
1274 #if defined COMPILE_PCRE8
1275 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1276 #elif defined COMPILE_PCRE16
1277 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1278 #elif defined COMPILE_PCRE32
1279 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1280 #endif
1281 }
1282 if (overflow)
1283 {
1284 while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1285 *errorcodeptr = ERR34;
1286 }
1287 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1288 {
1289 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1290 }
1291 else *errorcodeptr = ERR80;
1292 }
1293 break;
1294
1295 /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
1296 numbers. Otherwise it is a lowercase x letter. */
1297
1298 case CHAR_x:
1299 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1300 {
1301 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1302 && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1303 {
1304 c = 0;
1305 for (i = 0; i < 2; ++i)
1306 {
1307 register pcre_uint32 cc = *(++ptr);
1308 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1309 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1310 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1311 #else /* EBCDIC coding */
1312 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1313 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1314 #endif
1315 }
1316 }
1317 } /* End JavaScript handling */
1318
1319 /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1320 greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1321 digits. If not, { used to be treated as a data character. However, Perl
1322 seems to read hex digits up to the first non-such, and ignore the rest, so
1323 that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1324 now gives an error. */
1325
1326 else
1327 {
1328 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1329 {
1330 ptr += 2;
1331 c = 0;
1332 overflow = FALSE;
1333 while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
1334 {
1335 register pcre_uint32 cc = *ptr++;
1336 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1337
1338 #ifdef COMPILE_PCRE32
1339 if (c >= 0x10000000l) { overflow = TRUE; break; }
1340 #endif
1341
1342 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1343 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1344 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1345 #else /* EBCDIC coding */
1346 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1347 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1348 #endif
1349
1350 #if defined COMPILE_PCRE8
1351 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1352 #elif defined COMPILE_PCRE16
1353 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1354 #elif defined COMPILE_PCRE32
1355 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1356 #endif
1357 }
1358
1359 if (overflow)
1360 {
1361 while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
1362 *errorcodeptr = ERR34;
1363 }
1364
1365 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1366 {
1367 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1368 }
1369
1370 /* If the sequence of hex digits does not end with '}', give an error.
1371 We used just to recognize this construct and fall through to the normal
1372 \x handling, but nowadays Perl gives an error, which seems much more
1373 sensible, so we do too. */
1374
1375 else *errorcodeptr = ERR79;
1376 } /* End of \x{} processing */
1377
1378 /* Read a single-byte hex-defined char (up to two hex digits after \x) */
1379
1380 else
1381 {
1382 c = 0;
1383 while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1384 {
1385 pcre_uint32 cc; /* Some compilers don't like */
1386 cc = *(++ptr); /* ++ in initializers */
1387 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1388 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1389 c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1390 #else /* EBCDIC coding */
1391 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
1392 c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1393 #endif
1394 }
1395 } /* End of \xdd handling */
1396 } /* End of Perl-style \x handling */
1397 break;
1398
1399 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1400 An error is given if the byte following \c is not an ASCII character. This
1401 coding is ASCII-specific, but then the whole concept of \cx is
1402 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1403
1404 case CHAR_c:
1405 c = *(++ptr);
1406 if (c == CHAR_NULL)
1407 {
1408 *errorcodeptr = ERR2;
1409 break;
1410 }
1411 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1412 if (c > 127) /* Excludes all non-ASCII in either mode */
1413 {
1414 *errorcodeptr = ERR68;
1415 break;
1416 }
1417 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1418 c ^= 0x40;
1419 #else /* EBCDIC coding */
1420 if (c >= CHAR_a && c <= CHAR_z) c += 64;
1421 c ^= 0xC0;
1422 #endif
1423 break;
1424
1425 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1426 other alphanumeric following \ is an error if PCRE_EXTRA was set;
1427 otherwise, for Perl compatibility, it is a literal. This code looks a bit
1428 odd, but there used to be some cases other than the default, and there may
1429 be again in future, so I haven't "optimized" it. */
1430
1431 default:
1432 if ((options & PCRE_EXTRA) != 0) switch(c)
1433 {
1434 default:
1435 *errorcodeptr = ERR3;
1436 break;
1437 }
1438 break;
1439 }
1440 }
1441
1442 /* Perl supports \N{name} for character names, as well as plain \N for "not
1443 newline". PCRE does not support \N{name}. However, it does support
1444 quantification such as \N{2,3}. */
1445
1446 if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1447 !is_counted_repeat(ptr+2))
1448 *errorcodeptr = ERR37;
1449
1450 /* If PCRE_UCP is set, we change the values for \d etc. */
1451
1452 if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
1453 escape += (ESC_DU - ESC_D);
1454
1455 /* Set the pointer to the final character before returning. */
1456
1457 *ptrptr = ptr;
1458 *chptr = c;
1459 return escape;
1460 }
1461
1462
1463
1464 #ifdef SUPPORT_UCP
1465 /*************************************************
1466 * Handle \P and \p *
1467 *************************************************/
1468
1469 /* This function is called after \P or \p has been encountered, provided that
1470 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1471 pointing at the P or p. On exit, it is pointing at the final character of the
1472 escape sequence.
1473
1474 Argument:
1475 ptrptr points to the pattern position pointer
1476 negptr points to a boolean that is set TRUE for negation else FALSE
1477 ptypeptr points to an unsigned int that is set to the type value
1478 pdataptr points to an unsigned int that is set to the detailed property value
1479 errorcodeptr points to the error code variable
1480
1481 Returns: TRUE if the type value was found, or FALSE for an invalid type
1482 */
1483
1484 static BOOL
1485 get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
1486 unsigned int *pdataptr, int *errorcodeptr)
1487 {
1488 pcre_uchar c;
1489 int i, bot, top;
1490 const pcre_uchar *ptr = *ptrptr;
1491 pcre_uchar name[32];
1492
1493 c = *(++ptr);
1494 if (c == CHAR_NULL) goto ERROR_RETURN;
1495
1496 *negptr = FALSE;
1497
1498 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1499 negation. */
1500
1501 if (c == CHAR_LEFT_CURLY_BRACKET)
1502 {
1503 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1504 {
1505 *negptr = TRUE;
1506 ptr++;
1507 }
1508 for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1509 {
1510 c = *(++ptr);
1511 if (c == CHAR_NULL) goto ERROR_RETURN;
1512 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1513 name[i] = c;
1514 }
1515 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1516 name[i] = 0;
1517 }
1518
1519 /* Otherwise there is just one following character */
1520
1521 else
1522 {
1523 name[0] = c;
1524 name[1] = 0;
1525 }
1526
1527 *ptrptr = ptr;
1528
1529 /* Search for a recognized property name using binary chop */
1530
1531 bot = 0;
1532 top = PRIV(utt_size);
1533
1534 while (bot < top)
1535 {
1536 int r;
1537 i = (bot + top) >> 1;
1538 r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1539 if (r == 0)
1540 {
1541 *ptypeptr = PRIV(utt)[i].type;
1542 *pdataptr = PRIV(utt)[i].value;
1543 return TRUE;
1544 }
1545 if (r > 0) bot = i + 1; else top = i;
1546 }
1547
1548 *errorcodeptr = ERR47;
1549 *ptrptr = ptr;
1550 return FALSE;
1551
1552 ERROR_RETURN:
1553 *errorcodeptr = ERR46;
1554 *ptrptr = ptr;
1555 return FALSE;
1556 }
1557 #endif
1558
1559
1560
1561 /*************************************************
1562 * Read repeat counts *
1563 *************************************************/
1564
1565 /* Read an item of the form {n,m} and return the values. This is called only
1566 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1567 so the syntax is guaranteed to be correct, but we need to check the values.
1568
1569 Arguments:
1570 p pointer to first char after '{'
1571 minp pointer to int for min
1572 maxp pointer to int for max
1573 returned as -1 if no max
1574 errorcodeptr points to error code variable
1575
1576 Returns: pointer to '}' on success;
1577 current ptr on error, with errorcodeptr set non-zero
1578 */
1579
1580 static const pcre_uchar *
1581 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1582 {
1583 int min = 0;
1584 int max = -1;
1585
1586 while (IS_DIGIT(*p))
1587 {
1588 min = min * 10 + (int)(*p++ - CHAR_0);
1589 if (min > 65535)
1590 {
1591 *errorcodeptr = ERR5;
1592 return p;
1593 }
1594 }
1595
1596 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1597 {
1598 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1599 {
1600 max = 0;
1601 while(IS_DIGIT(*p))
1602 {
1603 max = max * 10 + (int)(*p++ - CHAR_0);
1604 if (max > 65535)
1605 {
1606 *errorcodeptr = ERR5;
1607 return p;
1608 }
1609 }
1610 if (max < min)
1611 {
1612 *errorcodeptr = ERR4;
1613 return p;
1614 }
1615 }
1616 }
1617
1618 *minp = min;
1619 *maxp = max;
1620 return p;
1621 }
1622
1623
1624
1625 /*************************************************
1626 * Find first significant op code *
1627 *************************************************/
1628
1629 /* This is called by several functions that scan a compiled expression looking
1630 for a fixed first character, or an anchoring op code etc. It skips over things
1631 that do not influence this. For some calls, it makes sense to skip negative
1632 forward and all backward assertions, and also the \b assertion; for others it
1633 does not.
1634
1635 Arguments:
1636 code pointer to the start of the group
1637 skipassert TRUE if certain assertions are to be skipped
1638
1639 Returns: pointer to the first significant opcode
1640 */
1641
1642 static const pcre_uchar*
1643 first_significant_code(const pcre_uchar *code, BOOL skipassert)
1644 {
1645 for (;;)
1646 {
1647 switch ((int)*code)
1648 {
1649 case OP_ASSERT_NOT:
1650 case OP_ASSERTBACK:
1651 case OP_ASSERTBACK_NOT:
1652 if (!skipassert) return code;
1653 do code += GET(code, 1); while (*code == OP_ALT);
1654 code += PRIV(OP_lengths)[*code];
1655 break;
1656
1657 case OP_WORD_BOUNDARY:
1658 case OP_NOT_WORD_BOUNDARY:
1659 if (!skipassert) return code;
1660 /* Fall through */
1661
1662 case OP_CALLOUT:
1663 case OP_CREF:
1664 case OP_DNCREF:
1665 case OP_RREF:
1666 case OP_DNRREF:
1667 case OP_DEF:
1668 code += PRIV(OP_lengths)[*code];
1669 break;
1670
1671 default:
1672 return code;
1673 }
1674 }
1675 /* Control never reaches here */
1676 }
1677
1678
1679
1680 /*************************************************
1681 * Find the fixed length of a branch *
1682 *************************************************/
1683
1684 /* Scan a branch and compute the fixed length of subject that will match it,
1685 if the length is fixed. This is needed for dealing with backward assertions.
1686 In UTF8 mode, the result is in characters rather than bytes. The branch is
1687 temporarily terminated with OP_END when this function is called.
1688
1689 This function is called when a backward assertion is encountered, so that if it
1690 fails, the error message can point to the correct place in the pattern.
1691 However, we cannot do this when the assertion contains subroutine calls,
1692 because they can be forward references. We solve this by remembering this case
1693 and doing the check at the end; a flag specifies which mode we are running in.
1694
1695 Arguments:
1696 code points to the start of the pattern (the bracket)
1697 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
1698 atend TRUE if called when the pattern is complete
1699 cd the "compile data" structure
1700
1701 Returns: the fixed length,
1702 or -1 if there is no fixed length,
1703 or -2 if \C was encountered (in UTF-8 mode only)
1704 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1705 or -4 if an unknown opcode was encountered (internal error)
1706 */
1707
1708 static int
1709 find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
1710 {
1711 int length = -1;
1712
1713 register int branchlength = 0;
1714 register pcre_uchar *cc = code + 1 + LINK_SIZE;
1715
1716 /* Scan along the opcodes for this branch. If we get to the end of the
1717 branch, check the length against that of the other branches. */
1718
1719 for (;;)
1720 {
1721 int d;
1722 pcre_uchar *ce, *cs;
1723 register pcre_uchar op = *cc;
1724
1725 switch (op)
1726 {
1727 /* We only need to continue for OP_CBRA (normal capturing bracket) and
1728 OP_BRA (normal non-capturing bracket) because the other variants of these
1729 opcodes are all concerned with unlimited repeated groups, which of course
1730 are not of fixed length. */
1731
1732 case OP_CBRA:
1733 case OP_BRA:
1734 case OP_ONCE:
1735 case OP_ONCE_NC:
1736 case OP_COND:
1737 d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
1738 if (d < 0) return d;
1739 branchlength += d;
1740 do cc += GET(cc, 1); while (*cc == OP_ALT);
1741 cc += 1 + LINK_SIZE;
1742 break;
1743
1744 /* Reached end of a branch; if it's a ket it is the end of a nested call.
1745 If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1746 an ALT. If it is END it's the end of the outer call. All can be handled by
1747 the same code. Note that we must not include the OP_KETRxxx opcodes here,
1748 because they all imply an unlimited repeat. */
1749
1750 case OP_ALT:
1751 case OP_KET:
1752 case OP_END:
1753 case OP_ACCEPT:
1754 case OP_ASSERT_ACCEPT:
1755 if (length < 0) length = branchlength;
1756 else if (length != branchlength) return -1;
1757 if (*cc != OP_ALT) return length;
1758 cc += 1 + LINK_SIZE;
1759 branchlength = 0;
1760 break;
1761
1762 /* A true recursion implies not fixed length, but a subroutine call may
1763 be OK. If the subroutine is a forward reference, we can't deal with
1764 it until the end of the pattern, so return -3. */
1765
1766 case OP_RECURSE:
1767 if (!atend) return -3;
1768 cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1769 do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1770 if (cc > cs && cc < ce) return -1; /* Recursion */
1771 d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
1772 if (d < 0) return d;
1773 branchlength += d;
1774 cc += 1 + LINK_SIZE;
1775 break;
1776
1777 /* Skip over assertive subpatterns */
1778
1779 case OP_ASSERT:
1780 case OP_ASSERT_NOT:
1781 case OP_ASSERTBACK:
1782 case OP_ASSERTBACK_NOT:
1783 do cc += GET(cc, 1); while (*cc == OP_ALT);
1784 cc += PRIV(OP_lengths)[*cc];
1785 break;
1786
1787 /* Skip over things that don't match chars */
1788
1789 case OP_MARK:
1790 case OP_PRUNE_ARG:
1791 case OP_SKIP_ARG:
1792 case OP_THEN_ARG:
1793 cc += cc[1] + PRIV(OP_lengths)[*cc];
1794 break;
1795
1796 case OP_CALLOUT:
1797 case OP_CIRC:
1798 case OP_CIRCM:
1799 case OP_CLOSE:
1800 case OP_COMMIT:
1801 case OP_CREF:
1802 case OP_DEF:
1803 case OP_DNCREF:
1804 case OP_DNRREF:
1805 case OP_DOLL:
1806 case OP_DOLLM:
1807 case OP_EOD:
1808 case OP_EODN:
1809 case OP_FAIL:
1810 case OP_NOT_WORD_BOUNDARY:
1811 case OP_PRUNE:
1812 case OP_REVERSE:
1813 case OP_RREF:
1814 case OP_SET_SOM:
1815 case OP_SKIP:
1816 case OP_SOD:
1817 case OP_SOM:
1818 case OP_THEN:
1819 case OP_WORD_BOUNDARY:
1820 cc += PRIV(OP_lengths)[*cc];
1821 break;
1822
1823 /* Handle literal characters */
1824
1825 case OP_CHAR:
1826 case OP_CHARI:
1827 case OP_NOT:
1828 case OP_NOTI:
1829 branchlength++;
1830 cc += 2;
1831 #ifdef SUPPORT_UTF
1832 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1833 #endif
1834 break;
1835
1836 /* Handle exact repetitions. The count is already in characters, but we
1837 need to skip over a multibyte character in UTF8 mode. */
1838
1839 case OP_EXACT:
1840 case OP_EXACTI:
1841 case OP_NOTEXACT:
1842 case OP_NOTEXACTI:
1843 branchlength += (int)GET2(cc,1);
1844 cc += 2 + IMM2_SIZE;
1845 #ifdef SUPPORT_UTF
1846 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1847 #endif
1848 break;
1849
1850 case OP_TYPEEXACT:
1851 branchlength += GET2(cc,1);
1852 if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1853 cc += 2;
1854 cc += 1 + IMM2_SIZE + 1;
1855 break;
1856
1857 /* Handle single-char matchers */
1858
1859 case OP_PROP:
1860 case OP_NOTPROP:
1861 cc += 2;
1862 /* Fall through */
1863
1864 case OP_HSPACE:
1865 case OP_VSPACE:
1866 case OP_NOT_HSPACE:
1867 case OP_NOT_VSPACE:
1868 case OP_NOT_DIGIT:
1869 case OP_DIGIT:
1870 case OP_NOT_WHITESPACE:
1871 case OP_WHITESPACE:
1872 case OP_NOT_WORDCHAR:
1873 case OP_WORDCHAR:
1874 case OP_ANY:
1875 case OP_ALLANY:
1876 branchlength++;
1877 cc++;
1878 break;
1879
1880 /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1881 otherwise \C is coded as OP_ALLANY. */
1882
1883 case OP_ANYBYTE:
1884 return -2;
1885
1886 /* Check a class for variable quantification */
1887
1888 case OP_CLASS:
1889 case OP_NCLASS:
1890 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1891 case OP_XCLASS:
1892 /* The original code caused an unsigned overflow in 64 bit systems,
1893 so now we use a conditional statement. */
1894 if (op == OP_XCLASS)
1895 cc += GET(cc, 1);
1896 else
1897 cc += PRIV(OP_lengths)[OP_CLASS];
1898 #else
1899 cc += PRIV(OP_lengths)[OP_CLASS];
1900 #endif
1901
1902 switch (*cc)
1903 {
1904 case OP_CRSTAR:
1905 case OP_CRMINSTAR:
1906 case OP_CRPLUS:
1907 case OP_CRMINPLUS:
1908 case OP_CRQUERY:
1909 case OP_CRMINQUERY:
1910 case OP_CRPOSSTAR:
1911 case OP_CRPOSPLUS:
1912 case OP_CRPOSQUERY:
1913 return -1;
1914
1915 case OP_CRRANGE:
1916 case OP_CRMINRANGE:
1917 case OP_CRPOSRANGE:
1918 if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1919 branchlength += (int)GET2(cc,1);
1920 cc += 1 + 2 * IMM2_SIZE;
1921 break;
1922
1923 default:
1924 branchlength++;
1925 }
1926 break;
1927
1928 /* Anything else is variable length */
1929
1930 case OP_ANYNL:
1931 case OP_BRAMINZERO:
1932 case OP_BRAPOS:
1933 case OP_BRAPOSZERO:
1934 case OP_BRAZERO:
1935 case OP_CBRAPOS:
1936 case OP_EXTUNI:
1937 case OP_KETRMAX:
1938 case OP_KETRMIN:
1939 case OP_KETRPOS:
1940 case OP_MINPLUS:
1941 case OP_MINPLUSI:
1942 case OP_MINQUERY:
1943 case OP_MINQUERYI:
1944 case OP_MINSTAR:
1945 case OP_MINSTARI:
1946 case OP_MINUPTO:
1947 case OP_MINUPTOI:
1948 case OP_NOTMINPLUS:
1949 case OP_NOTMINPLUSI:
1950 case OP_NOTMINQUERY:
1951 case OP_NOTMINQUERYI:
1952 case OP_NOTMINSTAR:
1953 case OP_NOTMINSTARI:
1954 case OP_NOTMINUPTO:
1955 case OP_NOTMINUPTOI:
1956 case OP_NOTPLUS:
1957 case OP_NOTPLUSI:
1958 case OP_NOTPOSPLUS:
1959 case OP_NOTPOSPLUSI:
1960 case OP_NOTPOSQUERY:
1961 case OP_NOTPOSQUERYI:
1962 case OP_NOTPOSSTAR:
1963 case OP_NOTPOSSTARI:
1964 case OP_NOTPOSUPTO:
1965 case OP_NOTPOSUPTOI:
1966 case OP_NOTQUERY:
1967 case OP_NOTQUERYI:
1968 case OP_NOTSTAR:
1969 case OP_NOTSTARI:
1970 case OP_NOTUPTO:
1971 case OP_NOTUPTOI:
1972 case OP_PLUS:
1973 case OP_PLUSI:
1974 case OP_POSPLUS:
1975 case OP_POSPLUSI:
1976 case OP_POSQUERY:
1977 case OP_POSQUERYI:
1978 case OP_POSSTAR:
1979 case OP_POSSTARI:
1980 case OP_POSUPTO:
1981 case OP_POSUPTOI:
1982 case OP_QUERY:
1983 case OP_QUERYI:
1984 case OP_REF:
1985 case OP_REFI:
1986 case OP_DNREF:
1987 case OP_DNREFI:
1988 case OP_SBRA:
1989 case OP_SBRAPOS:
1990 case OP_SCBRA:
1991 case OP_SCBRAPOS:
1992 case OP_SCOND:
1993 case OP_SKIPZERO:
1994 case OP_STAR:
1995 case OP_STARI:
1996 case OP_TYPEMINPLUS:
1997 case OP_TYPEMINQUERY:
1998 case OP_TYPEMINSTAR:
1999 case OP_TYPEMINUPTO:
2000 case OP_TYPEPLUS:
2001 case OP_TYPEPOSPLUS:
2002 case OP_TYPEPOSQUERY:
2003 case OP_TYPEPOSSTAR:
2004 case OP_TYPEPOSUPTO:
2005 case OP_TYPEQUERY:
2006 case OP_TYPESTAR:
2007 case OP_TYPEUPTO:
2008 case OP_UPTO:
2009 case OP_UPTOI:
2010 return -1;
2011
2012 /* Catch unrecognized opcodes so that when new ones are added they
2013 are not forgotten, as has happened in the past. */
2014
2015 default:
2016 return -4;
2017 }
2018 }
2019 /* Control never gets here */
2020 }
2021
2022
2023
2024 /*************************************************
2025 * Scan compiled regex for specific bracket *
2026 *************************************************/
2027
2028 /* This little function scans through a compiled pattern until it finds a
2029 capturing bracket with the given number, or, if the number is negative, an
2030 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
2031 so that it can be called from pcre_study() when finding the minimum matching
2032 length.
2033
2034 Arguments:
2035 code points to start of expression
2036 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2037 number the required bracket number or negative to find a lookbehind
2038
2039 Returns: pointer to the opcode for the bracket, or NULL if not found
2040 */
2041
2042 const pcre_uchar *
2043 PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2044 {
2045 for (;;)
2046 {
2047 register pcre_uchar c = *code;
2048
2049 if (c == OP_END) return NULL;
2050
2051 /* XCLASS is used for classes that cannot be represented just by a bit
2052 map. This includes negated single high-valued characters. The length in
2053 the table is zero; the actual length is stored in the compiled code. */
2054
2055 if (c == OP_XCLASS) code += GET(code, 1);
2056
2057 /* Handle recursion */
2058
2059 else if (c == OP_REVERSE)
2060 {
2061 if (number < 0) return (pcre_uchar *)code;
2062 code += PRIV(OP_lengths)[c];
2063 }
2064
2065 /* Handle capturing bracket */
2066
2067 else if (c == OP_CBRA || c == OP_SCBRA ||
2068 c == OP_CBRAPOS || c == OP_SCBRAPOS)
2069 {
2070 int n = (int)GET2(code, 1+LINK_SIZE);
2071 if (n == number) return (pcre_uchar *)code;
2072 code += PRIV(OP_lengths)[c];
2073 }
2074
2075 /* Otherwise, we can get the item's length from the table, except that for
2076 repeated character types, we have to test for \p and \P, which have an extra
2077 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2078 must add in its length. */
2079
2080 else
2081 {
2082 switch(c)
2083 {
2084 case OP_TYPESTAR:
2085 case OP_TYPEMINSTAR:
2086 case OP_TYPEPLUS:
2087 case OP_TYPEMINPLUS:
2088 case OP_TYPEQUERY:
2089 case OP_TYPEMINQUERY:
2090 case OP_TYPEPOSSTAR:
2091 case OP_TYPEPOSPLUS:
2092 case OP_TYPEPOSQUERY:
2093 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2094 break;
2095
2096 case OP_TYPEUPTO:
2097 case OP_TYPEMINUPTO:
2098 case OP_TYPEEXACT:
2099 case OP_TYPEPOSUPTO:
2100 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2101 code += 2;
2102 break;
2103
2104 case OP_MARK:
2105 case OP_PRUNE_ARG:
2106 case OP_SKIP_ARG:
2107 case OP_THEN_ARG:
2108 code += code[1];
2109 break;
2110 }
2111
2112 /* Add in the fixed length from the table */
2113
2114 code += PRIV(OP_lengths)[c];
2115
2116 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2117 a multi-byte character. The length in the table is a minimum, so we have to
2118 arrange to skip the extra bytes. */
2119
2120 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2121 if (utf) switch(c)
2122 {
2123 case OP_CHAR:
2124 case OP_CHARI:
2125 case OP_EXACT:
2126 case OP_EXACTI:
2127 case OP_UPTO:
2128 case OP_UPTOI:
2129 case OP_MINUPTO:
2130 case OP_MINUPTOI:
2131 case OP_POSUPTO:
2132 case OP_POSUPTOI:
2133 case OP_STAR:
2134 case OP_STARI:
2135 case OP_MINSTAR:
2136 case OP_MINSTARI:
2137 case OP_POSSTAR:
2138 case OP_POSSTARI:
2139 case OP_PLUS:
2140 case OP_PLUSI:
2141 case OP_MINPLUS:
2142 case OP_MINPLUSI:
2143 case OP_POSPLUS:
2144 case OP_POSPLUSI:
2145 case OP_QUERY:
2146 case OP_QUERYI:
2147 case OP_MINQUERY:
2148 case OP_MINQUERYI:
2149 case OP_POSQUERY:
2150 case OP_POSQUERYI:
2151 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2152 break;
2153 }
2154 #else
2155 (void)(utf); /* Keep compiler happy by referencing function argument */
2156 #endif
2157 }
2158 }
2159 }
2160
2161
2162
2163 /*************************************************
2164 * Scan compiled regex for recursion reference *
2165 *************************************************/
2166
2167 /* This little function scans through a compiled pattern until it finds an
2168 instance of OP_RECURSE.
2169
2170 Arguments:
2171 code points to start of expression
2172 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2173
2174 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
2175 */
2176
2177 static const pcre_uchar *
2178 find_recurse(const pcre_uchar *code, BOOL utf)
2179 {
2180 for (;;)
2181 {
2182 register pcre_uchar c = *code;
2183 if (c == OP_END) return NULL;
2184 if (c == OP_RECURSE) return code;
2185
2186 /* XCLASS is used for classes that cannot be represented just by a bit
2187 map. This includes negated single high-valued characters. The length in
2188 the table is zero; the actual length is stored in the compiled code. */
2189
2190 if (c == OP_XCLASS) code += GET(code, 1);
2191
2192 /* Otherwise, we can get the item's length from the table, except that for
2193 repeated character types, we have to test for \p and \P, which have an extra
2194 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2195 must add in its length. */
2196
2197 else
2198 {
2199 switch(c)
2200 {
2201 case OP_TYPESTAR:
2202 case OP_TYPEMINSTAR:
2203 case OP_TYPEPLUS:
2204 case OP_TYPEMINPLUS:
2205 case OP_TYPEQUERY:
2206 case OP_TYPEMINQUERY:
2207 case OP_TYPEPOSSTAR:
2208 case OP_TYPEPOSPLUS:
2209 case OP_TYPEPOSQUERY:
2210 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2211 break;
2212
2213 case OP_TYPEPOSUPTO:
2214 case OP_TYPEUPTO:
2215 case OP_TYPEMINUPTO:
2216 case OP_TYPEEXACT:
2217 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2218 code += 2;
2219 break;
2220
2221 case OP_MARK:
2222 case OP_PRUNE_ARG:
2223 case OP_SKIP_ARG:
2224 case OP_THEN_ARG:
2225 code += code[1];
2226 break;
2227 }
2228
2229 /* Add in the fixed length from the table */
2230
2231 code += PRIV(OP_lengths)[c];
2232
2233 /* In UTF-8 mode, opcodes that are followed by a character may be followed
2234 by a multi-byte character. The length in the table is a minimum, so we have
2235 to arrange to skip the extra bytes. */
2236
2237 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2238 if (utf) switch(c)
2239 {
2240 case OP_CHAR:
2241 case OP_CHARI:
2242 case OP_NOT:
2243 case OP_NOTI:
2244 case OP_EXACT:
2245 case OP_EXACTI:
2246 case OP_NOTEXACT:
2247 case OP_NOTEXACTI:
2248 case OP_UPTO:
2249 case OP_UPTOI:
2250 case OP_NOTUPTO:
2251 case OP_NOTUPTOI:
2252 case OP_MINUPTO:
2253 case OP_MINUPTOI:
2254 case OP_NOTMINUPTO:
2255 case OP_NOTMINUPTOI:
2256 case OP_POSUPTO:
2257 case OP_POSUPTOI:
2258 case OP_NOTPOSUPTO:
2259 case OP_NOTPOSUPTOI:
2260 case OP_STAR:
2261 case OP_STARI:
2262 case OP_NOTSTAR:
2263 case OP_NOTSTARI:
2264 case OP_MINSTAR:
2265 case OP_MINSTARI:
2266 case OP_NOTMINSTAR:
2267 case OP_NOTMINSTARI:
2268 case OP_POSSTAR:
2269 case OP_POSSTARI:
2270 case OP_NOTPOSSTAR:
2271 case OP_NOTPOSSTARI:
2272 case OP_PLUS:
2273 case OP_PLUSI:
2274 case OP_NOTPLUS:
2275 case OP_NOTPLUSI:
2276 case OP_MINPLUS:
2277 case OP_MINPLUSI:
2278 case OP_NOTMINPLUS:
2279 case OP_NOTMINPLUSI:
2280 case OP_POSPLUS:
2281 case OP_POSPLUSI:
2282 case OP_NOTPOSPLUS:
2283 case OP_NOTPOSPLUSI:
2284 case OP_QUERY:
2285 case OP_QUERYI:
2286 case OP_NOTQUERY:
2287 case OP_NOTQUERYI:
2288 case OP_MINQUERY:
2289 case OP_MINQUERYI:
2290 case OP_NOTMINQUERY:
2291 case OP_NOTMINQUERYI:
2292 case OP_POSQUERY:
2293 case OP_POSQUERYI:
2294 case OP_NOTPOSQUERY:
2295 case OP_NOTPOSQUERYI:
2296 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2297 break;
2298 }
2299 #else
2300 (void)(utf); /* Keep compiler happy by referencing function argument */
2301 #endif
2302 }
2303 }
2304 }
2305
2306
2307
2308 /*************************************************
2309 * Scan compiled branch for non-emptiness *
2310 *************************************************/
2311
2312 /* This function scans through a branch of a compiled pattern to see whether it
2313 can match the empty string or not. It is called from could_be_empty()
2314 below and from compile_branch() when checking for an unlimited repeat of a
2315 group that can match nothing. Note that first_significant_code() skips over
2316 backward and negative forward assertions when its final argument is TRUE. If we
2317 hit an unclosed bracket, we return "empty" - this means we've struck an inner
2318 bracket whose current branch will already have been scanned.
2319
2320 Arguments:
2321 code points to start of search
2322 endcode points to where to stop
2323 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2324 cd contains pointers to tables etc.
2325 recurses chain of recurse_check to catch mutual recursion
2326
2327 Returns: TRUE if what is matched could be empty
2328 */
2329
2330 typedef struct recurse_check {
2331 struct recurse_check *prev;
2332 const pcre_uchar *group;
2333 } recurse_check;
2334
2335 static BOOL
2336 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2337 BOOL utf, compile_data *cd, recurse_check *recurses)
2338 {
2339 register pcre_uchar c;
2340 recurse_check this_recurse;
2341
2342 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2343 code < endcode;
2344 code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2345 {
2346 const pcre_uchar *ccode;
2347
2348 c = *code;
2349
2350 /* Skip over forward assertions; the other assertions are skipped by
2351 first_significant_code() with a TRUE final argument. */
2352
2353 if (c == OP_ASSERT)
2354 {
2355 do code += GET(code, 1); while (*code == OP_ALT);
2356 c = *code;
2357 continue;
2358 }
2359
2360 /* For a recursion/subroutine call, if its end has been reached, which
2361 implies a backward reference subroutine call, we can scan it. If it's a
2362 forward reference subroutine call, we can't. To detect forward reference
2363 we have to scan up the list that is kept in the workspace. This function is
2364 called only when doing the real compile, not during the pre-compile that
2365 measures the size of the compiled pattern. */
2366
2367 if (c == OP_RECURSE)
2368 {
2369 const pcre_uchar *scode = cd->start_code + GET(code, 1);
2370 BOOL empty_branch;
2371
2372 /* Test for forward reference or uncompleted reference. This is disabled
2373 when called to scan a completed pattern by setting cd->start_workspace to
2374 NULL. */
2375
2376 if (cd->start_workspace != NULL)
2377 {
2378 const pcre_uchar *tcode;
2379 for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2380 if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2381 if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
2382 }
2383
2384 /* If we are scanning a completed pattern, there are no forward references
2385 and all groups are complete. We need to detect whether this is a recursive
2386 call, as otherwise there will be an infinite loop. If it is a recursion,
2387 just skip over it. Simple recursions are easily detected. For mutual
2388 recursions we keep a chain on the stack. */
2389
2390 else
2391 {
2392 recurse_check *r = recurses;
2393 const pcre_uchar *endgroup = scode;
2394
2395 do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2396 if (code >= scode && code <= endgroup) continue; /* Simple recursion */
2397
2398 for (r = recurses; r != NULL; r = r->prev)
2399 if (r->group == scode) break;
2400 if (r != NULL) continue; /* Mutual recursion */
2401 }
2402
2403 /* Completed reference; scan the referenced group, remembering it on the
2404 stack chain to detect mutual recursions. */
2405
2406 empty_branch = FALSE;
2407 this_recurse.prev = recurses;
2408 this_recurse.group = scode;
2409
2410 do
2411 {
2412 if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
2413 {
2414 empty_branch = TRUE;
2415 break;
2416 }
2417 scode += GET(scode, 1);
2418 }
2419 while (*scode == OP_ALT);
2420
2421 if (!empty_branch) return FALSE; /* All branches are non-empty */
2422 continue;
2423 }
2424
2425 /* Groups with zero repeats can of course be empty; skip them. */
2426
2427 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2428 c == OP_BRAPOSZERO)
2429 {
2430 code += PRIV(OP_lengths)[c];
2431 do code += GET(code, 1); while (*code == OP_ALT);
2432 c = *code;
2433 continue;
2434 }
2435
2436 /* A nested group that is already marked as "could be empty" can just be
2437 skipped. */
2438
2439 if (c == OP_SBRA || c == OP_SBRAPOS ||
2440 c == OP_SCBRA || c == OP_SCBRAPOS)
2441 {
2442 do code += GET(code, 1); while (*code == OP_ALT);
2443 c = *code;
2444 continue;
2445 }
2446
2447 /* For other groups, scan the branches. */
2448
2449 if (c == OP_BRA || c == OP_BRAPOS ||
2450 c == OP_CBRA || c == OP_CBRAPOS ||
2451 c == OP_ONCE || c == OP_ONCE_NC ||
2452 c == OP_COND)
2453 {
2454 BOOL empty_branch;
2455 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
2456
2457 /* If a conditional group has only one branch, there is a second, implied,
2458 empty branch, so just skip over the conditional, because it could be empty.
2459 Otherwise, scan the individual branches of the group. */
2460
2461 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2462 code += GET(code, 1);
2463 else
2464 {
2465 empty_branch = FALSE;
2466 do
2467 {
2468 if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd, NULL))
2469 empty_branch = TRUE;
2470 code += GET(code, 1);
2471 }
2472 while (*code == OP_ALT);
2473 if (!empty_branch) return FALSE; /* All branches are non-empty */
2474 }
2475
2476 c = *code;
2477 continue;
2478 }
2479
2480 /* Handle the other opcodes */
2481
2482 switch (c)
2483 {
2484 /* Check for quantifiers after a class. XCLASS is used for classes that
2485 cannot be represented just by a bit map. This includes negated single
2486 high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2487 actual length is stored in the compiled code, so we must update "code"
2488 here. */
2489
2490 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2491 case OP_XCLASS:
2492 ccode = code += GET(code, 1);
2493 goto CHECK_CLASS_REPEAT;
2494 #endif
2495
2496 case OP_CLASS:
2497 case OP_NCLASS:
2498 ccode = code + PRIV(OP_lengths)[OP_CLASS];
2499
2500 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2501 CHECK_CLASS_REPEAT:
2502 #endif
2503
2504 switch (*ccode)
2505 {
2506 case OP_CRSTAR: /* These could be empty; continue */
2507 case OP_CRMINSTAR:
2508 case OP_CRQUERY:
2509 case OP_CRMINQUERY:
2510 case OP_CRPOSSTAR:
2511 case OP_CRPOSQUERY:
2512 break;
2513
2514 default: /* Non-repeat => class must match */
2515 case OP_CRPLUS: /* These repeats aren't empty */
2516 case OP_CRMINPLUS:
2517 case OP_CRPOSPLUS:
2518 return FALSE;
2519
2520 case OP_CRRANGE:
2521 case OP_CRMINRANGE:
2522 case OP_CRPOSRANGE:
2523 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2524 break;
2525 }
2526 break;
2527
2528 /* Opcodes that must match a character */
2529
2530 case OP_ANY:
2531 case OP_ALLANY:
2532 case OP_ANYBYTE:
2533
2534 case OP_PROP:
2535 case OP_NOTPROP:
2536 case OP_ANYNL:
2537
2538 case OP_NOT_HSPACE:
2539 case OP_HSPACE:
2540 case OP_NOT_VSPACE:
2541 case OP_VSPACE:
2542 case OP_EXTUNI:
2543
2544 case OP_NOT_DIGIT:
2545 case OP_DIGIT:
2546 case OP_NOT_WHITESPACE:
2547 case OP_WHITESPACE:
2548 case OP_NOT_WORDCHAR:
2549 case OP_WORDCHAR:
2550
2551 case OP_CHAR:
2552 case OP_CHARI:
2553 case OP_NOT:
2554 case OP_NOTI:
2555
2556 case OP_PLUS:
2557 case OP_PLUSI:
2558 case OP_MINPLUS:
2559 case OP_MINPLUSI:
2560
2561 case OP_NOTPLUS:
2562 case OP_NOTPLUSI:
2563 case OP_NOTMINPLUS:
2564 case OP_NOTMINPLUSI:
2565
2566 case OP_POSPLUS:
2567 case OP_POSPLUSI:
2568 case OP_NOTPOSPLUS:
2569 case OP_NOTPOSPLUSI:
2570
2571 case OP_EXACT:
2572 case OP_EXACTI:
2573 case OP_NOTEXACT:
2574 case OP_NOTEXACTI:
2575
2576 case OP_TYPEPLUS:
2577 case OP_TYPEMINPLUS:
2578 case OP_TYPEPOSPLUS:
2579 case OP_TYPEEXACT:
2580
2581 return FALSE;
2582
2583 /* These are going to continue, as they may be empty, but we have to
2584 fudge the length for the \p and \P cases. */
2585
2586 case OP_TYPESTAR:
2587 case OP_TYPEMINSTAR:
2588 case OP_TYPEPOSSTAR:
2589 case OP_TYPEQUERY:
2590 case OP_TYPEMINQUERY:
2591 case OP_TYPEPOSQUERY:
2592 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2593 break;
2594
2595 /* Same for these */
2596
2597 case OP_TYPEUPTO:
2598 case OP_TYPEMINUPTO:
2599 case OP_TYPEPOSUPTO:
2600 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2601 code += 2;
2602 break;
2603
2604 /* End of branch */
2605
2606 case OP_KET:
2607 case OP_KETRMAX:
2608 case OP_KETRMIN:
2609 case OP_KETRPOS:
2610 case OP_ALT:
2611 return TRUE;
2612
2613 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2614 MINUPTO, and POSUPTO and their caseless and negative versions may be
2615 followed by a multibyte character. */
2616
2617 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2618 case OP_STAR:
2619 case OP_STARI:
2620 case OP_NOTSTAR:
2621 case OP_NOTSTARI:
2622
2623 case OP_MINSTAR:
2624 case OP_MINSTARI:
2625 case OP_NOTMINSTAR:
2626 case OP_NOTMINSTARI:
2627
2628 case OP_POSSTAR:
2629 case OP_POSSTARI:
2630 case OP_NOTPOSSTAR:
2631 case OP_NOTPOSSTARI:
2632
2633 case OP_QUERY:
2634 case OP_QUERYI:
2635 case OP_NOTQUERY:
2636 case OP_NOTQUERYI:
2637
2638 case OP_MINQUERY:
2639 case OP_MINQUERYI:
2640 case OP_NOTMINQUERY:
2641 case OP_NOTMINQUERYI:
2642
2643 case OP_POSQUERY:
2644 case OP_POSQUERYI:
2645 case OP_NOTPOSQUERY:
2646 case OP_NOTPOSQUERYI:
2647
2648 if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2649 break;
2650
2651 case OP_UPTO:
2652 case OP_UPTOI:
2653 case OP_NOTUPTO:
2654 case OP_NOTUPTOI:
2655
2656 case OP_MINUPTO:
2657 case OP_MINUPTOI:
2658 case OP_NOTMINUPTO:
2659 case OP_NOTMINUPTOI:
2660
2661 case OP_POSUPTO:
2662 case OP_POSUPTOI:
2663 case OP_NOTPOSUPTO:
2664 case OP_NOTPOSUPTOI:
2665
2666 if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2667 break;
2668 #endif
2669
2670 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2671 string. */
2672
2673 case OP_MARK:
2674 case OP_PRUNE_ARG:
2675 case OP_SKIP_ARG:
2676 case OP_THEN_ARG:
2677 code += code[1];
2678 break;
2679
2680 /* None of the remaining opcodes are required to match a character. */
2681
2682 default:
2683 break;
2684 }
2685 }
2686
2687 return TRUE;
2688 }
2689
2690
2691
2692 /*************************************************
2693 * Scan compiled regex for non-emptiness *
2694 *************************************************/
2695
2696 /* This function is called to check for left recursive calls. We want to check
2697 the current branch of the current pattern to see if it could match the empty
2698 string. If it could, we must look outwards for branches at other levels,
2699 stopping when we pass beyond the bracket which is the subject of the recursion.
2700 This function is called only during the real compile, not during the
2701 pre-compile.
2702
2703 Arguments:
2704 code points to start of the recursion
2705 endcode points to where to stop (current RECURSE item)
2706 bcptr points to the chain of current (unclosed) branch starts
2707 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2708 cd pointers to tables etc
2709
2710 Returns: TRUE if what is matched could be empty
2711 */
2712
2713 static BOOL
2714 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2715 branch_chain *bcptr, BOOL utf, compile_data *cd)
2716 {
2717 while (bcptr != NULL && bcptr->current_branch >= code)
2718 {
2719 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL))
2720 return FALSE;
2721 bcptr = bcptr->outer;
2722 }
2723 return TRUE;
2724 }
2725
2726
2727
2728 /*************************************************
2729 * Base opcode of repeated opcodes *
2730 *************************************************/
2731
2732 /* Returns the base opcode for repeated single character type opcodes. If the
2733 opcode is not a repeated character type, it returns with the original value.
2734
2735 Arguments: c opcode
2736 Returns: base opcode for the type
2737 */
2738
2739 static pcre_uchar
2740 get_repeat_base(pcre_uchar c)
2741 {
2742 return (c > OP_TYPEPOSUPTO)? c :
2743 (c >= OP_TYPESTAR)? OP_TYPESTAR :
2744 (c >= OP_NOTSTARI)? OP_NOTSTARI :
2745 (c >= OP_NOTSTAR)? OP_NOTSTAR :
2746 (c >= OP_STARI)? OP_STARI :
2747 OP_STAR;
2748 }
2749
2750
2751
2752 #ifdef SUPPORT_UCP
2753 /*************************************************
2754 * Check a character and a property *
2755 *************************************************/
2756
2757 /* This function is called by check_auto_possessive() when a property item
2758 is adjacent to a fixed character.
2759
2760 Arguments:
2761 c the character
2762 ptype the property type
2763 pdata the data for the type
2764 negated TRUE if it's a negated property (\P or \p{^)
2765
2766 Returns: TRUE if auto-possessifying is OK
2767 */
2768
2769 static BOOL
2770 check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
2771 BOOL negated)
2772 {
2773 const pcre_uint32 *p;
2774 const ucd_record *prop = GET_UCD(c);
2775
2776 switch(ptype)
2777 {
2778 case PT_LAMP:
2779 return (prop->chartype == ucp_Lu ||
2780 prop->chartype == ucp_Ll ||
2781 prop->chartype == ucp_Lt) == negated;
2782
2783 case PT_GC:
2784 return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2785
2786 case PT_PC:
2787 return (pdata == prop->chartype) == negated;
2788
2789 case PT_SC:
2790 return (pdata == prop->script) == negated;
2791
2792 /* These are specials */
2793
2794 case PT_ALNUM:
2795 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2796 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2797
2798 /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
2799 means that Perl space and POSIX space are now identical. PCRE was changed
2800 at release 8.34. */
2801
2802 case PT_SPACE: /* Perl space */
2803 case PT_PXSPACE: /* POSIX space */
2804 switch(c)
2805 {
2806 HSPACE_CASES:
2807 VSPACE_CASES:
2808 return negated;
2809
2810 default:
2811 return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
2812 }
2813 break; /* Control never reaches here */
2814
2815 case PT_WORD:
2816 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2817 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2818 c == CHAR_UNDERSCORE) == negated;
2819
2820 case PT_CLIST:
2821 p = PRIV(ucd_caseless_sets) + prop->caseset;
2822 for (;;)
2823 {
2824 if (c < *p) return !negated;
2825 if (c == *p++) return negated;
2826 }
2827 break; /* Control never reaches here */
2828 }
2829
2830 return FALSE;
2831 }
2832 #endif /* SUPPORT_UCP */
2833
2834
2835
2836 /*************************************************
2837 * Fill the character property list *
2838 *************************************************/
2839
2840 /* Checks whether the code points to an opcode that can take part in auto-
2841 possessification, and if so, fills a list with its properties.
2842
2843 Arguments:
2844 code points to start of expression
2845 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2846 fcc points to case-flipping table
2847 list points to output list
2848 list[0] will be filled with the opcode
2849 list[1] will be non-zero if this opcode
2850 can match an empty character string
2851 list[2..7] depends on the opcode
2852
2853 Returns: points to the start of the next opcode if *code is accepted
2854 NULL if *code is not accepted
2855 */
2856
2857 static const pcre_uchar *
2858 get_chr_property_list(const pcre_uchar *code, BOOL utf,
2859 const pcre_uint8 *fcc, pcre_uint32 *list)
2860 {
2861 pcre_uchar c = *code;
2862 pcre_uchar base;
2863 const pcre_uchar *end;
2864 pcre_uint32 chr;
2865
2866 #ifdef SUPPORT_UCP
2867 pcre_uint32 *clist_dest;
2868 const pcre_uint32 *clist_src;
2869 #else
2870 utf = utf; /* Suppress "unused parameter" compiler warning */
2871 #endif
2872
2873 list[0] = c;
2874 list[1] = FALSE;
2875 code++;
2876
2877 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
2878 {
2879 base = get_repeat_base(c);
2880 c -= (base - OP_STAR);
2881
2882 if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
2883 code += IMM2_SIZE;
2884
2885 list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
2886
2887 switch(base)
2888 {
2889 case OP_STAR:
2890 list[0] = OP_CHAR;
2891 break;
2892
2893 case OP_STARI:
2894 list[0] = OP_CHARI;
2895 break;
2896
2897 case OP_NOTSTAR:
2898 list[0] = OP_NOT;
2899 break;
2900
2901 case OP_NOTSTARI:
2902 list[0] = OP_NOTI;
2903 break;
2904
2905 case OP_TYPESTAR:
2906 list[0] = *code;
2907 code++;
2908 break;
2909 }
2910 c = list[0];
2911 }
2912
2913 switch(c)
2914 {
2915 case OP_NOT_DIGIT:
2916 case OP_DIGIT:
2917 case OP_NOT_WHITESPACE:
2918 case OP_WHITESPACE:
2919 case OP_NOT_WORDCHAR:
2920 case OP_WORDCHAR:
2921 case OP_ANY:
2922 case OP_ALLANY:
2923 case OP_ANYNL:
2924 case OP_NOT_HSPACE:
2925 case OP_HSPACE:
2926 case OP_NOT_VSPACE:
2927 case OP_VSPACE:
2928 case OP_EXTUNI:
2929 case OP_EODN:
2930 case OP_EOD:
2931 case OP_DOLL:
2932 case OP_DOLLM:
2933 return code;
2934
2935 case OP_CHAR:
2936 case OP_NOT:
2937 GETCHARINCTEST(chr, code);
2938 list[2] = chr;
2939 list[3] = NOTACHAR;
2940 return code;
2941
2942 case OP_CHARI:
2943 case OP_NOTI:
2944 list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
2945 GETCHARINCTEST(chr, code);
2946 list[2] = chr;
2947
2948 #ifdef SUPPORT_UCP
2949 if (chr < 128 || (chr < 256 && !utf))
2950 list[3] = fcc[chr];
2951 else
2952 list[3] = UCD_OTHERCASE(chr);
2953 #elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
2954 list[3] = (chr < 256) ? fcc[chr] : chr;
2955 #else
2956 list[3] = fcc[chr];
2957 #endif
2958
2959 /* The othercase might be the same value. */
2960
2961 if (chr == list[3])
2962 list[3] = NOTACHAR;
2963 else
2964 list[4] = NOTACHAR;
2965 return code;
2966
2967 #ifdef SUPPORT_UCP
2968 case OP_PROP:
2969 case OP_NOTPROP:
2970 if (code[0] != PT_CLIST)
2971 {
2972 list[2] = code[0];
2973 list[3] = code[1];
2974 return code + 2;
2975 }
2976
2977 /* Convert only if we have enough space. */
2978
2979 clist_src = PRIV(ucd_caseless_sets) + code[1];
2980 clist_dest = list + 2;
2981 code += 2;
2982
2983 do {
2984 if (clist_dest >= list + 8)
2985 {
2986 /* Early return if there is not enough space. This should never
2987 happen, since all clists are shorter than 5 character now. */
2988 list[2] = code[0];
2989 list[3] = code[1];
2990 return code;
2991 }
2992 *clist_dest++ = *clist_src;
2993 }
2994 while(*clist_src++ != NOTACHAR);
2995
2996 /* All characters are stored. The terminating NOTACHAR
2997 is copied form the clist itself. */
2998
2999 list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
3000 return code;
3001 #endif
3002
3003 case OP_NCLASS:
3004 case OP_CLASS:
3005 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3006 case OP_XCLASS:
3007 if (c == OP_XCLASS)
3008 end = code + GET(code, 0) - 1;
3009 else
3010 #endif
3011 end = code + 32 / sizeof(pcre_uchar);
3012
3013 switch(*end)
3014 {
3015 case OP_CRSTAR:
3016 case OP_CRMINSTAR:
3017 case OP_CRQUERY:
3018 case OP_CRMINQUERY:
3019 case OP_CRPOSSTAR:
3020 case OP_CRPOSQUERY:
3021 list[1] = TRUE;
3022 end++;
3023 break;
3024
3025 case OP_CRPLUS:
3026 case OP_CRMINPLUS:
3027 case OP_CRPOSPLUS:
3028 end++;
3029 break;
3030
3031 case OP_CRRANGE:
3032 case OP_CRMINRANGE:
3033 case OP_CRPOSRANGE:
3034 list[1] = (GET2(end, 1) == 0);
3035 end += 1 + 2 * IMM2_SIZE;
3036 break;
3037 }
3038 list[2] = end - code;
3039 return end;
3040 }
3041 return NULL; /* Opcode not accepted */
3042 }
3043
3044
3045
3046 /*************************************************
3047 * Scan further character sets for match *
3048 *************************************************/
3049
3050 /* Checks whether the base and the current opcode have a common character, in
3051 which case the base cannot be possessified.
3052
3053 Arguments:
3054 code points to the byte code
3055 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3056 cd static compile data
3057 base_list the data list of the base opcode
3058
3059 Returns: TRUE if the auto-possessification is possible
3060 */
3061
3062 static BOOL
3063 compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
3064 const pcre_uint32 *base_list, const pcre_uchar *base_end)
3065 {
3066 pcre_uchar c;
3067 pcre_uint32 list[8];
3068 const pcre_uint32 *chr_ptr;
3069 const pcre_uint32 *ochr_ptr;
3070 const pcre_uint32 *list_ptr;
3071 const pcre_uchar *next_code;
3072 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3073 const pcre_uchar *xclass_flags;
3074 #endif
3075 const pcre_uint8 *class_bitset;
3076 const pcre_uint8 *set1, *set2, *set_end;
3077 pcre_uint32 chr;
3078 BOOL accepted, invert_bits;
3079
3080 /* Note: the base_list[1] contains whether the current opcode has greedy
3081 (represented by a non-zero value) quantifier. This is a different from
3082 other character type lists, which stores here that the character iterator
3083 matches to an empty string (also represented by a non-zero value). */
3084
3085 for(;;)
3086 {
3087 /* All operations move the code pointer forward.
3088 Therefore infinite recursions are not possible. */
3089
3090 c = *code;
3091
3092 /* Skip over callouts */
3093
3094 if (c == OP_CALLOUT)
3095 {
3096 code += PRIV(OP_lengths)[c];
3097 continue;
3098 }
3099
3100 if (c == OP_ALT)
3101 {
3102 do code += GET(code, 1); while (*code == OP_ALT);
3103 c = *code;
3104 }
3105
3106 switch(c)
3107 {
3108 case OP_END:
3109 case OP_KETRPOS:
3110 /* TRUE only in greedy case. The non-greedy case could be replaced by
3111 an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
3112 uses more memory, which we cannot get at this stage.) */
3113
3114 return base_list[1] != 0;
3115
3116 case OP_KET:
3117 /* If the bracket is capturing, and referenced by an OP_RECURSE, or
3118 it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3119 cannot be converted to a possessive form. */
3120
3121 if (base_list[1] == 0) return FALSE;
3122
3123 switch(*(code - GET(code, 1)))
3124 {
3125 case OP_ASSERT:
3126 case OP_ASSERT_NOT:
3127 case OP_ASSERTBACK:
3128 case OP_ASSERTBACK_NOT:
3129 case OP_ONCE:
3130 case OP_ONCE_NC:
3131 /* Atomic sub-patterns and assertions can always auto-possessify their
3132 last iterator. */
3133 return TRUE;
3134 }
3135
3136 code += PRIV(OP_lengths)[c];
3137 continue;
3138
3139 case OP_ONCE:
3140 case OP_ONCE_NC:
3141 case OP_BRA:
3142 case OP_CBRA:
3143 next_code = code + GET(code, 1);
3144 code += PRIV(OP_lengths)[c];
3145
3146 while (*next_code == OP_ALT)
3147 {
3148 if (!compare_opcodes(code, utf, cd, base_list, base_end)) return FALSE;
3149 code = next_code + 1 + LINK_SIZE;
3150 next_code += GET(next_code, 1);
3151 }
3152 continue;
3153
3154 case OP_BRAZERO:
3155 case OP_BRAMINZERO:
3156
3157 next_code = code + 1;
3158 if (*next_code != OP_BRA && *next_code != OP_CBRA
3159 && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
3160
3161 do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3162
3163 /* The bracket content will be checked by the
3164 OP_BRA/OP_CBRA case above. */
3165 next_code += 1 + LINK_SIZE;
3166 if (!compare_opcodes(next_code, utf, cd, base_list, base_end))
3167 return FALSE;
3168
3169 code += PRIV(OP_lengths)[c];
3170 continue;
3171 }
3172
3173 /* Check for a supported opcode, and load its properties. */
3174
3175 code = get_chr_property_list(code, utf, cd->fcc, list);
3176 if (code == NULL) return FALSE; /* Unsupported */
3177
3178 /* If either opcode is a small character list, set pointers for comparing
3179 characters from that list with another list, or with a property. */
3180
3181 if (base_list[0] == OP_CHAR)
3182 {
3183 chr_ptr = base_list + 2;
3184 list_ptr = list;
3185 }
3186 else if (list[0] == OP_CHAR)
3187 {
3188 chr_ptr = list + 2;
3189 list_ptr = base_list;
3190 }
3191
3192 /* Character bitsets can also be compared to certain opcodes. */
3193
3194 else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
3195 #ifdef COMPILE_PCRE8
3196 /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
3197 || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
3198 #endif
3199 )
3200 {
3201 #ifdef COMPILE_PCRE8
3202 if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
3203 #else
3204 if (base_list[0] == OP_CLASS)
3205 #endif
3206 {
3207 set1 = (pcre_uint8 *)(base_end - base_list[2]);
3208 list_ptr = list;
3209 }
3210 else
3211 {
3212 set1 = (pcre_uint8 *)(code - list[2]);
3213 list_ptr = base_list;
3214 }
3215
3216 invert_bits = FALSE;
3217 switch(list_ptr[0])
3218 {
3219 case OP_CLASS:
3220 case OP_NCLASS:
3221 set2 = (pcre_uint8 *)
3222 ((list_ptr == list ? code : base_end) - list_ptr[2]);
3223 break;
3224
3225 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3226 case OP_XCLASS:
3227 xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE;
3228 if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
3229 if ((*xclass_flags & XCL_MAP) == 0)
3230 {
3231 /* No bits are set for characters < 256. */
3232 if (list[1] == 0) return TRUE;
3233 /* Might be an empty repeat. */
3234 continue;
3235 }
3236 set2 = (pcre_uint8 *)(xclass_flags + 1);
3237 break;
3238 #endif
3239
3240 case OP_NOT_DIGIT:
3241 invert_bits = TRUE;
3242 /* Fall through */
3243 case OP_DIGIT:
3244 set2 = (pcre_uint8 *)(cd->cbits + cbit_digit);
3245 break;
3246
3247 case OP_NOT_WHITESPACE:
3248 invert_bits = TRUE;
3249 /* Fall through */
3250 case OP_WHITESPACE:
3251 set2 = (pcre_uint8 *)(cd->cbits + cbit_space);
3252 break;
3253
3254 case OP_NOT_WORDCHAR:
3255 invert_bits = TRUE;
3256 /* Fall through */
3257 case OP_WORDCHAR:
3258 set2 = (pcre_uint8 *)(cd->cbits + cbit_word);
3259 break;
3260
3261 default:
3262 return FALSE;
3263 }
3264
3265 /* Because the sets are unaligned, we need
3266 to perform byte comparison here. */
3267 set_end = set1 + 32;
3268 if (invert_bits)
3269 {
3270 do
3271 {
3272 if ((*set1++ & ~(*set2++)) != 0) return FALSE;
3273 }
3274 while (set1 < set_end);
3275 }
3276 else
3277 {
3278 do
3279 {
3280 if ((*set1++ & *set2++) != 0) return FALSE;
3281 }
3282 while (set1 < set_end);
3283 }
3284
3285 if (list[1] == 0) return TRUE;
3286 /* Might be an empty repeat. */
3287 continue;
3288 }
3289
3290 /* Some property combinations also acceptable. Unicode property opcodes are
3291 processed specially; the rest can be handled with a lookup table. */
3292
3293 else
3294 {
3295 pcre_uint32 leftop, rightop;
3296
3297 leftop = base_list[0];
3298 rightop = list[0];
3299
3300 #ifdef SUPPORT_UCP
3301 accepted = FALSE; /* Always set in non-unicode case. */
3302 if (leftop == OP_PROP || leftop == OP_NOTPROP)
3303 {
3304 if (rightop == OP_EOD)
3305 accepted = TRUE;
3306 else if (rightop == OP_PROP || rightop == OP_NOTPROP)
3307 {
3308 int n;
3309 const pcre_uint8 *p;
3310 BOOL same = leftop == rightop;
3311 BOOL lisprop = leftop == OP_PROP;
3312 BOOL risprop = rightop == OP_PROP;
3313 BOOL bothprop = lisprop && risprop;
3314
3315 /* There's a table that specifies how each combination is to be
3316 processed:
3317 0 Always return FALSE (never auto-possessify)
3318 1 Character groups are distinct (possessify if both are OP_PROP)
3319 2 Check character categories in the same group (general or particular)
3320 3 Return TRUE if the two opcodes are not the same
3321 ... see comments below
3322 */
3323
3324 n = propposstab[base_list[2]][list[2]];
3325 switch(n)
3326 {
3327 case 0: break;
3328 case 1: accepted = bothprop; break;
3329 case 2: accepted = (base_list[3] == list[3]) != same; break;
3330 case 3: accepted = !same; break;
3331
3332 case 4: /* Left general category, right particular category */
3333 accepted = risprop && catposstab[base_list[3]][list[3]] == same;
3334 break;
3335
3336 case 5: /* Right general category, left particular category */
3337 accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
3338 break;
3339
3340 /* This code is logically tricky. Think hard before fiddling with it.
3341 The posspropstab table has four entries per row. Each row relates to
3342 one of PCRE's special properties such as ALNUM or SPACE or WORD.
3343 Only WORD actually needs all four entries, but using repeats for the
3344 others means they can all use the same code below.
3345
3346 The first two entries in each row are Unicode general categories, and
3347 apply always, because all the characters they include are part of the
3348 PCRE character set. The third and fourth entries are a general and a
3349 particular category, respectively, that include one or more relevant
3350 characters. One or the other is used, depending on whether the check
3351 is for a general or a particular category. However, in both cases the
3352 category contains more characters than the specials that are defined
3353 for the property being tested against. Therefore, it cannot be used
3354 in a NOTPROP case.
3355
3356 Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3357 Underscore is covered by ucp_P or ucp_Po. */
3358
3359 case 6: /* Left alphanum vs right general category */
3360 case 7: /* Left space vs right general category */
3361 case 8: /* Left word vs right general category */
3362 p = posspropstab[n-6];
3363 accepted = risprop && lisprop ==
3364 (list[3] != p[0] &&
3365 list[3] != p[1] &&
3366 (list[3] != p[2] || !lisprop));
3367 break;
3368
3369 case 9: /* Right alphanum vs left general category */
3370 case 10: /* Right space vs left general category */
3371 case 11: /* Right word vs left general category */
3372 p = posspropstab[n-9];
3373 accepted = lisprop && risprop ==
3374 (base_list[3] != p[0] &&
3375 base_list[3] != p[1] &&
3376 (base_list[3] != p[2] || !risprop));
3377 break;
3378
3379 case 12: /* Left alphanum vs right particular category */
3380 case 13: /* Left space vs right particular category */
3381 case 14: /* Left word vs right particular category */
3382 p = posspropstab[n-12];
3383 accepted = risprop && lisprop ==
3384 (catposstab[p[0]][list[3]] &&
3385 catposstab[p[1]][list[3]] &&
3386 (list[3] != p[3] || !lisprop));
3387 break;
3388
3389 case 15: /* Right alphanum vs left particular category */
3390 case 16: /* Right space vs left particular category */
3391 case 17: /* Right word vs left particular category */
3392 p = posspropstab[n-15];
3393 accepted = lisprop && risprop ==
3394 (catposstab[p[0]][base_list[3]] &&
3395 catposstab[p[1]][base_list[3]] &&
3396 (base_list[3] != p[3] || !risprop));
3397 break;
3398 }
3399 }
3400 }
3401
3402 else
3403 #endif /* SUPPORT_UCP */
3404
3405 accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3406 rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3407 autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3408
3409 if (!accepted)
3410 return FALSE;
3411
3412 if (list[1] == 0) return TRUE;
3413 /* Might be an empty repeat. */
3414 continue;
3415 }
3416
3417 /* Control reaches here only if one of the items is a small character list.
3418 All characters are checked against the other side. */
3419
3420 do
3421 {
3422 chr = *chr_ptr;
3423
3424 switch(list_ptr[0])
3425 {
3426 case OP_CHAR:
3427 ochr_ptr = list_ptr + 2;
3428 do
3429 {
3430 if (chr == *ochr_ptr) return FALSE;
3431 ochr_ptr++;
3432 }
3433 while(*ochr_ptr != NOTACHAR);
3434 break;
3435
3436 case OP_NOT:
3437 ochr_ptr = list_ptr + 2;
3438 do
3439 {
3440 if (chr == *ochr_ptr)
3441 break;
3442 ochr_ptr++;
3443 }
3444 while(*ochr_ptr != NOTACHAR);
3445 if (*ochr_ptr == NOTACHAR) return FALSE; /* Not found */
3446 break;
3447
3448 /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
3449 set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3450
3451 case OP_DIGIT:
3452 if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
3453 break;
3454
3455 case OP_NOT_DIGIT:
3456 if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
3457 break;
3458
3459 case OP_WHITESPACE:
3460 if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
3461 break;
3462
3463 case OP_NOT_WHITESPACE:
3464 if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
3465 break;
3466
3467 case OP_WORDCHAR:
3468 if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
3469 break;
3470
3471 case OP_NOT_WORDCHAR:
3472 if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
3473 break;
3474
3475 case OP_HSPACE:
3476 switch(chr)
3477 {
3478 HSPACE_CASES: return FALSE;
3479 default: break;
3480 }
3481 break;
3482
3483 case OP_NOT_HSPACE:
3484 switch(chr)
3485 {
3486 HSPACE_CASES: break;
3487 default: return FALSE;
3488 }
3489 break;
3490
3491 case OP_ANYNL:
3492 case OP_VSPACE:
3493 switch(chr)
3494 {
3495 VSPACE_CASES: return FALSE;
3496 default: break;
3497 }
3498 break;
3499
3500 case OP_NOT_VSPACE:
3501 switch(chr)
3502 {
3503 VSPACE_CASES: break;
3504 default: return FALSE;
3505 }
3506 break;
3507
3508 case OP_DOLL:
3509 case OP_EODN:
3510 switch (chr)
3511 {
3512 case CHAR_CR:
3513 case CHAR_LF:
3514 case CHAR_VT:
3515 case CHAR_FF:
3516 case CHAR_NEL:
3517 #ifndef EBCDIC
3518 case 0x2028:
3519 case 0x2029:
3520 #endif /* Not EBCDIC */
3521 return FALSE;
3522 }
3523 break;
3524
3525 case OP_EOD: /* Can always possessify before \z */
3526 break;
3527
3528 #ifdef SUPPORT_UCP
3529 case OP_PROP:
3530 case OP_NOTPROP:
3531 if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3532 list_ptr[0] == OP_NOTPROP))
3533 return FALSE;
3534 break;
3535 #endif
3536
3537 case OP_NCLASS:
3538 if (chr > 255) return FALSE;
3539 /* Fall through */
3540
3541 case OP_CLASS:
3542 if (chr > 255) break;
3543 class_bitset = (pcre_uint8 *)
3544 ((list_ptr == list ? code : base_end) - list_ptr[2]);
3545 if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
3546 break;
3547
3548 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3549 case OP_XCLASS:
3550 if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
3551 list_ptr[2] + LINK_SIZE, utf)) return FALSE;
3552 break;
3553 #endif
3554
3555 default:
3556 return FALSE;
3557 }
3558
3559 chr_ptr++;
3560 }
3561 while(*chr_ptr != NOTACHAR);
3562
3563 /* At least one character must be matched from this opcode. */
3564
3565 if (list[1] == 0) return TRUE;
3566 }
3567
3568 /* Control never reaches here. There used to be a fail-save return FALSE; here,
3569 but some compilers complain about an unreachable statement. */
3570
3571 }
3572
3573
3574
3575 /*************************************************
3576 * Scan compiled regex for auto-possession *
3577 *************************************************/
3578
3579 /* Replaces single character iterations with their possessive alternatives
3580 if appropriate. This function modifies the compiled opcode!
3581
3582 Arguments:
3583 code points to start of the byte code
3584 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3585 cd static compile data
3586
3587 Returns: nothing
3588 */
3589
3590 static void
3591 auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
3592 {
3593 register pcre_uchar c;
3594 const pcre_uchar *end;
3595 pcre_uchar *repeat_opcode;
3596 pcre_uint32 list[8];
3597
3598 for (;;)
3599 {
3600 c = *code;
3601
3602 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
3603 {
3604 c -= get_repeat_base(c) - OP_STAR;
3605 end = (c <= OP_MINUPTO) ?
3606 get_chr_property_list(code, utf, cd->fcc, list) : NULL;
3607 list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3608
3609 if (end != NULL && compare_opcodes(end, utf, cd, list, end))
3610 {
3611 switch(c)
3612 {
3613 case OP_STAR:
3614 *code += OP_POSSTAR - OP_STAR;
3615 break;
3616
3617 case OP_MINSTAR:
3618 *code += OP_POSSTAR - OP_MINSTAR;
3619 break;
3620
3621 case OP_PLUS:
3622 *code += OP_POSPLUS - OP_PLUS;
3623 break;
3624
3625 case OP_MINPLUS:
3626 *code += OP_POSPLUS - OP_MINPLUS;
3627 break;
3628
3629 case OP_QUERY:
3630 *code += OP_POSQUERY - OP_QUERY;
3631 break;
3632
3633 case OP_MINQUERY:
3634 *code += OP_POSQUERY - OP_MINQUERY;
3635 break;
3636
3637 case OP_UPTO:
3638 *code += OP_POSUPTO - OP_UPTO;
3639 break;
3640
3641 case OP_MINUPTO:
3642 *code += OP_POSUPTO - OP_MINUPTO;
3643 break;
3644 }
3645 }
3646 c = *code;
3647 }
3648 else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
3649 {
3650 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3651 if (c == OP_XCLASS)
3652 repeat_opcode = code + GET(code, 1);
3653 else
3654 #endif
3655 repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar));
3656
3657 c = *repeat_opcode;
3658 if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
3659 {
3660 /* end must not be NULL. */
3661 end = get_chr_property_list(code, utf, cd->fcc, list);
3662
3663 list[1] = (c & 1) == 0;
3664
3665 if (compare_opcodes(end, utf, cd, list, end))
3666 {
3667 switch (c)
3668 {
3669 case OP_CRSTAR:
3670 case OP_CRMINSTAR:
3671 *repeat_opcode = OP_CRPOSSTAR;
3672 break;
3673
3674 case OP_CRPLUS:
3675 case OP_CRMINPLUS:
3676 *repeat_opcode = OP_CRPOSPLUS;
3677 break;
3678
3679 case OP_CRQUERY:
3680 case OP_CRMINQUERY:
3681 *repeat_opcode = OP_CRPOSQUERY;
3682 break;
3683
3684 case OP_CRRANGE:
3685 case OP_CRMINRANGE:
3686 *repeat_opcode = OP_CRPOSRANGE;
3687 break;
3688 }
3689 }
3690 }
3691 c = *code;
3692 }
3693
3694 switch(c)
3695 {
3696 case OP_END:
3697 return;
3698
3699 case OP_TYPESTAR:
3700 case OP_TYPEMINSTAR:
3701 case OP_TYPEPLUS:
3702 case OP_TYPEMINPLUS:
3703 case OP_TYPEQUERY:
3704 case OP_TYPEMINQUERY:
3705 case OP_TYPEPOSSTAR:
3706 case OP_TYPEPOSPLUS:
3707 case OP_TYPEPOSQUERY:
3708 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
3709 break;
3710
3711 case OP_TYPEUPTO:
3712 case OP_TYPEMINUPTO:
3713 case OP_TYPEEXACT:
3714 case OP_TYPEPOSUPTO:
3715 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
3716 code += 2;
3717 break;
3718
3719 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3720 case OP_XCLASS:
3721 code += GET(code, 1);
3722 break;
3723 #endif
3724
3725 case OP_MARK:
3726 case OP_PRUNE_ARG:
3727 case OP_SKIP_ARG:
3728 case OP_THEN_ARG:
3729 code += code[1];
3730 break;
3731 }
3732
3733 /* Add in the fixed length from the table */
3734
3735 code += PRIV(OP_lengths)[c];
3736
3737 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
3738 a multi-byte character. The length in the table is a minimum, so we have to
3739 arrange to skip the extra bytes. */
3740
3741 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
3742 if (utf) switch(c)
3743 {
3744 case OP_CHAR:
3745 case OP_CHARI:
3746 case OP_NOT:
3747 case OP_NOTI:
3748 case OP_STAR:
3749 case OP_MINSTAR:
3750 case OP_PLUS:
3751 case OP_MINPLUS:
3752 case OP_QUERY:
3753 case OP_MINQUERY:
3754 case OP_UPTO:
3755 case OP_MINUPTO:
3756 case OP_EXACT:
3757 case OP_POSSTAR:
3758 case OP_POSPLUS:
3759 case OP_POSQUERY:
3760 case OP_POSUPTO:
3761 case OP_STARI:
3762 case OP_MINSTARI:
3763 case OP_PLUSI:
3764 case OP_MINPLUSI:
3765 case OP_QUERYI:
3766 case OP_MINQUERYI:
3767 case OP_UPTOI:
3768 case OP_MINUPTOI:
3769 case OP_EXACTI:
3770 case OP_POSSTARI:
3771 case OP_POSPLUSI:
3772 case OP_POSQUERYI:
3773 case OP_POSUPTOI:
3774 case OP_NOTSTAR:
3775 case OP_NOTMINSTAR:
3776 case OP_NOTPLUS:
3777 case OP_NOTMINPLUS:
3778 case OP_NOTQUERY:
3779 case OP_NOTMINQUERY:
3780 case OP_NOTUPTO:
3781 case OP_NOTMINUPTO:
3782 case OP_NOTEXACT:
3783 case OP_NOTPOSSTAR:
3784 case OP_NOTPOSPLUS:
3785 case OP_NOTPOSQUERY:
3786 case OP_NOTPOSUPTO:
3787 case OP_NOTSTARI:
3788 case OP_NOTMINSTARI:
3789 case OP_NOTPLUSI:
3790 case OP_NOTMINPLUSI:
3791 case OP_NOTQUERYI:
3792 case OP_NOTMINQUERYI:
3793 case OP_NOTUPTOI:
3794 case OP_NOTMINUPTOI:
3795 case OP_NOTEXACTI:
3796 case OP_NOTPOSSTARI:
3797 case OP_NOTPOSPLUSI:
3798 case OP_NOTPOSQUERYI:
3799 case OP_NOTPOSUPTOI:
3800 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
3801 break;
3802 }
3803 #else
3804 (void)(utf); /* Keep compiler happy by referencing function argument */
3805 #endif
3806 }
3807 }
3808
3809
3810
3811 /*************************************************
3812 * Check for POSIX class syntax *
3813 *************************************************/
3814
3815 /* This function is called when the sequence "[:" or "[." or "[=" is
3816 encountered in a character class. It checks whether this is followed by a
3817 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
3818 reach an unescaped ']' without the special preceding character, return FALSE.
3819
3820 Originally, this function only recognized a sequence of letters between the
3821 terminators, but it seems that Perl recognizes any sequence of characters,
3822 though of course unknown POSIX names are subsequently rejected. Perl gives an
3823 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
3824 didn't consider this to be a POSIX class. Likewise for [:1234:].
3825
3826 The problem in trying to be exactly like Perl is in the handling of escapes. We
3827 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
3828 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
3829 below handles the special case of \], but does not try to do any other escape
3830 processing. This makes it different from Perl for cases such as [:l\ower:]
3831 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
3832 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
3833 I think.
3834
3835 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
3836 It seems that the appearance of a nested POSIX class supersedes an apparent
3837 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
3838 a digit.
3839
3840 In Perl, unescaped square brackets may also appear as part of class names. For
3841 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
3842 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
3843 seem right at all. PCRE does not allow closing square brackets in POSIX class
3844 names.
3845
3846 Arguments:
3847 ptr pointer to the initial [
3848 endptr where to return the end pointer
3849
3850 Returns: TRUE or FALSE
3851 */
3852
3853 static BOOL
3854 check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
3855 {
3856 pcre_uchar terminator; /* Don't combine these lines; the Solaris cc */
3857 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
3858 for (++ptr; *ptr != CHAR_NULL; ptr++)
3859 {
3860 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3861 ptr++;
3862 else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
3863 else
3864 {
3865 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3866 {
3867 *endptr = ptr;
3868 return TRUE;
3869 }
3870 if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
3871 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3872 ptr[1] == CHAR_EQUALS_SIGN) &&
3873 check_posix_syntax(ptr, endptr))
3874 return FALSE;
3875 }
3876 }
3877 return FALSE;
3878 }
3879
3880
3881
3882
3883 /*************************************************
3884 * Check POSIX class name *
3885 *************************************************/
3886
3887 /* This function is called to check the name given in a POSIX-style class entry
3888 such as [:alnum:].
3889
3890 Arguments:
3891 ptr points to the first letter
3892 len the length of the name
3893
3894 Returns: a value representing the name, or -1 if unknown
3895 */
3896
3897 static int
3898 check_posix_name(const pcre_uchar *ptr, int len)
3899 {
3900 const char *pn = posix_names;
3901 register int yield = 0;
3902 while (posix_name_lengths[yield] != 0)
3903 {
3904 if (len == posix_name_lengths[yield] &&
3905 STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
3906 pn += posix_name_lengths[yield] + 1;
3907 yield++;
3908 }
3909 return -1;
3910 }
3911
3912
3913 /*************************************************
3914 * Adjust OP_RECURSE items in repeated group *
3915 *************************************************/
3916
3917 /* OP_RECURSE items contain an offset from the start of the regex to the group
3918 that is referenced. This means that groups can be replicated for fixed
3919 repetition simply by copying (because the recursion is allowed to refer to
3920 earlier groups that are outside the current group). However, when a group is
3921 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
3922 inserted before it, after it has been compiled. This means that any OP_RECURSE
3923 items within it that refer to the group itself or any contained groups have to
3924 have their offsets adjusted. That one of the jobs of this function. Before it
3925 is called, the partially compiled regex must be temporarily terminated with
3926 OP_END.
3927
3928 This function has been extended with the possibility of forward references for
3929 recursions and subroutine calls. It must also check the list of such references
3930 for the group we are dealing with. If it finds that one of the recursions in
3931 the current group is on this list, it adjusts the offset in the list, not the
3932 value in the reference (which is a group number).
3933
3934 Arguments:
3935 group points to the start of the group
3936 adjust the amount by which the group is to be moved
3937 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3938 cd contains pointers to tables etc.
3939 save_hwm the hwm forward reference pointer at the start of the group
3940
3941 Returns: nothing
3942 */
3943
3944 static void
3945 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
3946 pcre_uchar *save_hwm)
3947 {
3948 pcre_uchar *ptr = group;
3949
3950 while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
3951 {
3952 int offset;
3953 pcre_uchar *hc;
3954
3955 /* See if this recursion is on the forward reference list. If so, adjust the
3956 reference. */
3957
3958 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
3959 {
3960 offset = (int)GET(hc, 0);
3961 if (cd->start_code + offset == ptr + 1)
3962 {
3963 PUT(hc, 0, offset + adjust);
3964 break;
3965 }
3966 }
3967
3968 /* Otherwise, adjust the recursion offset if it's after the start of this
3969 group. */
3970
3971 if (hc >= cd->hwm)
3972 {
3973 offset = (int)GET(ptr, 1);
3974 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
3975 }
3976
3977 ptr += 1 + LINK_SIZE;
3978 }
3979 }
3980
3981
3982
3983 /*************************************************
3984 * Insert an automatic callout point *
3985 *************************************************/
3986
3987 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
3988 callout points before each pattern item.
3989
3990 Arguments:
3991 code current code pointer
3992 ptr current pattern pointer
3993 cd pointers to tables etc
3994
3995 Returns: new code pointer
3996 */
3997
3998 static pcre_uchar *
3999 auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
4000 {
4001 *code++ = OP_CALLOUT;
4002 *code++ = 255;
4003 PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
4004 PUT(code, LINK_SIZE, 0); /* Default length */
4005 return code + 2 * LINK_SIZE;
4006 }
4007
4008
4009
4010 /*************************************************
4011 * Complete a callout item *
4012 *************************************************/
4013
4014 /* A callout item contains the length of the next item in the pattern, which
4015 we can't fill in till after we have reached the relevant point. This is used
4016 for both automatic and manual callouts.
4017
4018 Arguments:
4019 previous_callout points to previous callout item
4020 ptr current pattern pointer
4021 cd pointers to tables etc
4022
4023 Returns: nothing
4024 */
4025
4026 static void
4027 complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
4028 {
4029 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
4030 PUT(previous_callout, 2 + LINK_SIZE, length);
4031 }
4032
4033
4034
4035 #ifdef SUPPORT_UCP
4036 /*************************************************
4037 * Get othercase range *
4038 *************************************************/
4039
4040 /* This function is passed the start and end of a class range, in UTF-8 mode
4041 with UCP support. It searches up the characters, looking for ranges of
4042 characters in the "other" case. Each call returns the next one, updating the
4043 start address. A character with multiple other cases is returned on its own
4044 with a special return value.
4045
4046 Arguments:
4047 cptr points to starting character value; updated
4048 d end value
4049 ocptr where to put start of othercase range
4050 odptr where to put end of othercase range
4051
4052 Yield: -1 when no more
4053 0 when a range is returned
4054 >0 the CASESET offset for char with multiple other cases
4055 in this case, ocptr contains the original
4056 */
4057
4058 static int
4059 get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
4060 pcre_uint32 *odptr)
4061 {
4062 pcre_uint32 c, othercase, next;
4063 unsigned int co;
4064
4065 /* Find the first character that has an other case. If it has multiple other
4066 cases, return its case offset value. */
4067
4068 for (c = *cptr; c <= d; c++)
4069 {
4070 if ((co = UCD_CASESET(c)) != 0)
4071 {
4072 *ocptr = c++; /* Character that has the set */
4073 *cptr = c; /* Rest of input range */
4074 return (int)co;
4075 }
4076 if ((othercase = UCD_OTHERCASE(c)) != c) break;
4077 }
4078
4079 if (c > d) return -1; /* Reached end of range */
4080
4081 /* Found a character that has a single other case. Search for the end of the
4082 range, which is either the end of the input range, or a character that has zero
4083 or more than one other cases. */
4084
4085 *ocptr = othercase;
4086 next = othercase + 1;
4087
4088 for (++c; c <= d; c++)
4089 {
4090 if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
4091 next++;
4092 }
4093
4094 *odptr = next - 1; /* End of othercase range */
4095 *cptr = c; /* Rest of input range */
4096 return 0;
4097 }
4098 #endif /* SUPPORT_UCP */
4099
4100
4101
4102 /*************************************************
4103 * Add a character or range to a class *
4104 *************************************************/
4105
4106 /* This function packages up the logic of adding a character or range of
4107 characters to a class. The character values in the arguments will be within the
4108 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4109 mutually recursive with the function immediately below.
4110
4111 Arguments:
4112 classbits the bit map for characters < 256
4113 uchardptr points to the pointer for extra data
4114 options the options word
4115 cd contains pointers to tables etc.
4116 start start of range character
4117 end end of range character
4118
4119 Returns: the number of < 256 characters added
4120 the pointer to extra data is updated
4121 */
4122
4123 static int
4124 add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4125 compile_data *cd, pcre_uint32 start, pcre_uint32 end)
4126 {
4127 pcre_uint32 c;
4128 pcre_uint32 classbits_end = (end <= 0xff ? end : 0xff);
4129 int n8 = 0;
4130
4131 /* If caseless matching is required, scan the range and process alternate
4132 cases. In Unicode, there are 8-bit characters that have alternate cases that
4133 are greater than 255 and vice-versa. Sometimes we can just extend the original
4134 range. */
4135
4136 if ((options & PCRE_CASELESS) != 0)
4137 {
4138 #ifdef SUPPORT_UCP
4139 if ((options & PCRE_UTF8) != 0)
4140 {
4141 int rc;
4142 pcre_uint32 oc, od;
4143
4144 options &= ~PCRE_CASELESS; /* Remove for recursive calls */
4145 c = start;
4146
4147 while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
4148 {
4149 /* Handle a single character that has more than one other case. */
4150
4151 if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
4152 PRIV(ucd_caseless_sets) + rc, oc);
4153
4154 /* Do nothing if the other case range is within the original range. */
4155
4156 else if (oc >= start && od <= end) continue;
4157
4158 /* Extend the original range if there is overlap, noting that if oc < c, we
4159 can't have od > end because a subrange is always shorter than the basic
4160 range. Otherwise, use a recursive call to add the additional range. */
4161
4162 else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
4163 else if (od > end && oc <= end + 1) end = od; /* Extend upwards */
4164 else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
4165 }
4166 }
4167 else
4168 #endif /* SUPPORT_UCP */
4169
4170 /* Not UTF-mode, or no UCP */
4171
4172 for (c = start; c <= classbits_end; c++)
4173 {
4174 SETBIT(classbits, cd->fcc[c]);
4175 n8++;
4176 }
4177 }
4178
4179 /* Now handle the original range. Adjust the final value according to the bit
4180 length - this means that the same lists of (e.g.) horizontal spaces can be used
4181 in all cases. */
4182
4183 #if defined COMPILE_PCRE8
4184 #ifdef SUPPORT_UTF
4185 if ((options & PCRE_UTF8) == 0)
4186 #endif
4187 if (end > 0xff) end = 0xff;
4188
4189 #elif defined COMPILE_PCRE16
4190 #ifdef SUPPORT_UTF
4191 if ((options & PCRE_UTF16) == 0)
4192 #endif
4193 if (end > 0xffff) end = 0xffff;
4194
4195 #endif /* COMPILE_PCRE[8|16] */
4196
4197 /* Use the bitmap for characters < 256. Otherwise use extra data.*/
4198
4199 for (c = start; c <= classbits_end; c++)
4200 {
4201 /* Regardless of start, c will always be <= 255. */
4202 SETBIT(classbits, c);
4203 n8++;
4204 }
4205
4206 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4207 if (start <= 0xff) start = 0xff + 1;
4208
4209 if (end >= start)
4210 {
4211 pcre_uchar *uchardata = *uchardptr;
4212 #ifdef SUPPORT_UTF
4213 if ((options & PCRE_UTF8) != 0) /* All UTFs use the same flag bit */
4214 {
4215 if (start < end)
4216 {
4217 *uchardata++ = XCL_RANGE;
4218 uchardata += PRIV(ord2utf)(start, uchardata);
4219 uchardata += PRIV(ord2utf)(end, uchardata);
4220 }
4221 else if (start == end)
4222 {
4223 *uchardata++ = XCL_SINGLE;
4224 uchardata += PRIV(ord2utf)(start, uchardata);
4225 }
4226 }
4227 else
4228 #endif /* SUPPORT_UTF */
4229
4230 /* Without UTF support, character values are constrained by the bit length,
4231 and can only be > 256 for 16-bit and 32-bit libraries. */
4232
4233 #ifdef COMPILE_PCRE8
4234 {}
4235 #else
4236 if (start < end)
4237 {
4238 *uchardata++ = XCL_RANGE;
4239 *uchardata++ = start;
4240 *uchardata++ = end;
4241 }
4242 else if (start == end)
4243 {
4244 *uchardata++ = XCL_SINGLE;
4245 *uchardata++ = start;
4246 }
4247 #endif
4248
4249 *uchardptr = uchardata; /* Updata extra data pointer */
4250 }
4251 #endif /* SUPPORT_UTF || !COMPILE_PCRE8 */
4252
4253 return n8; /* Number of 8-bit characters */
4254 }
4255
4256
4257
4258
4259 /*************************************************
4260 * Add a list of characters to a class *
4261 *************************************************/
4262
4263 /* This function is used for adding a list of case-equivalent characters to a
4264 class, and also for adding a list of horizontal or vertical whitespace. If the
4265 list is in order (which it should be), ranges of characters are detected and
4266 handled appropriately. This function is mutually recursive with the function
4267 above.
4268
4269 Arguments:
4270 classbits the bit map for characters < 256
4271 uchardptr points to the pointer for extra data
4272 options the options word
4273 cd contains pointers to tables etc.
4274 p points to row of 32-bit values, terminated by NOTACHAR
4275 except character to omit; this is used when adding lists of
4276 case-equivalent characters to avoid including the one we
4277 already know about
4278
4279 Returns: the number of < 256 characters added
4280 the pointer to extra data is updated
4281 */
4282
4283 static int
4284 add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4285 compile_data *cd, const pcre_uint32 *p, unsigned int except)
4286 {
4287 int n8 = 0;
4288 while (p[0] < NOTACHAR)
4289 {
4290 int n = 0;
4291 if (p[0] != except)
4292 {
4293 while(p[n+1] == p[0] + n + 1) n++;
4294 n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
4295 }
4296 p += n + 1;
4297 }
4298 return n8;
4299 }
4300
4301
4302
4303 /*************************************************
4304 * Add characters not in a list to a class *
4305 *************************************************/
4306
4307 /* This function is used for adding the complement of a list of horizontal or
4308 vertical whitespace to a class. The list must be in order.
4309
4310 Arguments:
4311 classbits the bit map for characters < 256
4312 uchardptr points to the pointer for extra data
4313 options the options word
4314 cd contains pointers to tables etc.
4315 p points to row of 32-bit values, terminated by NOTACHAR
4316
4317 Returns: the number of < 256 characters added
4318 the pointer to extra data is updated
4319 */
4320
4321 static int
4322 add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
4323 int options, compile_data *cd, const pcre_uint32 *p)
4324 {
4325 BOOL utf = (options & PCRE_UTF8) != 0;
4326 int n8 = 0;
4327 if (p[0] > 0)
4328 n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
4329 while (p[0] < NOTACHAR)
4330 {
4331 while (p[1] == p[0] + 1) p++;
4332 n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
4333 (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
4334 p++;
4335 }
4336 return n8;
4337 }
4338
4339
4340
4341 /*************************************************
4342 * Compile one branch *
4343 *************************************************/
4344
4345 /* Scan the pattern, compiling it into the a vector. If the options are
4346 changed during the branch, the pointer is used to change the external options
4347 bits. This function is used during the pre-compile phase when we are trying
4348 to find out the amount of memory needed, as well as during the real compile
4349 phase. The value of lengthptr distinguishes the two phases.
4350
4351 Arguments:
4352 optionsptr pointer to the option bits
4353 codeptr points to the pointer to the current code point
4354 ptrptr points to the current pattern pointer
4355 errorcodeptr points to error code variable
4356 firstcharptr place to put the first required character
4357 firstcharflagsptr place to put the first character flags, or a negative number
4358 reqcharptr place to put the last required character
4359 reqcharflagsptr place to put the last required character flags, or a negative number
4360 bcptr points to current branch chain
4361 cond_depth conditional nesting depth
4362 cd contains pointers to tables etc.
4363 lengthptr NULL during the real compile phase
4364 points to length accumulator during pre-compile phase
4365
4366 Returns: TRUE on success
4367 FALSE, with *errorcodeptr set non-zero on error
4368 */
4369
4370 static BOOL
4371 compile_branch(int *optionsptr, pcre_uchar **codeptr,
4372 const pcre_uchar **ptrptr, int *errorcodeptr,
4373 pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
4374 pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
4375 branch_chain *bcptr, int cond_depth,
4376 compile_data *cd, int *lengthptr)
4377 {
4378 int repeat_type, op_type;
4379 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
4380 int bravalue = 0;
4381 int greedy_default, greedy_non_default;
4382 pcre_uint32 firstchar, reqchar;
4383 pcre_int32 firstcharflags, reqcharflags;
4384 pcre_uint32 zeroreqchar, zerofirstchar;
4385 pcre_int32 zeroreqcharflags, zerofirstcharflags;
4386 pcre_int32 req_caseopt, reqvary, tempreqvary;
4387 int options = *optionsptr; /* May change dynamically */
4388 int after_manual_callout = 0;
4389 int length_prevgroup = 0;
4390 register pcre_uint32 c;
4391 int escape;
4392 register pcre_uchar *code = *codeptr;
4393 pcre_uchar *last_code = code;
4394 pcre_uchar *orig_code = code;
4395 pcre_uchar *tempcode;
4396 BOOL inescq = FALSE;
4397 BOOL groupsetfirstchar = FALSE;
4398 const pcre_uchar *ptr = *ptrptr;
4399 const pcre_uchar *tempptr;
4400 const pcre_uchar *nestptr = NULL;
4401 pcre_uchar *previous = NULL;
4402 pcre_uchar *previous_callout = NULL;
4403 pcre_uchar *save_hwm = NULL;
4404 pcre_uint8 classbits[32];
4405
4406 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
4407 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
4408 dynamically as we process the pattern. */
4409
4410 #ifdef SUPPORT_UTF
4411 /* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
4412 BOOL utf = (options & PCRE_UTF8) != 0;
4413 #ifndef COMPILE_PCRE32
4414 pcre_uchar utf_chars[6];
4415 #endif
4416 #else
4417 BOOL utf = FALSE;
4418 #endif
4419
4420 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
4421 class_uchardata always so that it can be passed to add_to_class() always,
4422 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
4423 alternative calls for the different cases. */
4424
4425 pcre_uchar *class_uchardata;
4426 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4427 BOOL xclass;
4428 pcre_uchar *class_uchardata_base;
4429 #endif
4430
4431 #ifdef PCRE_DEBUG
4432 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
4433 #endif
4434
4435 /* Set up the default and non-default settings for greediness */
4436
4437 greedy_default = ((options & PCRE_UNGREEDY) != 0);
4438 greedy_non_default = greedy_default ^ 1;
4439
4440 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
4441 matching encountered yet". It gets changed to REQ_NONE if we hit something that
4442 matches a non-fixed char first char; reqchar just remains unset if we never
4443 find one.
4444
4445 When we hit a repeat whose minimum is zero, we may have to adjust these values
4446 to take the zero repeat into account. This is implemented by setting them to
4447 zerofirstbyte and zeroreqchar when such a repeat is encountered. The individual
4448 item types that can be repeated set these backoff variables appropriately. */
4449
4450 firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
4451 firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
4452
4453 /* The variable req_caseopt contains either the REQ_CASELESS value
4454 or zero, according to the current setting of the caseless flag. The
4455 REQ_CASELESS leaves the lower 28 bit empty. It is added into the
4456 firstchar or reqchar variables to record the case status of the
4457 value. This is used only for ASCII characters. */
4458
4459 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
4460
4461 /* Switch on next character until the end of the branch */
4462
4463 for (;; ptr++)
4464 {
4465 BOOL negate_class;
4466 BOOL should_flip_negation;
4467 BOOL possessive_quantifier;
4468 BOOL is_quantifier;
4469 BOOL is_recurse;
4470 BOOL reset_bracount;
4471 int class_has_8bitchar;
4472 int class_one_char;
4473 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4474 BOOL xclass_has_prop;
4475 #endif
4476 int newoptions;
4477 int recno;
4478 int refsign;
4479 int skipbytes;
4480 pcre_uint32 subreqchar, subfirstchar;
4481 pcre_int32 subreqcharflags, subfirstcharflags;
4482 int terminator;
4483 unsigned int mclength;
4484 unsigned int tempbracount;
4485 pcre_uint32 ec;
4486 pcre_uchar mcbuffer[8];
4487
4488 /* Get next character in the pattern */
4489
4490 c = *ptr;
4491
4492 /* If we are at the end of a nested substitution, revert to the outer level
4493 string. Nesting only happens one level deep. */
4494
4495 if (c == CHAR_NULL && nestptr != NULL)
4496 {
4497 ptr = nestptr;
4498 nestptr = NULL;
4499 c = *ptr;
4500 }
4501
4502 /* If we are in the pre-compile phase, accumulate the length used for the
4503 previous cycle of this loop. */
4504
4505 if (lengthptr != NULL)
4506 {
4507 #ifdef PCRE_DEBUG
4508 if (code > cd->hwm) cd->hwm = code; /* High water info */
4509 #endif
4510 if (code > cd->start_workspace + cd->workspace_size -
4511 WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
4512 {
4513 *errorcodeptr = ERR52;
4514 goto FAILED;
4515 }
4516
4517 /* There is at least one situation where code goes backwards: this is the
4518 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
4519 the class is simply eliminated. However, it is created first, so we have to
4520 allow memory for it. Therefore, don't ever reduce the length at this point.
4521 */
4522
4523 if (code < last_code) code = last_code;
4524
4525 /* Paranoid check for integer overflow */
4526
4527 if (OFLOW_MAX - *lengthptr < code - last_code)
4528 {
4529 *errorcodeptr = ERR20;
4530 goto FAILED;
4531 }
4532
4533 *lengthptr += (int)(code - last_code);
4534 DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
4535 (int)(code - last_code), c, c));
4536
4537 /* If "previous" is set and it is not at the start of the work space, move
4538 it back to there, in order to avoid filling up the work space. Otherwise,
4539 if "previous" is NULL, reset the current code pointer to the start. */
4540
4541 if (previous != NULL)
4542 {
4543 if (previous > orig_code)
4544 {
4545 memmove(orig_code, previous, IN_UCHARS(code - previous));
4546 code -= previous - orig_code;
4547 previous = orig_code;
4548 }
4549 }
4550 else code = orig_code;
4551
4552 /* Remember where this code item starts so we can pick up the length
4553 next time round. */
4554
4555 last_code = code;
4556 }
4557
4558 /* In the real compile phase, just check the workspace used by the forward
4559 reference list. */
4560
4561 else if (cd->hwm > cd->start_workspace + cd->workspace_size -
4562 WORK_SIZE_SAFETY_MARGIN)
4563 {
4564 *errorcodeptr = ERR52;
4565 goto FAILED;
4566 }
4567
4568 /* If in \Q...\E, check for the end; if not, we have a literal */
4569
4570 if (inescq && c != CHAR_NULL)
4571 {
4572 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4573 {
4574 inescq = FALSE;
4575 ptr++;
4576 continue;
4577 }
4578 else
4579 {
4580 if (previous_callout != NULL)
4581 {
4582 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
4583 complete_callout(previous_callout, ptr, cd);
4584 previous_callout = NULL;
4585 }
4586 if ((options & PCRE_AUTO_CALLOUT) != 0)
4587 {
4588 previous_callout = code;
4589 code = auto_callout(code, ptr, cd);
4590 }
4591 goto NORMAL_CHAR;
4592 }
4593 /* Control does not reach here. */
4594 }
4595
4596 /* In extended mode, skip white space and comments. We need a loop in order
4597 to check for more white space and more comments after a comment. */
4598
4599 if ((options & PCRE_EXTENDED) != 0)
4600 {
4601 for (;;)
4602 {
4603 while (MAX_255(c) && (cd->ctypes[c] & ctype_space) != 0) c = *(++ptr);
4604 if (c != CHAR_NUMBER_SIGN) break;
4605 ptr++;
4606 while (*ptr != CHAR_NULL)
4607 {
4608 if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */
4609 { /* IS_NEWLINE sets cd->nllen. */
4610 ptr += cd->nllen;
4611 break;
4612 }
4613 ptr++;
4614 #ifdef SUPPORT_UTF
4615 if (utf) FORWARDCHAR(ptr);
4616 #endif
4617 }
4618 c = *ptr; /* Either NULL or the char after a newline */
4619 }
4620 }
4621
4622 /* See if the next thing is a quantifier. */
4623
4624 is_quantifier =
4625 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4626 (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4627
4628 /* Fill in length of a previous callout, except when the next thing is a
4629 quantifier or when processing a property substitution string in UCP mode. */
4630
4631 if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4632 after_manual_callout-- <= 0)
4633 {
4634 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
4635 complete_callout(previous_callout, ptr, cd);
4636 previous_callout = NULL;
4637 }
4638
4639 /* Create auto callout, except for quantifiers, or while processing property
4640 strings that are substituted for \w etc in UCP mode. */
4641
4642 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
4643 {
4644 previous_callout = code;
4645 code = auto_callout(code, ptr, cd);
4646 }
4647
4648 /* Process the next pattern item. */
4649
4650 switch(c)
4651 {
4652 /* ===================================================================*/
4653 case CHAR_NULL: /* The branch terminates at string end */
4654 case CHAR_VERTICAL_LINE: /* or | or ) */
4655 case CHAR_RIGHT_PARENTHESIS:
4656 *firstcharptr = firstchar;
4657 *firstcharflagsptr = firstcharflags;
4658 *reqcharptr = reqchar;
4659 *reqcharflagsptr = reqcharflags;
4660 *codeptr = code;
4661 *ptrptr = ptr;
4662 if (lengthptr != NULL)
4663 {
4664 if (OFLOW_MAX - *lengthptr < code - last_code)
4665 {
4666 *errorcodeptr = ERR20;
4667 goto FAILED;
4668 }
4669 *lengthptr += (int)(code - last_code); /* To include callout length */
4670 DPRINTF((">> end branch\n"));
4671 }
4672 return TRUE;
4673
4674
4675 /* ===================================================================*/
4676 /* Handle single-character metacharacters. In multiline mode, ^ disables
4677 the setting of any following char as a first character. */
4678
4679 case CHAR_CIRCUMFLEX_ACCENT:
4680 previous = NULL;
4681 if ((options & PCRE_MULTILINE) != 0)
4682 {
4683 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4684 *code++ = OP_CIRCM;
4685 }
4686 else *code++ = OP_CIRC;
4687 break;
4688
4689 case CHAR_DOLLAR_SIGN:
4690 previous = NULL;
4691 *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
4692 break;
4693
4694 /* There can never be a first char if '.' is first, whatever happens about
4695 repeats. The value of reqchar doesn't change either. */
4696
4697 case CHAR_DOT:
4698 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4699 zerofirstchar = firstchar;
4700 zerofirstcharflags = firstcharflags;
4701 zeroreqchar = reqchar;
4702 zeroreqcharflags = reqcharflags;
4703 previous = code;
4704 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
4705 break;
4706
4707
4708 /* ===================================================================*/
4709 /* Character classes. If the included characters are all < 256, we build a
4710 32-byte bitmap of the permitted characters, except in the special case
4711 where there is only one such character. For negated classes, we build the
4712 map as usual, then invert it at the end. However, we use a different opcode
4713 so that data characters > 255 can be handled correctly.
4714
4715 If the class contains characters outside the 0-255 range, a different
4716 opcode is compiled. It may optionally have a bit map for characters < 256,
4717 but those above are are explicitly listed afterwards. A flag byte tells
4718 whether the bitmap is present, and whether this is a negated class or not.
4719
4720 In JavaScript compatibility mode, an isolated ']' causes an error. In
4721 default (Perl) mode, it is treated as a data character. */
4722
4723 case CHAR_RIGHT_SQUARE_BRACKET:
4724 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4725 {
4726 *errorcodeptr = ERR64;
4727 goto FAILED;
4728 }
4729 goto NORMAL_CHAR;
4730
4731 /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
4732 used for "start of word" and "end of word". As these are otherwise illegal
4733 sequences, we don't break anything by recognizing them. They are replaced
4734 by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
4735 erroneous and are handled by the normal code below. */
4736
4737 case CHAR_LEFT_SQUARE_BRACKET:
4738 if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
4739 {
4740 nestptr = ptr + 7;
4741 ptr = sub_start_of_word - 1;
4742 continue;
4743 }
4744
4745 if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
4746 {
4747 nestptr = ptr + 7;
4748 ptr = sub_end_of_word - 1;
4749 continue;
4750 }
4751
4752 /* Handle a real character class. */
4753
4754 previous = code;
4755
4756 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
4757 they are encountered at the top level, so we'll do that too. */
4758
4759 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4760 ptr[1] == CHAR_EQUALS_SIGN) &&
4761 check_posix_syntax(ptr, &tempptr))
4762 {
4763 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
4764 goto FAILED;
4765 }
4766
4767 /* If the first character is '^', set the negation flag and skip it. Also,
4768 if the first few characters (either before or after ^) are \Q\E or \E we
4769 skip them too. This makes for compatibility with Perl. */
4770
4771 negate_class = FALSE;
4772 for (;;)
4773 {
4774 c = *(++ptr);
4775 if (c == CHAR_BACKSLASH)
4776 {
4777 if (ptr[1] == CHAR_E)
4778 ptr++;
4779 else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4780 ptr += 3;
4781 else
4782 break;
4783 }
4784 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4785 negate_class = TRUE;
4786 else break;
4787 }
4788
4789 /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
4790 an initial ']' is taken as a data character -- the code below handles
4791 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
4792 [^] must match any character, so generate OP_ALLANY. */
4793
4794 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4795 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4796 {
4797 *code++ = negate_class? OP_ALLANY : OP_FAIL;
4798 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4799 zerofirstchar = firstchar;
4800 zerofirstcharflags = firstcharflags;
4801 break;
4802 }
4803
4804 /* If a class contains a negative special such as \S, we need to flip the
4805 negation flag at the end, so that support for characters > 255 works
4806 correctly (they are all included in the class). */
4807
4808 should_flip_negation = FALSE;
4809
4810 /* Extended class (xclass) will be used when characters > 255
4811 might match. */
4812
4813 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4814 xclass = FALSE;
4815 class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
4816 class_uchardata_base = class_uchardata; /* Save the start */
4817 #endif
4818
4819 /* For optimization purposes, we track some properties of the class:
4820 class_has_8bitchar will be non-zero if the class contains at least one <
4821 256 character; class_one_char will be 1 if the class contains just one
4822 character; xclass_has_prop will be TRUE if unicode property checks
4823 are present in the class. */
4824
4825 class_has_8bitchar = 0;
4826 class_one_char = 0;
4827 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4828 xclass_has_prop = FALSE;
4829 #endif
4830
4831 /* Initialize the 32-char bit map to all zeros. We build the map in a
4832 temporary bit of memory, in case the class contains fewer than two
4833 8-bit characters because in that case the compiled code doesn't use the bit
4834 map. */
4835
4836 memset(classbits, 0, 32 * sizeof(pcre_uint8));
4837
4838 /* Process characters until ] is reached. By writing this as a "do" it
4839 means that an initial ] is taken as a data character. At the start of the
4840 loop, c contains the first byte of the character. */
4841
4842 if (c != CHAR_NULL) do
4843 {
4844 const pcre_uchar *oldptr;
4845
4846 #ifdef SUPPORT_UTF
4847 if (utf && HAS_EXTRALEN(c))
4848 { /* Braces are required because the */
4849 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
4850 }
4851 #endif
4852
4853 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4854 /* In the pre-compile phase, accumulate the length of any extra
4855 data and reset the pointer. This is so that very large classes that
4856 contain a zillion > 255 characters no longer overwrite the work space
4857 (which is on the stack). We have to remember that there was XCLASS data,
4858 however. */
4859
4860 if (lengthptr != NULL && class_uchardata > class_uchardata_base)
4861 {
4862 xclass = TRUE;
4863 *lengthptr += class_uchardata - class_uchardata_base;
4864 class_uchardata = class_uchardata_base;
4865 }
4866 #endif
4867
4868 /* Inside \Q...\E everything is literal except \E */
4869
4870 if (inescq)
4871 {
4872 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
4873 {
4874 inescq = FALSE; /* Reset literal state */
4875 ptr++; /* Skip the 'E' */
4876 continue; /* Carry on with next */
4877 }
4878 goto CHECK_RANGE; /* Could be range if \E follows */
4879 }
4880
4881 /* Handle POSIX class names. Perl allows a negation extension of the
4882 form [:^name:]. A square bracket that doesn't match the syntax is
4883 treated as a literal. We also recognize the POSIX constructions
4884 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
4885 5.6 and 5.8 do. */
4886
4887 if (c == CHAR_LEFT_SQUARE_BRACKET &&
4888 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4889 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
4890 {
4891 BOOL local_negate = FALSE;
4892 int posix_class, taboffset, tabopt;
4893 register const pcre_uint8 *cbits = cd->cbits;
4894 pcre_uint8 pbits[32];
4895
4896 if (ptr[1] != CHAR_COLON)
4897 {
4898 *errorcodeptr = ERR31;
4899 goto FAILED;
4900 }
4901
4902 ptr += 2;
4903 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
4904 {
4905 local_negate = TRUE;
4906 should_flip_negation = TRUE; /* Note negative special */
4907 ptr++;
4908 }
4909
4910 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
4911 if (posix_class < 0)
4912 {
4913 *errorcodeptr = ERR30;
4914 goto FAILED;
4915 }
4916
4917 /* If matching is caseless, upper and lower are converted to
4918 alpha. This relies on the fact that the class table starts with
4919 alpha, lower, upper as the first 3 entries. */
4920
4921 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
4922 posix_class = 0;
4923
4924 /* When PCRE_UCP is set, some of the POSIX classes are converted to
4925 different escape sequences that use Unicode properties \p or \P. Others
4926 that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
4927 directly. */
4928
4929 #ifdef SUPPORT_UCP
4930 if ((options & PCRE_UCP) != 0)
4931 {
4932 unsigned int ptype = 0;
4933 int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
4934
4935 /* The posix_substitutes table specifies which POSIX classes can be
4936 converted to \p or \P items. */
4937
4938 if (posix_substitutes[pc] != NULL)
4939 {
4940 nestptr = tempptr + 1;
4941 ptr = posix_substitutes[pc] - 1;
4942 continue;
4943 }
4944
4945 /* There are three other classes that generate special property calls
4946 that are recognized only in an XCLASS. */
4947
4948 else switch(posix_class)
4949 {
4950 case PC_GRAPH:
4951 ptype = PT_PXGRAPH;
4952 /* Fall through */
4953 case PC_PRINT:
4954 if (ptype == 0) ptype = PT_PXPRINT;
4955 /* Fall through */
4956 case PC_PUNCT:
4957 if (ptype == 0) ptype = PT_PXPUNCT;
4958 *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
4959 *class_uchardata++ = ptype;
4960 *class_uchardata++ = 0;
4961 xclass_has_prop = TRUE;
4962 ptr = tempptr + 1;
4963 continue;
4964
4965 /* For all other POSIX classes, no special action is taken in UCP
4966 mode. Fall through to the non_UCP case. */
4967
4968 default:
4969 break;
4970 }
4971 }
4972 #endif
4973 /* In the non-UCP case, or when UCP makes no difference, we build the
4974 bit map for the POSIX class in a chunk of local store because we may be
4975 adding and subtracting from it, and we don't want to subtract bits that
4976 may be in the main map already. At the end we or the result into the
4977 bit map that is being built. */
4978
4979 posix_class *= 3;
4980
4981 /* Copy in the first table (always present) */
4982
4983 memcpy(pbits, cbits + posix_class_maps[posix_class],
4984 32 * sizeof(pcre_uint8));
4985
4986 /* If there is a second table, add or remove it as required. */
4987
4988 taboffset = posix_class_maps[posix_class + 1];
4989 tabopt = posix_class_maps[posix_class + 2];
4990
4991 if (taboffset >= 0)
4992 {
4993 if (tabopt >= 0)
4994 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
4995 else
4996 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
4997 }
4998
4999 /* Now see if we need to remove any special characters. An option
5000 value of 1 removes vertical space and 2 removes underscore. */
5001
5002 if (tabopt < 0) tabopt = -tabopt;
5003 if (tabopt == 1) pbits[1] &= ~0x3c;
5004 else if (tabopt == 2) pbits[11] &= 0x7f;
5005
5006 /* Add the POSIX table or its complement into the main table that is
5007 being built and we are done. */
5008
5009 if (local_negate)
5010 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
5011 else
5012 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
5013
5014 ptr = tempptr + 1;
5015 /* Every class contains at least one < 256 character. */
5016 class_has_8bitchar = 1;
5017 /* Every class contains at least two characters. */
5018 class_one_char = 2;
5019 continue; /* End of POSIX syntax handling */
5020 }
5021
5022 /* Backslash may introduce a single character, or it may introduce one
5023 of the specials, which just set a flag. The sequence \b is a special
5024 case. Inside a class (and only there) it is treated as backspace. We
5025 assume that other escapes have more than one character in them, so
5026 speculatively set both class_has_8bitchar and class_one_char bigger
5027 than one. Unrecognized escapes fall through and are either treated
5028 as literal characters (by default), or are faulted if
5029 PCRE_EXTRA is set. */
5030
5031 if (c == CHAR_BACKSLASH)
5032 {
5033 escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
5034 TRUE);
5035 if (*errorcodeptr != 0) goto FAILED;
5036 if (escape == 0) c = ec;
5037 else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
5038 else if (escape == ESC_N) /* \N is not supported in a class */
5039 {
5040 *errorcodeptr = ERR71;
5041 goto FAILED;
5042 }
5043 else if (escape == ESC_Q) /* Handle start of quoted string */
5044 {
5045 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5046 {
5047 ptr += 2; /* avoid empty string */
5048 }
5049 else inescq = TRUE;
5050 continue;
5051 }
5052 else if (escape == ESC_E) continue; /* Ignore orphan \E */
5053
5054 else
5055 {
5056 register const pcre_uint8 *cbits = cd->cbits;
5057 /* Every class contains at least two < 256 characters. */
5058 class_has_8bitchar++;
5059 /* Every class contains at least two characters. */
5060 class_one_char += 2;
5061
5062 switch (escape)
5063 {
5064 #ifdef SUPPORT_UCP
5065 case ESC_du: /* These are the values given for \d etc */
5066 case ESC_DU: /* when PCRE_UCP is set. We replace the */
5067 case ESC_wu: /* escape sequence with an appropriate \p */
5068 case ESC_WU: /* or \P to test Unicode properties instead */
5069 case ESC_su: /* of the default ASCII testing. */
5070 case ESC_SU:
5071 nestptr = ptr;
5072 ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */
5073 class_has_8bitchar--; /* Undo! */
5074 continue;
5075 #endif
5076 case ESC_d:
5077 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
5078 continue;
5079
5080 case ESC_D:
5081 should_flip_negation = TRUE;
5082 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
5083 continue;
5084
5085 case ESC_w:
5086 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
5087 continue;
5088
5089 case ESC_W:
5090 should_flip_negation = TRUE;
5091 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
5092 continue;
5093
5094 /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5095 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5096 previously set by something earlier in the character class.
5097 Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5098 we could just adjust the appropriate bit. From PCRE 8.34 we no
5099 longer treat \s and \S specially. */
5100
5101 case ESC_s:
5102 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
5103 continue;
5104
5105 case ESC_S:
5106 should_flip_negation = TRUE;
5107 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
5108 continue;
5109
5110 /* The rest apply in both UCP and non-UCP cases. */
5111
5112 case ESC_h:
5113 (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5114 PRIV(hspace_list), NOTACHAR);
5115 continue;
5116
5117 case ESC_H:
5118 (void)add_not_list_to_class(classbits, &class_uchardata, options,
5119 cd, PRIV(hspace_list));
5120 continue;
5121
5122 case ESC_v:
5123 (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5124 PRIV(vspace_list), NOTACHAR);
5125 continue;
5126
5127 case ESC_V:
5128 (void)add_not_list_to_class(classbits, &class_uchardata, options,
5129 cd, PRIV(vspace_list));
5130 continue;
5131
5132 #ifdef SUPPORT_UCP
5133 case ESC_p:
5134 case ESC_P:
5135 {
5136 BOOL negated;
5137 unsigned int ptype = 0, pdata = 0;
5138 if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
5139 goto FAILED;
5140 *class_uchardata++ = ((escape == ESC_p) != negated)?
5141 XCL_PROP : XCL_NOTPROP;
5142 *class_uchardata++ = ptype;
5143 *class_uchardata++ = pdata;
5144 xclass_has_prop = TRUE;
5145 class_has_8bitchar--; /* Undo! */
5146 continue;
5147 }
5148 #endif
5149 /* Unrecognized escapes are faulted if PCRE is running in its
5150 strict mode. By default, for compatibility with Perl, they are
5151 treated as literals. */
5152
5153 default:
5154 if ((options & PCRE_EXTRA) != 0)
5155 {
5156 *errorcodeptr = ERR7;
5157 goto FAILED;
5158 }
5159 class_has_8bitchar--; /* Undo the speculative increase. */
5160 class_one_char -= 2; /* Undo the speculative increase. */
5161 c = *ptr; /* Get the final character and fall through */
5162 break;
5163 }
5164 }
5165
5166 /* Fall through if the escape just defined a single character (c >= 0).
5167 This may be greater than 256. */
5168
5169 escape = 0;
5170
5171 } /* End of backslash handling */
5172
5173 /* A character may be followed by '-' to form a range. However, Perl does
5174 not permit ']' to be the end of the range. A '-' character at the end is
5175 treated as a literal. Perl ignores orphaned \E sequences entirely. The
5176 code for handling \Q and \E is messy. */
5177
5178 CHECK_RANGE:
5179 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5180 {
5181 inescq = FALSE;
5182 ptr += 2;
5183 }
5184 oldptr = ptr;
5185
5186 /* Remember if \r or \n were explicitly used */
5187
5188 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5189
5190 /* Check for range */
5191
5192 if (!inescq && ptr[1] == CHAR_MINUS)
5193 {
5194 pcre_uint32 d;
5195 ptr += 2;
5196 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
5197
5198 /* If we hit \Q (not followed by \E) at this point, go into escaped
5199 mode. */
5200
5201 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
5202 {
5203 ptr += 2;
5204 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
5205 { ptr += 2; continue; }
5206 inescq = TRUE;
5207 break;
5208 }
5209
5210 /* Minus (hyphen) at the end of a class is treated as a literal, so put
5211 back the pointer and jump to handle the character that preceded it. */
5212
5213 if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
5214 {
5215 ptr = oldptr;
5216 goto CLASS_SINGLE_CHARACTER;
5217 }
5218
5219 /* Otherwise, we have a potential range; pick up the next character */
5220
5221 #ifdef SUPPORT_UTF
5222 if (utf)
5223 { /* Braces are required because the */
5224 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
5225 }
5226 else
5227 #endif
5228 d = *ptr; /* Not UTF-8 mode */
5229
5230 /* The second part of a range can be a single-character escape
5231 sequence, but not any of the other escapes. Perl treats a hyphen as a
5232 literal in such circumstances. However, in Perl's warning mode, a
5233 warning is given, so PCRE now faults it as it is almost certainly a
5234 mistake on the user's part. */
5235
5236 if (!inescq)
5237 {
5238 if (d == CHAR_BACKSLASH)
5239 {
5240 int descape;
5241 descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
5242 if (*errorcodeptr != 0) goto FAILED;
5243
5244 /* 0 means a character was put into d; \b is backspace; any other
5245 special causes an error. */
5246
5247 if (descape != 0)
5248 {
5249 if (descape == ESC_b) d = CHAR_BS; else
5250 {
5251 *errorcodeptr = ERR83;
5252 goto FAILED;
5253 }
5254 }
5255 }
5256
5257 /* A hyphen followed by a POSIX class is treated in the same way. */
5258
5259 else if (d == CHAR_LEFT_SQUARE_BRACKET &&
5260 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
5261 ptr[1] == CHAR_EQUALS_SIGN) &&
5262 check_posix_syntax(ptr, &tempptr))
5263 {
5264 *errorcodeptr = ERR83;
5265 goto FAILED;
5266 }
5267 }
5268
5269 /* Check that the two values are in the correct order. Optimize
5270 one-character ranges. */
5271
5272 if (d < c)
5273 {
5274 *errorcodeptr = ERR8;
5275 goto FAILED;
5276 }
5277 if (d == c) goto CLASS_SINGLE_CHARACTER; /* A few lines below */
5278
5279 /* We have found a character range, so single character optimizations
5280 cannot be done anymore. Any value greater than 1 indicates that there
5281 is more than one character. */
5282
5283 class_one_char = 2;
5284
5285 /* Remember an explicit \r or \n, and add the range to the class. */
5286
5287 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5288
5289 class_has_8bitchar +=
5290 add_to_class(classbits, &class_uchardata, options, cd, c, d);
5291
5292 continue; /* Go get the next char in the class */
5293 }
5294
5295 /* Handle a single character - we can get here for a normal non-escape
5296 char, or after \ that introduces a single character or for an apparent
5297 range that isn't. Only the value 1 matters for class_one_char, so don't
5298 increase it if it is already 2 or more ... just in case there's a class
5299 with a zillion characters in it. */
5300
5301 CLASS_SINGLE_CHARACTER:
5302 if (class_one_char < 2) class_one_char++;
5303
5304 /* If class_one_char is 1, we have the first single character in the
5305 class, and there have been no prior ranges, or XCLASS items generated by
5306 escapes. If this is the final character in the class, we can optimize by
5307 turning the item into a 1-character OP_CHAR[I] if it's positive, or
5308 OP_NOT[I] if it's negative. In the positive case, it can cause firstchar
5309 to be set. Otherwise, there can be no first char if this item is first,
5310 whatever repeat count may follow. In the case of reqchar, save the
5311 previous value for reinstating. */
5312
5313 if (class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
5314 {
5315 ptr++;
5316 zeroreqchar = reqchar;
5317 zeroreqcharflags = reqcharflags;
5318
5319 if (negate_class)
5320 {
5321 #ifdef SUPPORT_UCP
5322 int d;
5323 #endif
5324 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5325 zerofirstchar = firstchar;
5326 zerofirstcharflags = firstcharflags;
5327
5328 /* For caseless UTF-8 mode when UCP support is available, check
5329 whether this character has more than one other case. If so, generate
5330 a special OP_NOTPROP item instead of OP_NOTI. */
5331
5332 #ifdef SUPPORT_UCP
5333 if (utf && (options & PCRE_CASELESS) != 0 &&
5334 (d = UCD_CASESET(c)) != 0)
5335 {
5336 *code++ = OP_NOTPROP;
5337 *code++ = PT_CLIST;
5338 *code++ = d;
5339 }
5340 else
5341 #endif
5342 /* Char has only one other case, or UCP not available */
5343
5344 {
5345 *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
5346 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5347 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5348 code += PRIV(ord2utf)(c, code);
5349 else
5350 #endif
5351 *code++ = c;
5352 }
5353
5354 /* We are finished with this character class */
5355
5356 goto END_CLASS;
5357 }
5358
5359 /* For a single, positive character, get the value into mcbuffer, and
5360 then we can handle this with the normal one-character code. */
5361
5362 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5363 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5364 mclength = PRIV(ord2utf)(c, mcbuffer);
5365 else
5366 #endif
5367 {
5368 mcbuffer[0] = c;
5369 mclength = 1;
5370 }
5371 goto ONE_CHAR;
5372 } /* End of 1-char optimization */
5373
5374 /* There is more than one character in the class, or an XCLASS item
5375 has been generated. Add this character to the class. */
5376
5377 class_has_8bitchar +=
5378 add_to_class(classbits, &class_uchardata, options, cd, c, c);
5379 }
5380
5381 /* Loop until ']' reached. This "while" is the end of the "do" far above.
5382 If we are at the end of an internal nested string, revert to the outer
5383 string. */
5384
5385 while (((c = *(++ptr)) != CHAR_NULL ||
5386 (nestptr != NULL &&
5387 (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
5388 (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
5389
5390 /* Check for missing terminating ']' */
5391
5392 if (c == CHAR_NULL)
5393 {
5394 *errorcodeptr = ERR6;
5395 goto FAILED;
5396 }
5397
5398 /* We will need an XCLASS if data has been placed in class_uchardata. In
5399 the second phase this is a sufficient test. However, in the pre-compile
5400 phase, class_uchardata gets emptied to prevent workspace overflow, so it
5401 only if the very last character in the class needs XCLASS will it contain
5402 anything at this point. For this reason, xclass gets set TRUE above when
5403 uchar_classdata is emptied, and that's why this code is the way it is here
5404 instead of just doing a test on class_uchardata below. */
5405
5406 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5407 if (class_uchardata > class_uchardata_base) xclass = TRUE;
5408 #endif
5409
5410 /* If this is the first thing in the branch, there can be no first char
5411 setting, whatever the repeat count. Any reqchar setting must remain
5412 unchanged after any kind of repeat. */
5413
5414 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5415 zerofirstchar = firstchar;
5416 zerofirstcharflags = firstcharflags;
5417 zeroreqchar = reqchar;
5418 zeroreqcharflags = reqcharflags;
5419
5420 /* If there are characters with values > 255, we have to compile an
5421 extended class, with its own opcode, unless there was a negated special
5422 such as \S in the class, and PCRE_UCP is not set, because in that case all
5423 characters > 255 are in the class, so any that were explicitly given as
5424 well can be ignored. If (when there are explicit characters > 255 that must
5425 be listed) there are no characters < 256, we can omit the bitmap in the
5426 actual compiled code. */
5427
5428 #ifdef SUPPORT_UTF
5429 if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
5430 #elif !defined COMPILE_PCRE8
5431 if (xclass && !should_flip_negation)
5432 #endif
5433 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5434 {
5435 *class_uchardata++ = XCL_END; /* Marks the end of extra data */
5436 *code++ = OP_XCLASS;
5437 code += LINK_SIZE;
5438 *code = negate_class? XCL_NOT:0;
5439 if (xclass_has_prop) *code |= XCL_HASPROP;
5440
5441 /* If the map is required, move up the extra data to make room for it;
5442 otherwise just move the code pointer to the end of the extra data. */
5443
5444 if (class_has_8bitchar > 0)
5445 {
5446 *code++ |= XCL_MAP;
5447 memmove(code + (32 / sizeof(pcre_uchar)), code,
5448 IN_UCHARS(class_uchardata - code));
5449 if (negate_class && !xclass_has_prop)
5450 for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5451 memcpy(code, classbits, 32);
5452 code = class_uchardata + (32 / sizeof(pcre_uchar));
5453 }
5454 else code = class_uchardata;
5455
5456 /* Now fill in the complete length of the item */
5457
5458 PUT(previous, 1, (int)(code - previous));
5459 break; /* End of class handling */
5460 }
5461 #endif
5462
5463 /* If there are no characters > 255, or they are all to be included or
5464 excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5465 whole class was negated and whether there were negative specials such as \S
5466 (non-UCP) in the class. Then copy the 32-byte map into the code vector,
5467 negating it if necessary. */
5468
5469 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5470 if (lengthptr == NULL) /* Save time in the pre-compile phase */
5471 {
5472 if (negate_class)
5473 for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5474 memcpy(code, classbits, 32);
5475 }
5476 code += 32 / sizeof(pcre_uchar);
5477
5478 END_CLASS:
5479 break;
5480
5481
5482 /* ===================================================================*/
5483 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
5484 has been tested above. */
5485
5486 case CHAR_LEFT_CURLY_BRACKET:
5487 if (!is_quantifier) goto NORMAL_CHAR;
5488 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
5489 if (*errorcodeptr != 0) goto FAILED;
5490 goto REPEAT;
5491
5492 case CHAR_ASTERISK:
5493 repeat_min = 0;
5494 repeat_max = -1;
5495 goto REPEAT;
5496
5497 case CHAR_PLUS:
5498 repeat_min = 1;
5499 repeat_max = -1;
5500 goto REPEAT;
5501
5502 case CHAR_QUESTION_MARK:
5503 repeat_min = 0;
5504 repeat_max = 1;
5505
5506 REPEAT:
5507 if (previous == NULL)
5508 {
5509 *errorcodeptr = ERR9;
5510 goto FAILED;
5511 }
5512
5513 if (repeat_min == 0)
5514 {
5515 firstchar = zerofirstchar; /* Adjust for zero repeat */
5516 firstcharflags = zerofirstcharflags;
5517 reqchar = zeroreqchar; /* Ditto */
5518 reqcharflags = zeroreqcharflags;
5519 }
5520
5521 /* Remember whether this is a variable length repeat */
5522
5523 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
5524
5525 op_type = 0; /* Default single-char op codes */
5526 possessive_quantifier = FALSE; /* Default not possessive quantifier */
5527
5528 /* Save start of previous item, in case we have to move it up in order to
5529 insert something before it. */
5530
5531 tempcode = previous;
5532
5533 /* Before checking for a possessive quantifier, we must skip over
5534 whitespace and comments in extended mode because Perl allows white space at
5535 this point. */
5536
5537 if ((options & PCRE_EXTENDED) != 0)
5538 {
5539 const pcre_uchar *p = ptr + 1;
5540 for (;;)
5541 {
5542 while (MAX_255(*p) && (cd->ctypes[*p] & ctype_space) != 0) p++;
5543 if (*p != CHAR_NUMBER_SIGN) break;
5544 p++;
5545 while (*p != CHAR_NULL)
5546 {
5547 if (IS_NEWLINE(p)) /* For non-fixed-length newline cases, */
5548 { /* IS_NEWLINE sets cd->nllen. */
5549 p += cd->nllen;
5550 break;
5551 }
5552 p++;
5553 #ifdef SUPPORT_UTF
5554 if (utf) FORWARDCHAR(p);
5555 #endif
5556 } /* Loop for comment characters */
5557 } /* Loop for multiple comments */
5558 ptr = p - 1; /* Character before the next significant one. */
5559 }
5560
5561 /* If the next character is '+', we have a possessive quantifier. This
5562 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
5563 If the next character is '?' this is a minimizing repeat, by default,
5564 but if PCRE_UNGREEDY is set, it works the other way round. We change the
5565 repeat type to the non-default. */
5566
5567 if (ptr[1] == CHAR_PLUS)
5568 {
5569 repeat_type = 0; /* Force greedy */
5570 possessive_quantifier = TRUE;
5571 ptr++;
5572 }
5573 else if (ptr[1] == CHAR_QUESTION_MARK)
5574 {
5575 repeat_type = greedy_non_default;
5576 ptr++;
5577 }
5578 else repeat_type = greedy_default;
5579
5580 /* If previous was a recursion call, wrap it in atomic brackets so that
5581 previous becomes the atomic group. All recursions were so wrapped in the
5582 past, but it no longer happens for non-repeated recursions. In fact, the
5583 repeated ones could be re-implemented independently so as not to need this,
5584 but for the moment we rely on the code for repeating groups. */
5585
5586 if (*previous == OP_RECURSE)
5587 {
5588 memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
5589 *previous = OP_ONCE;
5590 PUT(previous, 1, 2 + 2*LINK_SIZE);
5591 previous[2 + 2*LINK_SIZE] = OP_KET;
5592 PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
5593 code += 2 + 2 * LINK_SIZE;
5594 length_prevgroup = 3 + 3*LINK_SIZE;
5595
5596 /* When actually compiling, we need to check whether this was a forward
5597 reference, and if so, adjust the offset. */
5598
5599 if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
5600 {
5601 int offset = GET(cd->hwm, -LINK_SIZE);
5602 if (offset == previous + 1 - cd->start_code)
5603 PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
5604 }
5605 }
5606
5607 /* Now handle repetition for the different types of item. */
5608
5609 /* If previous was a character or negated character match, abolish the item
5610 and generate a repeat item instead. If a char item has a minimum of more
5611 than one, ensure that it is set in reqchar - it might not be if a sequence
5612 such as x{3} is the first thing in a branch because the x will have gone
5613 into firstchar instead. */
5614
5615 if (*previous == OP_CHAR || *previous == OP_CHARI
5616 || *previous == OP_NOT || *previous == OP_NOTI)
5617 {
5618 switch (*previous)
5619 {
5620 default: /* Make compiler happy. */
5621 case OP_CHAR: op_type = OP_STAR - OP_STAR; break;
5622 case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
5623 case OP_NOT: op_type = OP_NOTSTAR - OP_STAR; break;
5624 case OP_NOTI: op_type = OP_NOTSTARI - OP_STAR; break;
5625 }
5626
5627 /* Deal with UTF characters that take up more than one character. It's
5628 easier to write this out separately than try to macrify it. Use c to
5629 hold the length of the character in bytes, plus UTF_LENGTH to flag that
5630 it's a length rather than a small character. */
5631
5632 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5633 if (utf && NOT_FIRSTCHAR(code[-1]))
5634 {
5635 pcre_uchar *lastchar = code - 1;
5636 BACKCHAR(lastchar);
5637 c = (int)(code - lastchar); /* Length of UTF-8 character */
5638 memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
5639 c |= UTF_LENGTH; /* Flag c as a length */
5640 }
5641 else
5642 #endif /* SUPPORT_UTF */
5643
5644 /* Handle the case of a single charater - either with no UTF support, or
5645 with UTF disabled, or for a single character UTF character. */
5646 {
5647 c = code[-1];
5648 if (*previous <= OP_CHARI && repeat_min > 1)
5649 {
5650 reqchar = c;
5651 reqcharflags = req_caseopt | cd->req_varyopt;
5652 }
5653 }
5654
5655 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
5656 }
5657
5658 /* If previous was a character type match (\d or similar), abolish it and
5659 create a suitable repeat item. The code is shared with single-character
5660 repeats by setting op_type to add a suitable offset into repeat_type. Note
5661 the the Unicode property types will be present only when SUPPORT_UCP is
5662 defined, but we don't wrap the little bits of code here because it just
5663 makes it horribly messy. */
5664
5665 else if (*previous < OP_EODN)
5666 {
5667 pcre_uchar *oldcode;
5668 int prop_type, prop_value;
5669 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
5670 c = *previous;
5671
5672 OUTPUT_SINGLE_REPEAT:
5673 if (*previous == OP_PROP || *previous == OP_NOTPROP)
5674 {
5675 prop_type = previous[1];
5676 prop_value = previous[2];
5677 }
5678 else prop_type = prop_value = -1;
5679
5680 oldcode = code;
5681 code = previous; /* Usually overwrite previous item */
5682
5683 /* If the maximum is zero then the minimum must also be zero; Perl allows
5684 this case, so we do too - by simply omitting the item altogether. */
5685
5686 if (repeat_max == 0) goto END_REPEAT;
5687
5688 /* Combine the op_type with the repeat_type */
5689
5690 repeat_type += op_type;
5691
5692 /* A minimum of zero is handled either as the special case * or ?, or as
5693 an UPTO, with the maximum given. */
5694
5695 if (repeat_min == 0)
5696 {
5697 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
5698 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
5699 else
5700 {
5701 *code++ = OP_UPTO + repeat_type;
5702 PUT2INC(code, 0, repeat_max);
5703 }
5704 }
5705
5706 /* A repeat minimum of 1 is optimized into some special cases. If the
5707 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
5708 left in place and, if the maximum is greater than 1, we use OP_UPTO with
5709 one less than the maximum. */
5710
5711 else if (repeat_min == 1)
5712 {
5713 if (repeat_max == -1)
5714 *code++ = OP_PLUS + repeat_type;
5715 else
5716 {
5717 code = oldcode; /* leave previous item in place */
5718 if (repeat_max == 1) goto END_REPEAT;
5719 *code++ = OP_UPTO + repeat_type;
5720 PUT2INC(code, 0, repeat_max - 1);
5721 }
5722 }
5723
5724 /* The case {n,n} is just an EXACT, while the general case {n,m} is
5725 handled as an EXACT followed by an UPTO. */
5726
5727 else
5728 {
5729 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
5730 PUT2INC(code, 0, repeat_min);
5731
5732 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
5733 we have to insert the character for the previous code. For a repeated
5734 Unicode property match, there are two extra bytes that define the
5735 required property. In UTF-8 mode, long characters have their length in
5736 c, with the UTF_LENGTH bit as a flag. */
5737
5738 if (repeat_max < 0)
5739 {
5740 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5741 if (utf && (c & UTF_LENGTH) != 0)
5742 {
5743 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5744 code += c & 7;
5745 }
5746 else
5747 #endif
5748 {
5749 *code++ = c;
5750 if (prop_type >= 0)
5751 {
5752 *code++ = prop_type;
5753 *code++ = prop_value;
5754 }
5755 }
5756 *code++ = OP_STAR + repeat_type;
5757 }
5758
5759 /* Else insert an UPTO if the max is greater than the min, again
5760 preceded by the character, for the previously inserted code. If the
5761 UPTO is just for 1 instance, we can use QUERY instead. */
5762
5763 else if (repeat_max != repeat_min)
5764 {
5765 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5766 if (utf && (c & UTF_LENGTH) != 0)
5767 {
5768 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5769 code += c & 7;
5770 }
5771 else
5772 #endif
5773 *code++ = c;
5774 if (prop_type >= 0)
5775 {
5776 *code++ = prop_type;
5777 *code++ = prop_value;
5778 }
5779 repeat_max -= repeat_min;
5780
5781 if (repeat_max == 1)
5782 {
5783 *code++ = OP_QUERY + repeat_type;
5784 }
5785 else
5786 {
5787 *code++ = OP_UPTO + repeat_type;
5788 PUT2INC(code, 0, repeat_max);
5789 }
5790 }
5791 }
5792
5793 /* The character or character type itself comes last in all cases. */
5794
5795 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5796 if (utf && (c & UTF_LENGTH) != 0)
5797 {
5798 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5799 code += c & 7;
5800 }
5801 else
5802 #endif
5803 *code++ = c;
5804
5805 /* For a repeated Unicode property match, there are two extra bytes that
5806 define the required property. */
5807
5808 #ifdef SUPPORT_UCP
5809 if (prop_type >= 0)
5810 {
5811 *code++ = prop_type;
5812 *code++ = prop_value;
5813 }
5814 #endif
5815 }
5816
5817 /* If previous was a character class or a back reference, we put the repeat
5818 stuff after it, but just skip the item if the repeat was {0,0}. */
5819
5820 else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
5821 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5822 *previous == OP_XCLASS ||
5823 #endif
5824 *previous == OP_REF || *previous == OP_REFI ||
5825 *previous == OP_DNREF || *previous == OP_DNREFI)
5826 {
5827 if (repeat_max == 0)
5828 {
5829 code = previous;
5830 goto END_REPEAT;
5831 }
5832
5833 if (repeat_min == 0 && repeat_max == -1)
5834 *code++ = OP_CRSTAR + repeat_type;
5835 else if (repeat_min == 1 && repeat_max == -1)
5836 *code++ = OP_CRPLUS + repeat_type;
5837 else if (repeat_min == 0 && repeat_max == 1)
5838 *code++ = OP_CRQUERY + repeat_type;
5839 else
5840 {
5841 *code++ = OP_CRRANGE + repeat_type;
5842 PUT2INC(code, 0, repeat_min);
5843 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
5844 PUT2INC(code, 0, repeat_max);
5845 }
5846 }
5847
5848 /* If previous was a bracket group, we may have to replicate it in certain
5849 cases. Note that at this point we can encounter only the "basic" bracket
5850 opcodes such as BRA and CBRA, as this is the place where they get converted
5851 into the more special varieties such as BRAPOS and SBRA. A test for >=
5852 OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5853 ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND.
5854 Originally, PCRE did not allow repetition of assertions, but now it does,
5855 for Perl compatibility. */
5856
5857 else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5858 {
5859 register int i;
5860 int len = (int)(code - previous);
5861 pcre_uchar *bralink = NULL;
5862 pcre_uchar *brazeroptr = NULL;
5863
5864 /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
5865 we just ignore the repeat. */
5866
5867 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
5868 goto END_REPEAT;
5869
5870 /* There is no sense in actually repeating assertions. The only potential
5871 use of repetition is in cases when the assertion is optional. Therefore,
5872 if the minimum is greater than zero, just ignore the repeat. If the
5873 maximum is not zero or one, set it to 1. */
5874
5875 if (*previous < OP_ONCE) /* Assertion */
5876 {
5877 if (repeat_min > 0) goto END_REPEAT;
5878 if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
5879 }
5880
5881 /* The case of a zero minimum is special because of the need to stick
5882 OP_BRAZERO in front of it, and because the group appears once in the
5883 data, whereas in other cases it appears the minimum number of times. For
5884 this reason, it is simplest to treat this case separately, as otherwise
5885 the code gets far too messy. There are several special subcases when the
5886 minimum is zero. */
5887
5888 if (repeat_min == 0)
5889 {
5890 /* If the maximum is also zero, we used to just omit the group from the
5891 output altogether, like this:
5892
5893 ** if (repeat_max == 0)
5894 ** {
5895 ** code = previous;
5896 ** goto END_REPEAT;
5897 ** }
5898
5899 However, that fails when a group or a subgroup within it is referenced
5900 as a subroutine from elsewhere in the pattern, so now we stick in
5901 OP_SKIPZERO in front of it so that it is skipped on execution. As we
5902 don't have a list of which groups are referenced, we cannot do this
5903 selectively.
5904
5905 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
5906 and do no more at this point. However, we do need to adjust any
5907 OP_RECURSE calls inside the group that refer to the group itself or any
5908 internal or forward referenced group, because the offset is from the
5909 start of the whole regex. Temporarily terminate the pattern while doing
5910 this. */
5911
5912 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
5913 {
5914 *code = OP_END;
5915 adjust_recurse(previous, 1, utf, cd, save_hwm);
5916 memmove(previous + 1, previous, IN_UCHARS(len));
5917 code++;
5918 if (repeat_max == 0)
5919 {
5920 *previous++ = OP_SKIPZERO;
5921 goto END_REPEAT;
5922 }
5923 brazeroptr = previous; /* Save for possessive optimizing */
5924 *previous++ = OP_BRAZERO + repeat_type;
5925 }
5926
5927 /* If the maximum is greater than 1 and limited, we have to replicate
5928 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
5929 The first one has to be handled carefully because it's the original
5930 copy, which has to be moved up. The remainder can be handled by code
5931 that is common with the non-zero minimum case below. We have to
5932 adjust the value or repeat_max, since one less copy is required. Once
5933 again, we may have to adjust any OP_RECURSE calls inside the group. */
5934
5935 else
5936 {
5937 int offset;
5938 *code = OP_END;
5939 adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);
5940 memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
5941 code += 2 + LINK_SIZE;
5942 *previous++ = OP_BRAZERO + repeat_type;
5943 *previous++ = OP_BRA;
5944
5945 /* We chain together the bracket offset fields that have to be
5946 filled in later when the ends of the brackets are reached. */
5947
5948 offset = (bralink == NULL)? 0 : (int)(previous - bralink);
5949 bralink = previous;
5950 PUTINC(previous, 0, offset);
5951 }
5952
5953 repeat_max--;
5954 }
5955
5956 /* If the minimum is greater than zero, replicate the group as many
5957 times as necessary, and adjust the maximum to the number of subsequent
5958 copies that we need. If we set a first char from the group, and didn't
5959 set a required char, copy the latter from the former. If there are any
5960 forward reference subroutine calls in the group, there will be entries on
5961 the workspace list; replicate these with an appropriate increment. */
5962
5963 else
5964 {
5965 if (repeat_min > 1)
5966 {
5967 /* In the pre-compile phase, we don't actually do the replication. We
5968 just adjust the length as if we had. Do some paranoid checks for
5969 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
5970 integer type when available, otherwise double. */
5971
5972 if (lengthptr != NULL)
5973 {
5974 int delta = (repeat_min - 1)*length_prevgroup;
5975 if ((INT64_OR_DOUBLE)(repeat_min - 1)*
5976 (INT64_OR_DOUBLE)length_prevgroup >
5977 (INT64_OR_DOUBLE)INT_MAX ||
5978 OFLOW_MAX - *lengthptr < delta)
5979 {
5980 *errorcodeptr = ERR20;
5981 goto FAILED;
5982 }
5983 *lengthptr += delta;
5984 }
5985
5986 /* This is compiling for real. If there is a set first byte for
5987 the group, and we have not yet set a "required byte", set it. Make
5988 sure there is enough workspace for copying forward references before
5989 doing the copy. */
5990
5991 else
5992 {
5993 if (groupsetfirstchar && reqcharflags < 0)
5994 {
5995 reqchar = firstchar;
5996 reqcharflags = firstcharflags;
5997 }
5998
5999 for (i = 1; i < repeat_min; i++)
6000 {
6001 pcre_uchar *hc;
6002 pcre_uchar *this_hwm = cd->hwm;
6003 memcpy(code, previous, IN_UCHARS(len));
6004
6005 while (cd->hwm > cd->start_workspace + cd->workspace_size -
6006 WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
6007 {
6008 int save_offset = save_hwm - cd->start_workspace;
6009 int this_offset = this_hwm - cd->start_workspace;
6010 *errorcodeptr = expand_workspace(cd);
6011 if (*errorcodeptr != 0) goto FAILED;
6012 save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
6013 this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
6014 }
6015
6016 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
6017 {
6018 PUT(cd->hwm, 0, GET(hc, 0) + len);
6019 cd->hwm += LINK_SIZE;
6020 }
6021 save_hwm = this_hwm;
6022 code += len;
6023 }
6024 }
6025 }
6026
6027 if (repeat_max > 0) repeat_max -= repeat_min;
6028 }
6029
6030 /* This code is common to both the zero and non-zero minimum cases. If
6031 the maximum is limited, it replicates the group in a nested fashion,
6032 remembering the bracket starts on a stack. In the case of a zero minimum,
6033 the first one was set up above. In all cases the repeat_max now specifies
6034 the number of additional copies needed. Again, we must remember to
6035 replicate entries on the forward reference list. */
6036
6037 if (repeat_max >= 0)
6038 {
6039 /* In the pre-compile phase, we don't actually do the replication. We
6040 just adjust the length as if we had. For each repetition we must add 1
6041 to the length for BRAZERO and for all but the last repetition we must
6042 add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
6043 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
6044 a 64-bit integer type when available, otherwise double. */
6045
6046 if (lengthptr != NULL && repeat_max > 0)
6047 {
6048 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
6049 2 - 2*LINK_SIZE; /* Last one doesn't nest */
6050 if ((INT64_OR_DOUBLE)repeat_max *
6051 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
6052 > (INT64_OR_DOUBLE)INT_MAX ||
6053 OFLOW_MAX - *lengthptr < delta)
6054 {
6055 *errorcodeptr = ERR20;
6056 goto FAILED;
6057 }
6058 *lengthptr += delta;
6059 }
6060
6061 /* This is compiling for real */
6062
6063 else for (i = repeat_max - 1; i >= 0; i--)
6064 {
6065 pcre_uchar *hc;
6066 pcre_uchar *this_hwm = cd->hwm;
6067
6068 *code++ = OP_BRAZERO + repeat_type;
6069
6070 /* All but the final copy start a new nesting, maintaining the
6071 chain of brackets outstanding. */
6072
6073 if (i != 0)
6074 {
6075 int offset;
6076 *code++ = OP_BRA;
6077 offset = (bralink == NULL)? 0 : (int)(code - bralink);
6078 bralink = code;
6079 PUTINC(code, 0, offset);
6080 }
6081
6082 memcpy(code, previous, IN_UCHARS(len));
6083
6084 /* Ensure there is enough workspace for forward references before
6085 copying them. */
6086
6087 while (cd->hwm > cd->start_workspace + cd->workspace_size -
6088 WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
6089 {
6090 int save_offset = save_hwm - cd->start_workspace;
6091 int this_offset = this_hwm - cd->start_workspace;
6092 *errorcodeptr = expand_workspace(cd);
6093 if (*errorcodeptr != 0) goto FAILED;
6094 save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
6095 this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
6096 }
6097
6098 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
6099 {
6100 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
6101 cd->hwm += LINK_SIZE;
6102 }
6103 save_hwm = this_hwm;
6104 code += len;
6105 }
6106
6107 /* Now chain through the pending brackets, and fill in their length
6108 fields (which are holding the chain links pro tem). */
6109
6110 while (bralink != NULL)
6111 {
6112 int oldlinkoffset;
6113 int offset = (int)(code - bralink + 1);
6114 pcre_uchar *bra = code - offset;
6115 oldlinkoffset = GET(bra, 1);
6116 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
6117 *code++ = OP_KET;
6118 PUTINC(code, 0, offset);
6119 PUT(bra, 1, offset);
6120 }
6121 }
6122
6123 /* If the maximum is unlimited, set a repeater in the final copy. For
6124 ONCE brackets, that's all we need to do. However, possessively repeated
6125 ONCE brackets can be converted into non-capturing brackets, as the
6126 behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
6127 deal with possessive ONCEs specially.
6128
6129 Otherwise, when we are doing the actual compile phase, check to see
6130 whether this group is one that could match an empty string. If so,
6131 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
6132 that runtime checking can be done. [This check is also applied to ONCE
6133 groups at runtime, but in a different way.]
6134
6135 Then, if the quantifier was possessive and the bracket is not a
6136 conditional, we convert the BRA code to the POS form, and the KET code to
6137 KETRPOS. (It turns out to be convenient at runtime to detect this kind of
6138 subpattern at both the start and at the end.) The use of special opcodes
6139 makes it possible to reduce greatly the stack usage in pcre_exec(). If
6140 the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
6141
6142 Then, if the minimum number of matches is 1 or 0, cancel the possessive
6143 flag so that the default action below, of wrapping everything inside
6144 atomic brackets, does not happen. When the minimum is greater than 1,
6145 there will be earlier copies of the group, and so we still have to wrap
6146 the whole thing. */
6147
6148 else
6149 {
6150 pcre_uchar *ketcode = code - 1 - LINK_SIZE;
6151 pcre_uchar *bracode = ketcode - GET(ketcode, 1);
6152
6153 /* Convert possessive ONCE brackets to non-capturing */
6154
6155 if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
6156 possessive_quantifier) *bracode = OP_BRA;
6157
6158 /* For non-possessive ONCE brackets, all we need to do is to
6159 set the KET. */
6160
6161 if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
6162 *ketcode = OP_KETRMAX + repeat_type;
6163
6164 /* Handle non-ONCE brackets and possessive ONCEs (which have been
6165 converted to non-capturing above). */
6166
6167 else
6168 {
6169 /* In the compile phase, check for empty string matching. */
6170
6171 if (lengthptr == NULL)
6172 {
6173 pcre_uchar *scode = bracode;
6174 do
6175 {
6176 if (could_be_empty_branch(scode, ketcode, utf, cd, NULL))
6177 {
6178 *bracode += OP_SBRA - OP_BRA;
6179 break;
6180 }
6181 scode += GET(scode, 1);
6182 }
6183 while (*scode == OP_ALT);
6184 }
6185
6186 /* Handle possessive quantifiers. */
6187
6188 if (possessive_quantifier)
6189 {
6190 /* For COND brackets, we wrap the whole thing in a possessively
6191 repeated non-capturing bracket, because we have not invented POS
6192 versions of the COND opcodes. Because we are moving code along, we
6193 must ensure that any pending recursive references are updated. */
6194
6195 if (*bracode == OP_COND || *bracode == OP_SCOND)
6196 {
6197 int nlen = (int)(code - bracode);
6198 *code = OP_END;
6199 adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm);
6200 memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
6201 code += 1 + LINK_SIZE;
6202 nlen += 1 + LINK_SIZE;
6203 *bracode = OP_BRAPOS;
6204 *code++ = OP_KETRPOS;
6205 PUTINC(code, 0, nlen);
6206 PUT(bracode, 1, nlen);
6207 }
6208
6209 /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
6210
6211 else
6212 {
6213 *bracode += 1; /* Switch to xxxPOS opcodes */
6214 *ketcode = OP_KETRPOS;
6215 }
6216
6217 /* If the minimum is zero, mark it as possessive, then unset the
6218 possessive flag when the minimum is 0 or 1. */
6219
6220 if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
6221 if (repeat_min < 2) possessive_quantifier = FALSE;
6222 }
6223
6224 /* Non-possessive quantifier */
6225
6226 else *ketcode = OP_KETRMAX + repeat_type;
6227 }
6228 }
6229 }
6230
6231 /* If previous is OP_FAIL, it was generated by an empty class [] in
6232 JavaScript mode. The other ways in which OP_FAIL can be generated, that is
6233 by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
6234 error above. We can just ignore the repeat in JS case. */
6235
6236 else if (*previous == OP_FAIL) goto END_REPEAT;
6237
6238 /* Else there's some kind of shambles */
6239
6240 else
6241 {
6242 *errorcodeptr = ERR11;
6243 goto FAILED;
6244 }
6245
6246 /* If the character following a repeat is '+', possessive_quantifier is
6247 TRUE. For some opcodes, there are special alternative opcodes for this
6248 case. For anything else, we wrap the entire repeated item inside OP_ONCE
6249 brackets. Logically, the '+' notation is just syntactic sugar, taken from
6250 Sun's Java package, but the special opcodes can optimize it.
6251
6252 Some (but not all) possessively repeated subpatterns have already been
6253 completely handled in the code just above. For them, possessive_quantifier
6254 is always FALSE at this stage. Note that the repeated item starts at
6255 tempcode, not at previous, which might be the first part of a string whose
6256 (former) last char we repeated. */
6257
6258 if (possessive_quantifier)
6259 {
6260 int len;
6261
6262 /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
6263 However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
6264 {5,}, or {5,10}). We skip over an EXACT item; if the length of what
6265 remains is greater than zero, there's a further opcode that can be
6266 handled. If not, do nothing, leaving the EXACT alone. */
6267
6268 switch(*tempcode)
6269 {
6270 case OP_TYPEEXACT:
6271 tempcode += PRIV(OP_lengths)[*tempcode] +
6272 ((tempcode[1 + IMM2_SIZE] == OP_PROP
6273 || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
6274 break;
6275
6276 /* CHAR opcodes are used for exacts whose count is 1. */
6277
6278 case OP_CHAR:
6279 case OP_CHARI:
6280 case OP_NOT:
6281 case OP_NOTI:
6282 case OP_EXACT:
6283 case OP_EXACTI:
6284 case OP_NOTEXACT:
6285 case OP_NOTEXACTI:
6286 tempcode += PRIV(OP_lengths)[*tempcode];
6287 #ifdef SUPPORT_UTF
6288 if (utf && HAS_EXTRALEN(tempcode[-1]))
6289 tempcode += GET_EXTRALEN(tempcode[-1]);
6290 #endif
6291 break;
6292
6293 /* For the class opcodes, the repeat operator appears at the end;
6294 adjust tempcode to point to it. */
6295
6296 case OP_CLASS:
6297 case OP_NCLASS:
6298 tempcode += 1 + 32/sizeof(pcre_uchar);
6299 break;
6300
6301 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6302 case OP_XCLASS:
6303 tempcode += GET(tempcode, 1);
6304 break;
6305 #endif
6306 }
6307
6308 /* If tempcode is equal to code (which points to the end of the repeated
6309 item), it means we have skipped an EXACT item but there is no following
6310 QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
6311 all other cases, tempcode will be pointing to the repeat opcode, and will
6312 be less than code, so the value of len will be greater than 0. */
6313
6314 len = (int)(code - tempcode);
6315 if (len > 0)
6316 {
6317 unsigned int repcode = *tempcode;
6318
6319 /* There is a table for possessifying opcodes, all of which are less
6320 than OP_CALLOUT. A zero entry means there is no possessified version.
6321 */
6322
6323 if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
6324 *tempcode = opcode_possessify[repcode];
6325
6326 /* For opcode without a special possessified version, wrap the item in
6327 ONCE brackets. Because we are moving code along, we must ensure that any
6328 pending recursive references are updated. */
6329
6330 else
6331 {
6332 *code = OP_END;
6333 adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
6334 memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6335 code += 1 + LINK_SIZE;
6336 len += 1 + LINK_SIZE;
6337 tempcode[0] = OP_ONCE;
6338 *code++ = OP_KET;
6339 PUTINC(code, 0, len);
6340 PUT(tempcode, 1, len);
6341 }
6342 }
6343
6344 #ifdef NEVER
6345 if (len > 0) switch (*tempcode)
6346 {
6347 case OP_STAR: *tempcode = OP_POSSTAR; break;
6348 case OP_PLUS: *tempcode = OP_POSPLUS; break;
6349 case OP_QUERY: *tempcode = OP_POSQUERY; break;
6350 case OP_UPTO: *tempcode = OP_POSUPTO; break;
6351
6352 case OP_STARI: *tempcode = OP_POSSTARI; break;
6353 case OP_PLUSI: *tempcode = OP_POSPLUSI; break;
6354 case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
6355 case OP_UPTOI: *tempcode = OP_POSUPTOI; break;
6356
6357 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
6358 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
6359 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
6360 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
6361
6362 case OP_NOTSTARI: *tempcode = OP_NOTPOSSTARI; break;
6363 case OP_NOTPLUSI: *tempcode = OP_NOTPOSPLUSI; break;
6364 case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
6365 case OP_NOTUPTOI: *tempcode = OP_NOTPOSUPTOI; break;
6366
6367 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
6368 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
6369 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
6370 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
6371
6372 case OP_CRSTAR: *tempcode = OP_CRPOSSTAR; break;
6373 case OP_CRPLUS: *tempcode = OP_CRPOSPLUS; break;
6374 case OP_CRQUERY: *tempcode = OP_CRPOSQUERY; break;
6375 case OP_CRRANGE: *tempcode = OP_CRPOSRANGE; break;
6376
6377 /* Because we are moving code along, we must ensure that any
6378 pending recursive references are updated. */
6379
6380 default:
6381 *code = OP_END;
6382 adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
6383 memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6384 code += 1 + LINK_SIZE;
6385 len += 1 + LINK_SIZE;
6386 tempcode[0] = OP_ONCE;
6387 *code++ = OP_KET;
6388 PUTINC(code, 0, len);
6389 PUT(tempcode, 1, len);
6390 break;
6391 }
6392 #endif
6393 }
6394
6395 /* In all case we no longer have a previous item. We also set the
6396 "follows varying string" flag for subsequently encountered reqchars if
6397 it isn't already set and we have just passed a varying length item. */
6398
6399 END_REPEAT:
6400 previous = NULL;
6401 cd->req_varyopt |= reqvary;
6402 break;
6403
6404
6405 /* ===================================================================*/
6406 /* Start of nested parenthesized sub-expression, or comment or lookahead or
6407 lookbehind or option setting or condition or all the other extended
6408 parenthesis forms. */
6409
6410 case CHAR_LEFT_PARENTHESIS:
6411 newoptions = options;
6412 skipbytes = 0;
6413 bravalue = OP_CBRA;
6414 save_hwm = cd->hwm;
6415 reset_bracount = FALSE;
6416
6417 /* First deal with various "verbs" that can be introduced by '*'. */
6418
6419 ptr++;
6420 if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
6421 || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
6422 {
6423 int i, namelen;
6424 int arglen = 0;
6425 const char *vn = verbnames;
6426 const pcre_uchar *name = ptr + 1;
6427 const pcre_uchar *arg = NULL;
6428 previous = NULL;
6429 ptr++;
6430 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
6431 namelen = (int)(ptr - name);
6432
6433 /* It appears that Perl allows any characters whatsoever, other than
6434 a closing parenthesis, to appear in arguments, so we no longer insist on
6435 letters, digits, and underscores. */
6436
6437 if (*ptr == CHAR_COLON)
6438 {
6439 arg = ++ptr;
6440 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6441 arglen = (int)(ptr - arg);
6442 if ((unsigned int)arglen > MAX_MARK)
6443 {
6444 *errorcodeptr = ERR75;
6445 goto FAILED;
6446 }
6447 }
6448
6449 if (*ptr != CHAR_RIGHT_PARENTHESIS)
6450 {
6451 *errorcodeptr = ERR60;
6452 goto FAILED;
6453 }
6454
6455 /* Scan the table of verb names */
6456
6457 for (i = 0; i < verbcount; i++)
6458 {
6459 if (namelen == verbs[i].len &&
6460 STRNCMP_UC_C8(name, vn, namelen) == 0)
6461 {
6462 int setverb;
6463
6464 /* Check for open captures before ACCEPT and convert it to
6465 ASSERT_ACCEPT if in an assertion. */
6466
6467 if (verbs[i].op == OP_ACCEPT)
6468 {
6469 open_capitem *oc;
6470 if (arglen != 0)
6471 {
6472 *errorcodeptr = ERR59;
6473 goto FAILED;
6474 }
6475 cd->had_accept = TRUE;
6476 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6477 {
6478 *code++ = OP_CLOSE;
6479 PUT2INC(code, 0, oc->number);
6480 }
6481 setverb = *code++ =
6482 (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6483
6484 /* Do not set firstchar after *ACCEPT */
6485 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
6486 }
6487
6488 /* Handle other cases with/without an argument */
6489
6490 else if (arglen == 0)
6491 {
6492 if (verbs[i].op < 0) /* Argument is mandatory */
6493 {
6494 *errorcodeptr = ERR66;
6495 goto FAILED;
6496 }
6497 setverb = *code++ = verbs[i].op;
6498 }
6499
6500 else
6501 {
6502 if (verbs[i].op_arg < 0) /* Argument is forbidden */
6503 {
6504 *errorcodeptr = ERR59;
6505 goto FAILED;
6506 }
6507 setverb = *code++ = verbs[i].op_arg;
6508 *code++ = arglen;
6509 memcpy(code, arg, IN_UCHARS(arglen));
6510 code += arglen;
6511 *code++ = 0;
6512 }
6513
6514 switch (setverb)
6515 {
6516 case OP_THEN:
6517 case OP_THEN_ARG:
6518 cd->external_flags |= PCRE_HASTHEN;
6519 break;
6520
6521 case OP_PRUNE:
6522 case OP_PRUNE_ARG:
6523 case OP_SKIP:
6524 case OP_SKIP_ARG:
6525 cd->had_pruneorskip = TRUE;
6526 break;
6527 }
6528
6529 break; /* Found verb, exit loop */
6530 }
6531
6532 vn += verbs[i].len + 1;
6533 }
6534
6535 if (i < verbcount) continue; /* Successfully handled a verb */
6536 *errorcodeptr = ERR60; /* Verb not recognized */
6537 goto FAILED;
6538 }
6539
6540 /* Deal with the extended parentheses; all are introduced by '?', and the
6541 appearance of any of them means that this is not a capturing group. */
6542
6543 else if (*ptr == CHAR_QUESTION_MARK)
6544 {
6545 int i, set, unset, namelen;
6546 int *optset;
6547 const pcre_uchar *name;
6548 pcre_uchar *slot;
6549
6550 switch (*(++ptr))
6551 {
6552 case CHAR_NUMBER_SIGN: /* Comment; skip to ket */
6553 ptr++;
6554 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6555 if (*ptr == CHAR_NULL)
6556 {
6557 *errorcodeptr = ERR18;
6558 goto FAILED;
6559 }
6560 continue;
6561
6562
6563 /* ------------------------------------------------------------ */
6564 case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
6565 reset_bracount = TRUE;
6566 /* Fall through */
6567
6568 /* ------------------------------------------------------------ */
6569 case CHAR_COLON: /* Non-capturing bracket */
6570 bravalue = OP_BRA;
6571 ptr++;
6572 break;
6573
6574
6575 /* ------------------------------------------------------------ */
6576 case CHAR_LEFT_PARENTHESIS:
6577 bravalue = OP_COND; /* Conditional group */
6578 tempptr = ptr;
6579
6580 /* A condition can be an assertion, a number (referring to a numbered
6581 group's having been set), a name (referring to a named group), or 'R',
6582 referring to recursion. R<digits> and R&name are also permitted for
6583 recursion tests.
6584
6585 There are ways of testing a named group: (?(name)) is used by Python;
6586 Perl 5.10 onwards uses (?(<name>) or (?('name')).
6587
6588 There is one unfortunate ambiguity, caused by history. 'R' can be the
6589 recursive thing or the name 'R' (and similarly for 'R' followed by
6590 digits). We look for a name first; if not found, we try the other case.
6591
6592 For compatibility with auto-callouts, we allow a callout to be
6593 specified before a condition that is an assertion. First, check for the
6594 syntax of a callout; if found, adjust the temporary pointer that is
6595 used to check for an assertion condition. That's all that is needed! */
6596
6597 if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
6598 {
6599 for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
6600 if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
6601 tempptr += i + 1;
6602 }
6603
6604 /* For conditions that are assertions, check the syntax, and then exit
6605 the switch. This will take control down to where bracketed groups,
6606 including assertions, are processed. */
6607
6608 if (tempptr[1] == CHAR_QUESTION_MARK &&
6609 (tempptr[2] == CHAR_EQUALS_SIGN ||
6610 tempptr[2] == CHAR_EXCLAMATION_MARK ||
6611 tempptr[2] == CHAR_LESS_THAN_SIGN))
6612 break;
6613
6614 /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6615 need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6616
6617 code[1+LINK_SIZE] = OP_CREF;
6618 skipbytes = 1+IMM2_SIZE;
6619 refsign = -1; /* => not a number */
6620 namelen = -1; /* => not a name; must set to avoid warning */
6621 name = NULL; /* Always set to avoid warning */
6622 recno = 0; /* Always set to avoid warning */
6623
6624 /* Check for a test for recursion in a named group. */
6625
6626 ptr++;
6627 if (*ptr == CHAR_R && ptr[1] == CHAR_AMPERSAND)
6628 {
6629 terminator = -1;
6630 ptr += 2;
6631 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
6632 }
6633
6634 /* Check for a test for a named group's having been set, using the Perl
6635 syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6636 syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). */
6637
6638 else if (*ptr == CHAR_LESS_THAN_SIGN)
6639 {
6640 terminator = CHAR_GREATER_THAN_SIGN;
6641 ptr++;
6642 }
6643 else if (*ptr == CHAR_APOSTROPHE)
6644 {
6645 terminator = CHAR_APOSTROPHE;
6646 ptr++;
6647 }
6648 else
6649 {
6650 terminator = CHAR_NULL;
6651 if (*ptr == CHAR_MINUS || *ptr == CHAR_PLUS) refsign = *ptr++;
6652 else if (IS_DIGIT(*ptr)) refsign = 0;
6653 }
6654
6655 /* Handle a number */
6656
6657 if (refsign >= 0)
6658 {
6659 while (IS_DIGIT(*ptr))
6660 {
6661