/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1480 - (show annotations)
Tue May 27 18:02:51 2014 UTC (5 years, 6 months ago) by ph10
File MIME type: text/plain
File size: 311302 byte(s)
Fix auto-possessification bug.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2014 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
57 is also used by pcretest. PCRE_DEBUG is not defined when building a production
58 library. We do not need to select pcre16_printint.c specially, because the
59 COMPILE_PCREx macro will already be appropriately set. */
60
61 #ifdef PCRE_DEBUG
62 /* pcre_printint.c should not include any headers */
63 #define PCRE_INCLUDED
64 #include "pcre_printint.c"
65 #undef PCRE_INCLUDED
66 #endif
67
68
/* Macro for setting individual bits in class bitmaps: SETBIT(a,b) sets bit
number b in the map a, treating a as a vector of 8-bit elements (element b/8,
bit b&7 within it). The map argument and the whole expansion are parenthesized
so that a pointer expression such as "map + offset" can be passed and the
macro can be used wherever an expression is allowed. The bit mask is built from
an unsigned 1. Note that b is evaluated twice, so it must not have side
effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1u << ((b)&7)))
72
73 /* Maximum length value to check against when making sure that the integer that
74 holds the compiled pattern length does not overflow. We make it a bit less than
75 INT_MAX to allow for adding in group terminating bytes, so that we don't have
76 to check them every time. */
77
78 #define OFLOW_MAX (INT_MAX - 20) /* INT_MAX less 20 units of headroom for group terminators; INT_MAX is expected from <limits.h> -- TODO confirm it arrives via pcre_internal.h */
79
80 /* Definitions to allow mutual recursion */
81
/* Prototype only: add_list_to_class() is declared here so that it can be
called before its definition, which is not in this part of the file. It is
static, i.e. private to this translation unit, and returns an int. */
82 static int
83 add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
84 const pcre_uint32 *, unsigned int);
85
/* Prototype only: compile_regex() is likewise declared ahead of its
definition (not in this part of the file). It is static and returns a BOOL.
The parameters are unnamed here; see the definition for their meaning. */
86 static BOOL
87 compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
88 pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
89 compile_data *, int *);
90
91
92
93 /*************************************************
94 * Code parameters and static tables *
95 *************************************************/
96
97 /* This value specifies the size of stack workspace that is used during the
98 first pre-compile phase that determines how much memory is required. The regex
99 is partly compiled into this space, but the compiled parts are discarded as
100 soon as they can be, so that hopefully there will never be an overrun. The code
101 does, however, check for an overrun. The largest amount I've seen used is 218,
102 so this number is very generous.
103
104 The same workspace is used during the second, actual compile phase for
105 remembering forward references to groups so that they can be filled in at the
106 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
107 is 4 there is plenty of room for most patterns. However, the memory can get
108 filled up by repetitions of forward references, for example patterns like
109 /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
110 that the workspace is expanded using malloc() in this situation. The value
111 below is therefore a minimum, and we put a maximum on it for safety. The
112 minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
113 kicks in at the same number of forward references in all cases. */
114
115 #define COMPILE_WORK_SIZE (2048*LINK_SIZE) /* minimum workspace: room for 2048 entries of LINK_SIZE bytes each */
116 #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE) /* safety cap on the malloc()-expanded workspace: 100 times the minimum */
117
118 /* This value determines the size of the initial vector that is used for
119 remembering named groups during the pre-compile. It is allocated on the stack,
120 but if it is too small, it is expanded using malloc(), in a similar way to the
121 workspace. The value is the number of slots in the list. */
122
123 #define NAMED_GROUP_LIST_SIZE 20 /* initial number of named-group slots, before any malloc() expansion */
124
125 /* The overrun tests check for a slightly smaller size so that they detect the
126 overrun before it actually does run off the end of the data block. */
127
128 #define WORK_SIZE_SAFETY_MARGIN (100) /* how much smaller than the real size the overrun tests assume; the unit is not visible here -- TODO confirm against the tests */
129
130 /* Private flags added to firstchar and reqchar. */
131
132 #define REQ_CASELESS (1 << 0) /* Indicates caselessness */
133 #define REQ_VARY (1 << 1) /* Reqchar followed non-literal item */
134 /* Negative values for the firstchar and reqchar flags */
135 #define REQ_UNSET (-2) /* presumably "no value determined yet" -- TODO confirm against the code that tests it */
136 #define REQ_NONE (-1) /* presumably "there is no such character" -- TODO confirm against the code that tests it */
137
138 /* Repeated character flags. */
139
/* The char contains its length. The value has type long; the suffix is an
upper-case L because a lower-case l is easily misread as the digit 1. */
#define UTF_LENGTH 0x10000000L
141
142 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
143 are simple data values; negative values are for special things like \d and so
144 on. Zero means further processing is needed (for things like \x), or the escape
145 is invalid. */
146
147 #ifndef EBCDIC
148
149 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
150 in UTF-8 mode. */
151
/* Entry i describes the escape of the character CHAR_0 + i, covering '0'
through 'z' (75 entries, two per line). The trailing comment on each line names
the two characters whose entries it holds. */
152 static const short int escapes[] = {
153 0, 0, /* '0' '1' */
154 0, 0, /* '2' '3' */
155 0, 0, /* '4' '5' */
156 0, 0, /* '6' '7' */
157 0, 0, /* '8' '9' */
158 CHAR_COLON, CHAR_SEMICOLON, /* ':' ';' */
159 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, /* '<' '=' */
160 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK, /* '>' and question mark */
161 CHAR_COMMERCIAL_AT, -ESC_A, /* '@' 'A' */
162 -ESC_B, -ESC_C, /* 'B' 'C' */
163 -ESC_D, -ESC_E, /* 'D' 'E' */
164 0, -ESC_G, /* 'F' 'G' */
165 -ESC_H, 0, /* 'H' 'I' */
166 0, -ESC_K, /* 'J' 'K' */
167 0, 0, /* 'L' 'M' */
168 -ESC_N, 0, /* 'N' 'O' */
169 -ESC_P, -ESC_Q, /* 'P' 'Q' */
170 -ESC_R, -ESC_S, /* 'R' 'S' */
171 0, 0, /* 'T' 'U' */
172 -ESC_V, -ESC_W, /* 'V' 'W' */
173 -ESC_X, 0, /* 'X' 'Y' */
174 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, /* 'Z' '[' */
175 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, /* backslash and ']' */
176 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, /* '^' '_' */
177 CHAR_GRAVE_ACCENT, 7, /* '`' 'a' (7 is the data value for 'a': ASCII BEL) */
178 -ESC_b, 0, /* 'b' 'c' */
179 -ESC_d, ESC_e, /* 'd' 'e' */
180 ESC_f, 0, /* 'f' 'g' */
181 -ESC_h, 0, /* 'h' 'i' */
182 0, -ESC_k, /* 'j' 'k' */
183 0, 0, /* 'l' 'm' */
184 ESC_n, 0, /* 'n' 'o' */
185 -ESC_p, 0, /* 'p' 'q' */
186 ESC_r, -ESC_s, /* 'r' 's' */
187 ESC_tee, 0, /* 't' 'u' */
188 -ESC_v, -ESC_w, /* 'v' 'w' */
189 0, 0, /* 'x' 'y' */
190 -ESC_z /* 'z' */
191 };
192
193 #else
194
195 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
196
/* Each row holds 8 consecutive entries; the comment at the start of a row is
the hexadecimal code point of the row's first entry, running from 0x48 in the
first row to 0xF8 in the last (23 rows). Presumably the table is indexed by
(character - 0x48) -- TODO confirm against the lookup code. */
197 static const short int escapes[] = {
198 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
199 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
200 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
201 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
202 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
203 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
204 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
205 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
206 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
207 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
208 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
209 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
210 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
211 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
212 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
213 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
214 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
215 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
216 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
217 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
218 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
219 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
220 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
221 };
222 #endif
223
224
225 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
226 searched linearly. Put all the names into a single string, in order to reduce
227 the number of relocations when a shared library is dynamically linked. The
228 string is built from string macros so that it works in UTF-8 mode on EBCDIC
229 platforms. */
230
/* One entry of the verb table: the length of the verb's name plus the opcode
to use without and with an argument (-1 marks the form that is not valid). */
231 typedef struct verbitem {
232 int len; /* Length of verb name */
233 int op; /* Op when no arg, or -1 if arg mandatory */
234 int op_arg; /* Op when arg present, or -1 if not allowed */
235 } verbitem;
236
/* All verb names concatenated into one string. Going by the macro names, each
STRING_xxx0 piece supplies one name followed by a zero byte -- TODO confirm in
pcre_internal.h. */
237 static const char verbnames[] =
238 "\0" /* Empty name is a shorthand for MARK */
239 STRING_MARK0
240 STRING_ACCEPT0
241 STRING_COMMIT0
242 STRING_F0
243 STRING_FAIL0
244 STRING_PRUNE0
245 STRING_SKIP0
246 STRING_THEN;
247
/* One row per name in verbnames, in the same order; each len field equals the
length of the corresponding name. */
248 static const verbitem verbs[] = {
249 { 0, -1, OP_MARK }, /* (empty name): argument mandatory */
250 { 4, -1, OP_MARK }, /* MARK: argument mandatory */
251 { 6, OP_ACCEPT, -1 }, /* ACCEPT: argument not allowed */
252 { 6, OP_COMMIT, -1 }, /* COMMIT: argument not allowed */
253 { 1, OP_FAIL, -1 }, /* F: argument not allowed */
254 { 4, OP_FAIL, -1 }, /* FAIL: argument not allowed */
255 { 5, OP_PRUNE, OP_PRUNE_ARG }, /* PRUNE: argument optional */
256 { 4, OP_SKIP, OP_SKIP_ARG }, /* SKIP: argument optional */
257 { 4, OP_THEN, OP_THEN_ARG } /* THEN: argument optional */
258 };
259
260 static const int verbcount = sizeof(verbs)/sizeof(verbitem); /* number of rows in verbs[] (9 as listed) */
261
262
263 /* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
264 another regex library. */
265
/* Reading the CHAR_xxx names literally, this spells the zero-terminated
pattern text \b(?=\w) : a word boundary followed by a word character. */
266 static const pcre_uchar sub_start_of_word[] = {
267 CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
268 CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
269
/* Reading the CHAR_xxx names literally, this spells the zero-terminated
pattern text \b(?<=\w) : a word boundary preceded by a word character. */
270 static const pcre_uchar sub_end_of_word[] = {
271 CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
272 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
273 CHAR_RIGHT_PARENTHESIS, '\0' };
274
275
276 /* Tables of names of POSIX character classes and their lengths. The names are
277 now all in a single string, to reduce the number of relocations when a shared
278 library is dynamically loaded. The list of lengths is terminated by a zero
279 length entry. The first three must be alpha, lower, upper, as this is assumed
280 for handling case independence. The indices for graph, print, and punct are
281 needed, so identify them. */
282
/* Class names in index order: 0 alpha, 1 lower, 2 upper, 3 alnum, 4 ascii,
5 blank, 6 cntrl, 7 digit, 8 graph, 9 print, 10 punct, 11 space, 12 word,
13 xdigit. */
283 static const char posix_names[] =
284 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
285 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
286 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
287 STRING_word0 STRING_xdigit;
288
/* Name lengths in the same order: twelve 5-character names, then 4 for "word"
and 6 for "xdigit", then the terminating 0. */
289 static const pcre_uint8 posix_name_lengths[] = {
290 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
291
292 #define PC_GRAPH 8 /* index of "graph" in the lists above */
293 #define PC_PRINT 9 /* index of "print" in the lists above */
294 #define PC_PUNCT 10 /* index of "punct" in the lists above */
295
296
297 /* Table of class bit maps for each POSIX class. Each class is formed from a
298 base map, with an optional addition or removal of another map. Then, for some
299 classes, there is some additional tweaking: for [:blank:] the vertical space
300 characters are removed, and for [:alpha:] and [:alnum:] the underscore
301 character is removed. The triples in the table consist of the base map offset,
302 second map offset or -1 if no second map, and a non-negative value for map
303 addition or a negative value for map subtraction (if there are two maps). The
304 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
305 remove vertical space characters, 2 => remove underscore. */
306
/* One triple per POSIX class, in posix_names order: base map, second map
(or -1), and the signed add/subtract-plus-tweak value. */
307 static const int posix_class_maps[] = {
308 cbit_word, cbit_digit, -2, /* alpha - word minus digit, without underscore */
309 cbit_lower, -1, 0, /* lower */
310 cbit_upper, -1, 0, /* upper */
311 cbit_word, -1, 2, /* alnum - word without underscore */
312 cbit_print, cbit_cntrl, 0, /* ascii - print plus cntrl */
313 cbit_space, -1, 1, /* blank - a GNU extension; space without vertical space */
314 cbit_cntrl, -1, 0, /* cntrl */
315 cbit_digit, -1, 0, /* digit */
316 cbit_graph, -1, 0, /* graph */
317 cbit_print, -1, 0, /* print */
318 cbit_punct, -1, 0, /* punct */
319 cbit_space, -1, 0, /* space */
320 cbit_word, -1, 0, /* word - a Perl extension */
321 cbit_xdigit,-1, 0 /* xdigit */
322 };
323
324 /* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
325 Unicode property escapes. */
326
327 #ifdef SUPPORT_UCP
/* Each string_* array holds a zero-terminated piece of pattern text. The
comment after its last line shows the text it spells, reading the CHAR_xxx
names literally. */
328 static const pcre_uchar string_PNd[] = {
329 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
330 CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* spells \P{Nd} */
331 static const pcre_uchar string_pNd[] = {
332 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
333 CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* spells \p{Nd} */
334 static const pcre_uchar string_PXsp[] = {
335 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
336 CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* spells \P{Xsp} */
337 static const pcre_uchar string_pXsp[] = {
338 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
339 CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* spells \p{Xsp} */
340 static const pcre_uchar string_PXwd[] = {
341 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
342 CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* spells \P{Xwd} */
343 static const pcre_uchar string_pXwd[] = {
344 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
345 CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* spells \p{Xwd} */
346
/* Replacement text for each escape, in the order \D \d \S \s \W \w: the
negated escape comes first in each pair. */
347 static const pcre_uchar *substitutes[] = {
348 string_PNd, /* \D */
349 string_pNd, /* \d */
350 string_PXsp, /* \S */ /* Xsp is Perl space, but from 8.34, Perl */
351 string_pXsp, /* \s */ /* space and POSIX space are the same. */
352 string_PXwd, /* \W */
353 string_pXwd /* \w */
354 };
355
356 /* The POSIX class substitutes must be in the order of the POSIX class names,
357 defined above, and there are both positive and negative cases. NULL means no
358 general substitute of a Unicode property escape (\p or \P). However, for some
359 POSIX classes (e.g. graph, print, punct) a special property code is compiled
360 directly. */
361
/* Each string_* array holds a zero-terminated piece of pattern text. The
comment after its last line shows the text it spells, reading the CHAR_xxx
names literally. */
362 static const pcre_uchar string_pL[] = {
363 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
364 CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* spells \p{L} */
365 static const pcre_uchar string_pLl[] = {
366 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
367 CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* spells \p{Ll} */
368 static const pcre_uchar string_pLu[] = {
369 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
370 CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* spells \p{Lu} */
371 static const pcre_uchar string_pXan[] = {
372 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
373 CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* spells \p{Xan} */
374 static const pcre_uchar string_h[] = {
375 CHAR_BACKSLASH, CHAR_h, '\0' }; /* spells \h */
376 static const pcre_uchar string_pXps[] = {
377 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
378 CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* spells \p{Xps} */
379 static const pcre_uchar string_PL[] = {
380 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
381 CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* spells \P{L} */
382 static const pcre_uchar string_PLl[] = {
383 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
384 CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* spells \P{Ll} */
385 static const pcre_uchar string_PLu[] = {
386 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
387 CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* spells \P{Lu} */
388 static const pcre_uchar string_PXan[] = {
389 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
390 CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* spells \P{Xan} */
391 static const pcre_uchar string_H[] = {
392 CHAR_BACKSLASH, CHAR_H, '\0' }; /* spells \H */
393 static const pcre_uchar string_PXps[] = {
394 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
395 CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* spells \P{Xps} */
396
/* Entries 0-13 are the positive classes in posix_names order; entries 14-27
are the corresponding negated classes. NULL marks a class that has no general
property-escape substitute. */
397 static const pcre_uchar *posix_substitutes[] = {
398 string_pL, /* alpha */
399 string_pLl, /* lower */
400 string_pLu, /* upper */
401 string_pXan, /* alnum */
402 NULL, /* ascii */
403 string_h, /* blank */
404 NULL, /* cntrl */
405 string_pNd, /* digit */
406 NULL, /* graph */
407 NULL, /* print */
408 NULL, /* punct */
409 string_pXps, /* space */ /* Xps is POSIX space, but from 8.34 */
410 string_pXwd, /* word */ /* Perl and POSIX space are the same */
411 NULL, /* xdigit */
412 /* Negated cases */
413 string_PL, /* ^alpha */
414 string_PLl, /* ^lower */
415 string_PLu, /* ^upper */
416 string_PXan, /* ^alnum */
417 NULL, /* ^ascii */
418 string_H, /* ^blank */
419 NULL, /* ^cntrl */
420 string_PNd, /* ^digit */
421 NULL, /* ^graph */
422 NULL, /* ^print */
423 NULL, /* ^punct */
424 string_PXps, /* ^space */ /* Xps is POSIX space, but from 8.34 */
425 string_PXwd, /* ^word */ /* Perl and POSIX space are the same */
426 NULL /* ^xdigit */
427 };
428 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *)) /* number of entries in posix_substitutes (28 as listed) */
429 #endif
430
/* STRING(a) turns its argument into a string literal exactly as written.
XSTRING(s) goes through one extra macro level so that s is macro-expanded
first: XSTRING(MAX_NAME_SIZE) yields the value of MAX_NAME_SIZE as text,
whereas STRING(MAX_NAME_SIZE) would yield the text "MAX_NAME_SIZE". */
431 #define STRING(a) # a
432 #define XSTRING(s) STRING(s)
433
434 /* The texts of compile-time error messages. These are "char *" because they
435 are passed to the outside world. Do not ever re-use any error number, because
436 they are documented. Always add a new error instead. Messages marked DEAD below
437 are no longer used. This used to be a table of strings, but in order to reduce
438 the number of relocations needed when a shared library is loaded dynamically,
439 it is now one long string. We cannot use a table of offsets, because the
440 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
441 simply count through to the one we want - this isn't a performance issue
442 because these strings are used only when there is a compilation error.
443
444 Each substring ends with \0 to insert a null character. This includes the final
445 substring, so that the whole string ends with \0\0, which can be detected when
446 counting through. */
447
448 static const char error_texts[] =
449 "no error\0"
450 "\\ at end of pattern\0"
451 "\\c at end of pattern\0"
452 "unrecognized character follows \\\0"
453 "numbers out of order in {} quantifier\0"
454 /* 5 */
455 "number too big in {} quantifier\0"
456 "missing terminating ] for character class\0"
457 "invalid escape sequence in character class\0"
458 "range out of order in character class\0"
459 "nothing to repeat\0"
460 /* 10 */
461 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
462 "internal error: unexpected repeat\0"
463 "unrecognized character after (? or (?-\0"
464 "POSIX named classes are supported only within a class\0"
465 "missing )\0"
466 /* 15 */
467 "reference to non-existent subpattern\0"
468 "erroffset passed as NULL\0"
469 "unknown option bit(s) set\0"
470 "missing ) after comment\0"
471 "parentheses nested too deeply\0" /** DEAD **/
472 /* 20 */
473 "regular expression is too large\0"
474 "failed to get memory\0"
475 "unmatched parentheses\0"
476 "internal error: code overflow\0"
477 "unrecognized character after (?<\0"
478 /* 25 */
479 "lookbehind assertion is not fixed length\0"
480 "malformed number or name after (?(\0"
481 "conditional group contains more than two branches\0"
482 "assertion expected after (?(\0"
483 "(?R or (?[+-]digits must be followed by )\0"
484 /* 30 */
485 "unknown POSIX class name\0"
486 "POSIX collating elements are not supported\0"
487 "this version of PCRE is compiled without UTF support\0"
488 "spare error\0" /** DEAD **/
489 "character value in \\x{} or \\o{} is too large\0"
490 /* 35 */
491 "invalid condition (?(0)\0"
492 "\\C not allowed in lookbehind assertion\0"
493 "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
494 "number after (?C is > 255\0"
495 "closing ) for (?C expected\0"
496 /* 40 */
497 "recursive call could loop indefinitely\0"
498 "unrecognized character after (?P\0"
499 "syntax error in subpattern name (missing terminator)\0"
500 "two named subpatterns have the same name\0"
501 "invalid UTF-8 string\0"
502 /* 45 */
503 "support for \\P, \\p, and \\X has not been compiled\0"
504 "malformed \\P or \\p sequence\0"
505 "unknown property name after \\P or \\p\0"
506 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
507 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
508 /* 50 */
509 "repeated subpattern is too long\0" /** DEAD **/
510 "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
511 "internal error: overran compiling workspace\0"
512 "internal error: previously-checked referenced subpattern not found\0"
513 "DEFINE group contains more than one branch\0"
514 /* 55 */
515 "repeating a DEFINE group is not allowed\0" /** DEAD **/
516 "inconsistent NEWLINE options\0"
517 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
518 "a numbered reference must not be zero\0"
519 "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
520 /* 60 */
521 "(*VERB) not recognized or malformed\0"
522 "number is too big\0"
523 "subpattern name expected\0"
524 "digit expected after (?+\0"
525 "] is an invalid data character in JavaScript compatibility mode\0"
526 /* 65 */
527 "different names for subpatterns of the same number are not allowed\0"
528 "(*MARK) must have an argument\0"
529 "this version of PCRE is not compiled with Unicode property support\0"
530 "\\c must be followed by an ASCII character\0"
531 "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
532 /* 70 */
533 "internal error: unknown opcode in find_fixedlength()\0"
534 "\\N is not supported in a class\0"
535 "too many forward references\0"
536 "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
537 "invalid UTF-16 string\0"
538 /* 75 */
539 "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
540 "character value in \\u.... sequence is too large\0"
541 "invalid UTF-32 string\0"
542 "setting UTF is disabled by the application\0"
543 "non-hex character in \\x{} (closing brace missing?)\0"
544 /* 80 */
545 "non-octal character in \\o{} (closing brace missing?)\0"
546 "missing opening brace after \\o\0"
547 "parentheses are too deeply nested\0"
548 "invalid range in character class\0"
549 "group name must start with a non-digit\0"
550 /* 85 */
551 "parentheses are too deeply nested (stack check)\0"
552 ;
553
554 /* Table to identify digits and hex digits. This is used when compiling
555 patterns. Note that the tables in chartables are dependent on the locale, and
556 may mark arbitrary characters as digits - but the PCRE compiling code expects
557 to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
558 a private table here. It costs 256 bytes, but it is a lot faster than doing
559 character value tests (at least in some simple cases I timed), and in some
560 applications one wants PCRE to compile efficiently as well as match
561 efficiently.
562
563 For convenience, we use the same bit definitions as in chartables:
564
565 0x04 decimal digit
566 0x08 hexadecimal digit
567
568 Then we can use ctype_digit and ctype_xdigit in the code. */
569
570 /* Using a simple comparison for decimal numbers rather than a memory read
571 is much faster, and the resulting code is simpler (the compiler turns it
572 into a subtraction and unsigned comparison). */
573
/* True when x lies between CHAR_0 and CHAR_9 inclusive. The argument is
evaluated twice, so it must not have side effects (for example, do not pass
*p++). */
574 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
575
576 #ifndef EBCDIC
577
578 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
579 UTF-8 mode. */
580
/* One byte per code point 0-255, eight per line. 0x0c (decimal digit 0x04 plus
hexadecimal digit 0x08) is set for '0'-'9'; 0x08 alone is set for 'A'-'F' and
'a'-'f'; every other entry is zero. */
581 static const pcre_uint8 digitab[] =
582 {
583 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
584 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
585 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
586 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
587 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
588 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
589 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
590 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
591 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
592 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
593 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
594 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
595 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
596 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
597 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
598 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
599 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
600 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
603 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
604 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
605 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
606 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
607 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
608 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
609 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
610 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
611 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
612 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
613 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
614 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
615
616 #else
617
618 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
619
620 static const pcre_uint8 digitab[] =
621 {
622 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
623 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
624 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
625 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
626 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
627 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
628 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
629 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
630 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
631 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
632 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
633 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
634 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
635 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
636 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
637 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
638 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
639 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
640 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
641 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
642 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
643 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
644 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
645 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
646 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
647 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
648 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
649 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
650 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
651 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
652 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
653 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
654
655 static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
656 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
657 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
658 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
659 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
660 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
661 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
662 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
663 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
664 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
665 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
666 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
667 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
668 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
669 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
670 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
671 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
672 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
673 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
674 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
675 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
676 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
677 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
678 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
679 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
680 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
681 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
682 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
683 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
684 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
685 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
686 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
687 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
688 #endif
689
690
691 /* This table is used to check whether auto-possessification is possible
692 between adjacent character-type opcodes. The left-hand (repeated) opcode is
693 used to select the row, and the right-hand opcode is used to select the column.
694 A value of 1 means that auto-possessification is OK. For example, the second
695 value in the first row means that \D+\d can be turned into \D++\d.
696
697 The Unicode property types (\P and \p) have to be present to fill out the table
698 because of what their opcode values are, but the table values should always be
699 zero because property types are handled separately in the code. The last four
700 columns apply to items that cannot be repeated, so there is no need to have
701 rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
702 *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
703
/* Table dimensions, both counted from FIRST_AUTOTAB_OP: rows run up to
LAST_AUTOTAB_LEFT_OP and columns up to LAST_AUTOTAB_RIGHT_OP. */
704 #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
705 #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
706
/* The initializer supplies 17 rows (\D through \X) of 21 entries each (\D
through $M). The comment at the end of each row names the left-hand opcode;
the header line names the right-hand opcode of each column. The \P and \p
rows and columns are all zero. */
707 static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
708 /* \D \d \S \s \W \w . .+ \C \P \p \R \H \h \V \v \X \Z \z $ $M */
709 { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \D */
710 { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \d */
711 { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \S */
712 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \s */
713 { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \W */
714 { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \w */
715 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* . */
716 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* .+ */
717 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \C */
718 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \P */
719 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \p */
720 { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \R */
721 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \H */
722 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \h */
723 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \V */
724 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 }, /* \v */
725 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 } /* \X */
726 };
727
728
729 /* This table is used to check whether auto-possessification is possible
730 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
731 left-hand (repeated) opcode is used to select the row, and the right-hand
732 opcode is used to select the column. The values are as follows:
733
734 0 Always return FALSE (never auto-possessify)
735 1 Character groups are distinct (possessify if both are OP_PROP)
736 2 Check character categories in the same group (general or particular)
737 3 TRUE if the two opcodes are not the same (PROP vs NOTPROP)
738
739 4 Check left general category vs right particular category
740 5 Check right general category vs left particular category
741
742 6 Left alphanum vs right general category
743 7 Left space vs right general category
744 8 Left word vs right general category
745
746 9 Right alphanum vs left general category
747 10 Right space vs left general category
748 11 Right word vs left general category
749
750 12 Left alphanum vs right particular category
751 13 Left space vs right particular category
752 14 Left word vs right particular category
753
754 15 Right alphanum vs left particular category
755 16 Right space vs left particular category
756 17 Right word vs left particular category
757 */
758
759 static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
760 /* ANY LAMP GC PC SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
761 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_ANY */
762 { 0, 3, 0, 0, 0, 3, 1, 1, 0, 0, 0 }, /* PT_LAMP */
763 { 0, 0, 2, 4, 0, 9, 10, 10, 11, 0, 0 }, /* PT_GC */
764 { 0, 0, 5, 2, 0, 15, 16, 16, 17, 0, 0 }, /* PT_PC */
765 { 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, /* PT_SC */
766 { 0, 3, 6, 12, 0, 3, 1, 1, 0, 0, 0 }, /* PT_ALNUM */
767 { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_SPACE */
768 { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_PXSPACE */
769 { 0, 0, 8, 14, 0, 0, 1, 1, 3, 0, 0 }, /* PT_WORD */
770 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */
771 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 } /* PT_UCNC */
772 };
773
774 /* This table is used to check whether auto-possessification is possible
775 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
776 specifies a general category and the other specifies a particular category. The
777 row is selected by the general category and the column by the particular
778 category. The value is 1 if the particular category is not part of the general
779 category. */
780
781 static const pcre_uint8 catposstab[7][30] = {
782 /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
783 { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* C */
784 { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* L */
785 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* M */
786 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* N */
787 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 }, /* P */
788 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 }, /* S */
789 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 } /* Z */
790 };
791
792 /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
793 a general or particular category. The properties in each row are those
794 that apply to the character set in question. Duplication means that a little
795 unnecessary work is done when checking, but this keeps things much simpler
796 because they can all use the same code. For more details see the comment where
797 this table is used.
798
799 Note: SPACE and PXSPACE used to be different because Perl excluded VT from
800 "space", but from Perl 5.18 it's included, so both categories are treated the
801 same here. */
802
803 static const pcre_uint8 posspropstab[3][4] = {
804 { ucp_L, ucp_N, ucp_N, ucp_Nl }, /* ALNUM, 3rd and 4th values redundant */
805 { ucp_Z, ucp_Z, ucp_C, ucp_Cc }, /* SPACE and PXSPACE, 2nd value redundant */
806 { ucp_L, ucp_N, ucp_P, ucp_Po } /* WORD */
807 };
808
809 /* This table is used when converting repeating opcodes into possessified
810 versions as a result of an explicit possessive quantifier such as ++. A zero
811 value means there is no possessified version - in those cases the item in
812 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
813 because all relevant opcodes are less than that. */
814
815 static const pcre_uint8 opcode_possessify[] = {
816 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 15 */
817 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 - 31 */
818
819 0, /* NOTI */
820 OP_POSSTAR, 0, /* STAR, MINSTAR */
821 OP_POSPLUS, 0, /* PLUS, MINPLUS */
822 OP_POSQUERY, 0, /* QUERY, MINQUERY */
823 OP_POSUPTO, 0, /* UPTO, MINUPTO */
824 0, /* EXACT */
825 0, 0, 0, 0, /* POS{STAR,PLUS,QUERY,UPTO} */
826
827 OP_POSSTARI, 0, /* STARI, MINSTARI */
828 OP_POSPLUSI, 0, /* PLUSI, MINPLUSI */
829 OP_POSQUERYI, 0, /* QUERYI, MINQUERYI */
830 OP_POSUPTOI, 0, /* UPTOI, MINUPTOI */
831 0, /* EXACTI */
832 0, 0, 0, 0, /* POS{STARI,PLUSI,QUERYI,UPTOI} */
833
834 OP_NOTPOSSTAR, 0, /* NOTSTAR, NOTMINSTAR */
835 OP_NOTPOSPLUS, 0, /* NOTPLUS, NOTMINPLUS */
836 OP_NOTPOSQUERY, 0, /* NOTQUERY, NOTMINQUERY */
837 OP_NOTPOSUPTO, 0, /* NOTUPTO, NOTMINUPTO */
838 0, /* NOTEXACT */
839 0, 0, 0, 0, /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
840
841 OP_NOTPOSSTARI, 0, /* NOTSTARI, NOTMINSTARI */
842 OP_NOTPOSPLUSI, 0, /* NOTPLUSI, NOTMINPLUSI */
843 OP_NOTPOSQUERYI, 0, /* NOTQUERYI, NOTMINQUERYI */
844 OP_NOTPOSUPTOI, 0, /* NOTUPTOI, NOTMINUPTOI */
845 0, /* NOTEXACTI */
846 0, 0, 0, 0, /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
847
848 OP_TYPEPOSSTAR, 0, /* TYPESTAR, TYPEMINSTAR */
849 OP_TYPEPOSPLUS, 0, /* TYPEPLUS, TYPEMINPLUS */
850 OP_TYPEPOSQUERY, 0, /* TYPEQUERY, TYPEMINQUERY */
851 OP_TYPEPOSUPTO, 0, /* TYPEUPTO, TYPEMINUPTO */
852 0, /* TYPEEXACT */
853 0, 0, 0, 0, /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
854
855 OP_CRPOSSTAR, 0, /* CRSTAR, CRMINSTAR */
856 OP_CRPOSPLUS, 0, /* CRPLUS, CRMINPLUS */
857 OP_CRPOSQUERY, 0, /* CRQUERY, CRMINQUERY */
858 OP_CRPOSRANGE, 0, /* CRRANGE, CRMINRANGE */
859 0, 0, 0, 0, /* CRPOS{STAR,PLUS,QUERY,RANGE} */
860
861 0, 0, 0, /* CLASS, NCLASS, XCLASS */
862 0, 0, /* REF, REFI */
863 0, 0, /* DNREF, DNREFI */
864 0, 0 /* RECURSE, CALLOUT */
865 };
866
867
868
869 /*************************************************
870 * Find an error text *
871 *************************************************/
872
873 /* The error texts are now all in one long string, to save on relocations. As
874 some of the text is of unknown length, we can't use a table of offsets.
875 Instead, just count through the strings. This is not a performance issue
876 because it happens only when there has been a compilation error.
877
878 Argument: the error number
879 Returns: pointer to the error string
880 */
881
882 static const char *
883 find_error_text(int n)
884 {
885 const char *s = error_texts;
886 for (; n > 0; n--)
887 {
888 while (*s++ != CHAR_NULL) {};
889 if (*s == CHAR_NULL) return "Error text not found (please report)";
890 }
891 return s;
892 }
893
894
895
896 /*************************************************
897 * Expand the workspace *
898 *************************************************/
899
900 /* This function is called during the second compiling phase, if the number of
901 forward references fills the existing workspace, which is originally a block on
902 the stack. A larger block is obtained from malloc() unless the ultimate limit
903 has been reached or the increase will be rather small.
904
905 Argument: pointer to the compile data block
906 Returns: 0 if all went well, else an error number
907 */
908
909 static int
910 expand_workspace(compile_data *cd)
911 {
912 pcre_uchar *newspace;
913 int newsize = cd->workspace_size * 2;
914
915 if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
916 if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
917 newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
918 return ERR72;
919
920 newspace = (PUBL(malloc))(IN_UCHARS(newsize));
921 if (newspace == NULL) return ERR21;
922 memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
923 cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
924 if (cd->workspace_size > COMPILE_WORK_SIZE)
925 (PUBL(free))((void *)cd->start_workspace);
926 cd->start_workspace = newspace;
927 cd->workspace_size = newsize;
928 return 0;
929 }
930
931
932
933 /*************************************************
934 * Check for counted repeat *
935 *************************************************/
936
937 /* This function is called when a '{' is encountered in a place where it might
938 start a quantifier. It looks ahead to see if it really is a quantifier or not.
939 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
940 where the ddds are digits.
941
942 Arguments:
943 p pointer to the first char after '{'
944
945 Returns: TRUE or FALSE
946 */
947
948 static BOOL
949 is_counted_repeat(const pcre_uchar *p)
950 {
951 if (!IS_DIGIT(*p)) return FALSE;
952 p++;
953 while (IS_DIGIT(*p)) p++;
954 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
955
956 if (*p++ != CHAR_COMMA) return FALSE;
957 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
958
959 if (!IS_DIGIT(*p)) return FALSE;
960 p++;
961 while (IS_DIGIT(*p)) p++;
962
963 return (*p == CHAR_RIGHT_CURLY_BRACKET);
964 }
965
966
967
968 /*************************************************
969 * Handle escapes *
970 *************************************************/
971
972 /* This function is called when a \ has been encountered. It either returns a
973 positive value for a simple escape such as \n, or 0 for a data character which
974 will be placed in chptr. A backreference to group n is returned as negative n.
975 When UTF-8 is enabled, a positive value greater than 255 may be returned in
976 chptr. On entry, ptr is pointing at the \. On exit, it is on the final
977 character of the escape sequence.
978
979 Arguments:
980 ptrptr points to the pattern position pointer
981 chptr points to a returned data character
982 errorcodeptr points to the errorcode variable
983 bracount number of previous extracting brackets
984 options the options bits
985 isclass TRUE if inside a character class
986
987 Returns: zero => a data character
988 positive => a special escape sequence
989 negative => a back reference
990 on error, errorcodeptr is set
991 */
992
993 static int
994 check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
995 int bracount, int options, BOOL isclass)
996 {
997 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
998 BOOL utf = (options & PCRE_UTF8) != 0;
999 const pcre_uchar *ptr = *ptrptr + 1;
1000 pcre_uint32 c;
1001 int escape = 0;
1002 int i;
1003
1004 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
1005 ptr--; /* Set pointer back to the last byte */
1006
1007 /* If backslash is at the end of the pattern, it's an error. */
1008
1009 if (c == CHAR_NULL) *errorcodeptr = ERR1;
1010
1011 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
1012 in a table. A non-zero result is something that can be returned immediately.
1013 Otherwise further processing may be required. */
1014
1015 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1016 /* Not alphanumeric */
1017 else if (c < CHAR_0 || c > CHAR_z) {}
1018 else if ((i = escapes[c - CHAR_0]) != 0)
1019 { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1020
1021 #else /* EBCDIC coding */
1022 /* Not alphanumeric */
1023 else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
1024 else if ((i = escapes[c - 0x48]) != 0) { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1025 #endif
1026
1027 /* Escapes that need further processing, or are illegal. */
1028
1029 else
1030 {
1031 const pcre_uchar *oldptr;
1032 BOOL braced, negated, overflow;
1033 int s;
1034
1035 switch (c)
1036 {
1037 /* A number of Perl escapes are not handled by PCRE. We give an explicit
1038 error. */
1039
1040 case CHAR_l:
1041 case CHAR_L:
1042 *errorcodeptr = ERR37;
1043 break;
1044
1045 case CHAR_u:
1046 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1047 {
1048 /* In JavaScript, \u must be followed by four hexadecimal numbers.
1049 Otherwise it is a lowercase u letter. */
1050 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1051 && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
1052 && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
1053 && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
1054 {
1055 c = 0;
1056 for (i = 0; i < 4; ++i)
1057 {
1058 register pcre_uint32 cc = *(++ptr);
1059 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1060 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1061 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1062 #else /* EBCDIC coding */
1063 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1064 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1065 #endif
1066 }
1067
1068 #if defined COMPILE_PCRE8
1069 if (c > (utf ? 0x10ffffU : 0xffU))
1070 #elif defined COMPILE_PCRE16
1071 if (c > (utf ? 0x10ffffU : 0xffffU))
1072 #elif defined COMPILE_PCRE32
1073 if (utf && c > 0x10ffffU)
1074 #endif
1075 {
1076 *errorcodeptr = ERR76;
1077 }
1078 else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1079 }
1080 }
1081 else
1082 *errorcodeptr = ERR37;
1083 break;
1084
1085 case CHAR_U:
1086 /* In JavaScript, \U is an uppercase U letter. */
1087 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
1088 break;
1089
1090 /* In a character class, \g is just a literal "g". Outside a character
1091 class, \g must be followed by one of a number of specific things:
1092
1093 (1) A number, either plain or braced. If positive, it is an absolute
1094 backreference. If negative, it is a relative backreference. This is a Perl
1095 5.10 feature.
1096
1097 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1098 is part of Perl's movement towards a unified syntax for back references. As
1099 this is synonymous with \k{name}, we fudge it up by pretending it really
1100 was \k.
1101
1102 (3) For Oniguruma compatibility we also support \g followed by a name or a
1103 number either in angle brackets or in single quotes. However, these are
1104 (possibly recursive) subroutine calls, _not_ backreferences. Just return
1105 the ESC_g code (cf \k). */
1106
1107 case CHAR_g:
1108 if (isclass) break;
1109 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
1110 {
1111 escape = ESC_g;
1112 break;
1113 }
1114
1115 /* Handle the Perl-compatible cases */
1116
1117 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1118 {
1119 const pcre_uchar *p;
1120 for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
1121 if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
1122 if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
1123 {
1124 escape = ESC_k;
1125 break;
1126 }
1127 braced = TRUE;
1128 ptr++;
1129 }
1130 else braced = FALSE;
1131
1132 if (ptr[1] == CHAR_MINUS)
1133 {
1134 negated = TRUE;
1135 ptr++;
1136 }
1137 else negated = FALSE;
1138
1139 /* The integer range is limited by the machine's int representation. */
1140 s = 0;
1141 overflow = FALSE;
1142 while (IS_DIGIT(ptr[1]))
1143 {
1144 if (s > INT_MAX / 10 - 1) /* Integer overflow */
1145 {
1146 overflow = TRUE;
1147 break;
1148 }
1149 s = s * 10 + (int)(*(++ptr) - CHAR_0);
1150 }
1151 if (overflow) /* Integer overflow */
1152 {
1153 while (IS_DIGIT(ptr[1]))
1154 ptr++;
1155 *errorcodeptr = ERR61;
1156 break;
1157 }
1158
1159 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
1160 {
1161 *errorcodeptr = ERR57;
1162 break;
1163 }
1164
1165 if (s == 0)
1166 {
1167 *errorcodeptr = ERR58;
1168 break;
1169 }
1170
1171 if (negated)
1172 {
1173 if (s > bracount)
1174 {
1175 *errorcodeptr = ERR15;
1176 break;
1177 }
1178 s = bracount - (s - 1);
1179 }
1180
1181 escape = -s;
1182 break;
1183
1184 /* The handling of escape sequences consisting of a string of digits
1185 starting with one that is not zero is not straightforward. Perl has changed
1186 over the years. Nowadays \g{} for backreferences and \o{} for octal are
1187 recommended to avoid the ambiguities in the old syntax.
1188
1189 Outside a character class, the digits are read as a decimal number. If the
1190 number is less than 8 (used to be 10), or if there are that many previous
1191 extracting left brackets, then it is a back reference. Otherwise, up to
1192 three octal digits are read to form an escaped byte. Thus \123 is likely to
1193 be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1194 the octal value is greater than 377, the least significant 8 bits are
1195 taken. \8 and \9 are treated as the literal characters 8 and 9.
1196
1197 Inside a character class, \ followed by a digit is always either a literal
1198 8 or 9 or an octal number. */
1199
1200 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1201 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1202
1203 if (!isclass)
1204 {
1205 oldptr = ptr;
1206 /* The integer range is limited by the machine's int representation. */
1207 s = (int)(c -CHAR_0);
1208 overflow = FALSE;
1209 while (IS_DIGIT(ptr[1]))
1210 {
1211 if (s > INT_MAX / 10 - 1) /* Integer overflow */
1212 {
1213 overflow = TRUE;
1214 break;
1215 }
1216 s = s * 10 + (int)(*(++ptr) - CHAR_0);
1217 }
1218 if (overflow) /* Integer overflow */
1219 {
1220 while (IS_DIGIT(ptr[1]))
1221 ptr++;
1222 *errorcodeptr = ERR61;
1223 break;
1224 }
1225 if (s < 8 || s <= bracount) /* Check for back reference */
1226 {
1227 escape = -s;
1228 break;
1229 }
1230 ptr = oldptr; /* Put the pointer back and fall through */
1231 }
1232
1233 /* Handle a digit following \ when the number is not a back reference. If
1234 the first digit is 8 or 9, Perl used to generate a binary zero byte and
1235 then treat the digit as a following literal. At least by Perl 5.18 this
1236 changed so as not to insert the binary zero. */
1237
1238 if ((c = *ptr) >= CHAR_8) break;
1239
1240 /* Fall through with a digit less than 8 */
1241
1242 /* \0 always starts an octal number, but we may drop through to here with a
1243 larger first octal digit. The original code used just to take the least
1244 significant 8 bits of octal numbers (I think this is what early Perls used
1245 to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1246 but no more than 3 octal digits. */
1247
1248 case CHAR_0:
1249 c -= CHAR_0;
1250 while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1251 c = c * 8 + *(++ptr) - CHAR_0;
1252 #ifdef COMPILE_PCRE8
1253 if (!utf && c > 0xff) *errorcodeptr = ERR51;
1254 #endif
1255 break;
1256
1257 /* \o is a relatively new Perl feature, supporting a more general way of
1258 specifying character codes in octal. The only supported form is \o{ddd}. */
1259
1260 case CHAR_o:
1261 if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
1262 {
1263 ptr += 2;
1264 c = 0;
1265 overflow = FALSE;
1266 while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
1267 {
1268 register pcre_uint32 cc = *ptr++;
1269 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1270 #ifdef COMPILE_PCRE32
1271 if (c >= 0x20000000l) { overflow = TRUE; break; }
1272 #endif
1273 c = (c << 3) + cc - CHAR_0 ;
1274 #if defined COMPILE_PCRE8
1275 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1276 #elif defined COMPILE_PCRE16
1277 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1278 #elif defined COMPILE_PCRE32
1279 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1280 #endif
1281 }
1282 if (overflow)
1283 {
1284 while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1285 *errorcodeptr = ERR34;
1286 }
1287 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1288 {
1289 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1290 }
1291 else *errorcodeptr = ERR80;
1292 }
1293 break;
1294
1295 /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
1296 numbers. Otherwise it is a lowercase x letter. */
1297
1298 case CHAR_x:
1299 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1300 {
1301 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1302 && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1303 {
1304 c = 0;
1305 for (i = 0; i < 2; ++i)
1306 {
1307 register pcre_uint32 cc = *(++ptr);
1308 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1309 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1310 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1311 #else /* EBCDIC coding */
1312 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1313 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1314 #endif
1315 }
1316 }
1317 } /* End JavaScript handling */
1318
1319 /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1320 greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1321 digits. If not, { used to be treated as a data character. However, Perl
1322 seems to read hex digits up to the first non-such, and ignore the rest, so
1323 that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1324 now gives an error. */
1325
1326 else
1327 {
1328 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1329 {
1330 ptr += 2;
1331 c = 0;
1332 overflow = FALSE;
1333 while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
1334 {
1335 register pcre_uint32 cc = *ptr++;
1336 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1337
1338 #ifdef COMPILE_PCRE32
1339 if (c >= 0x10000000l) { overflow = TRUE; break; }
1340 #endif
1341
1342 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1343 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1344 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1345 #else /* EBCDIC coding */
1346 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1347 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1348 #endif
1349
1350 #if defined COMPILE_PCRE8
1351 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1352 #elif defined COMPILE_PCRE16
1353 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1354 #elif defined COMPILE_PCRE32
1355 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1356 #endif
1357 }
1358
1359 if (overflow)
1360 {
1361 while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
1362 *errorcodeptr = ERR34;
1363 }
1364
1365 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1366 {
1367 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1368 }
1369
1370 /* If the sequence of hex digits does not end with '}', give an error.
1371 We used just to recognize this construct and fall through to the normal
1372 \x handling, but nowadays Perl gives an error, which seems much more
1373 sensible, so we do too. */
1374
1375 else *errorcodeptr = ERR79;
1376 } /* End of \x{} processing */
1377
1378 /* Read a single-byte hex-defined char (up to two hex digits after \x) */
1379
1380 else
1381 {
1382 c = 0;
1383 while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1384 {
1385 pcre_uint32 cc; /* Some compilers don't like */
1386 cc = *(++ptr); /* ++ in initializers */
1387 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1388 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1389 c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1390 #else /* EBCDIC coding */
1391 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
1392 c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1393 #endif
1394 }
1395 } /* End of \xdd handling */
1396 } /* End of Perl-style \x handling */
1397 break;
1398
1399 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1400 An error is given if the byte following \c is not an ASCII character. This
1401 coding is ASCII-specific, but then the whole concept of \cx is
1402 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1403
1404 case CHAR_c:
1405 c = *(++ptr);
1406 if (c == CHAR_NULL)
1407 {
1408 *errorcodeptr = ERR2;
1409 break;
1410 }
1411 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1412 if (c > 127) /* Excludes all non-ASCII in either mode */
1413 {
1414 *errorcodeptr = ERR68;
1415 break;
1416 }
1417 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1418 c ^= 0x40;
1419 #else /* EBCDIC coding */
1420 if (c >= CHAR_a && c <= CHAR_z) c += 64;
1421 c ^= 0xC0;
1422 #endif
1423 break;
1424
1425 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1426 other alphanumeric following \ is an error if PCRE_EXTRA was set;
1427 otherwise, for Perl compatibility, it is a literal. This code looks a bit
1428 odd, but there used to be some cases other than the default, and there may
1429 be again in future, so I haven't "optimized" it. */
1430
1431 default:
1432 if ((options & PCRE_EXTRA) != 0) switch(c)
1433 {
1434 default:
1435 *errorcodeptr = ERR3;
1436 break;
1437 }
1438 break;
1439 }
1440 }
1441
1442 /* Perl supports \N{name} for character names, as well as plain \N for "not
1443 newline". PCRE does not support \N{name}. However, it does support
1444 quantification such as \N{2,3}. */
1445
1446 if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1447 !is_counted_repeat(ptr+2))
1448 *errorcodeptr = ERR37;
1449
1450 /* If PCRE_UCP is set, we change the values for \d etc. */
1451
1452 if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
1453 escape += (ESC_DU - ESC_D);
1454
1455 /* Set the pointer to the final character before returning. */
1456
1457 *ptrptr = ptr;
1458 *chptr = c;
1459 return escape;
1460 }
1461
1462
1463
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence.

The property name is either a single character, or a name in {} that may be
preceded by ^ for negation. A braced name must fit, with its terminating zero,
in a 32-element buffer; anything longer, or a missing closing brace, is an
error.

Argument:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE;
                   it is always set, even when an error is returned
  ptypeptr       points to an unsigned int that is set to the type value
  pdataptr       points to an unsigned int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         TRUE if the type value was found, or FALSE for an invalid type
                   (ERR46 for malformed syntax, ERR47 for an unknown name)
*/

static BOOL
get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
  unsigned int *pdataptr, int *errorcodeptr)
{
  pcre_uchar c;
  int i, bot, top;
  const pcre_uchar *ptr = *ptrptr;
  pcre_uchar name[32];

  /* Give the negation flag a defined value before any exit can be taken, so
  that it is set on the error paths as well. */

  *negptr = FALSE;

  c = *(++ptr);
  if (c == CHAR_NULL) goto ERROR_RETURN;

  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
  negation. */

  if (c == CHAR_LEFT_CURLY_BRACKET)
  {
    if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
    {
      *negptr = TRUE;
      ptr++;
    }

    /* Copy the name, leaving room for the terminating zero. */

    for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
    {
      c = *(++ptr);
      if (c == CHAR_NULL) goto ERROR_RETURN;
      if (c == CHAR_RIGHT_CURLY_BRACKET) break;
      name[i] = c;
    }

    /* If the loop ran out of space, the last character read is not '}'. */

    if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
    name[i] = 0;
  }

  /* Otherwise there is just one following character */

  else
  {
    name[0] = c;
    name[1] = 0;
  }

  *ptrptr = ptr;

  /* Search for a recognized property name using binary chop */

  bot = 0;
  top = PRIV(utt_size);

  while (bot < top)
  {
    int r;
    i = (bot + top) >> 1;
    r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
    if (r == 0)
    {
      *ptypeptr = PRIV(utt)[i].type;
      *pdataptr = PRIV(utt)[i].value;
      return TRUE;
    }
    if (r > 0) bot = i + 1; else top = i;
  }

  *errorcodeptr = ERR47;
  *ptrptr = ptr;
  return FALSE;

ERROR_RETURN:
  *errorcodeptr = ERR46;
  *ptrptr = ptr;
  return FALSE;
}
#endif
1558
1559
1560
1561 /*************************************************
1562 * Read repeat counts *
1563 *************************************************/
1564
1565 /* Read an item of the form {n,m} and return the values. This is called only
1566 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1567 so the syntax is guaranteed to be correct, but we need to check the values.
1568
1569 Arguments:
1570 p pointer to first char after '{'
1571 minp pointer to int for min
1572 maxp pointer to int for max
1573 returned as -1 if no max
1574 errorcodeptr points to error code variable
1575
1576 Returns: pointer to '}' on success;
1577 current ptr on error, with errorcodeptr set non-zero
1578 */
1579
1580 static const pcre_uchar *
1581 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1582 {
1583 int min = 0;
1584 int max = -1;
1585
1586 while (IS_DIGIT(*p))
1587 {
1588 min = min * 10 + (int)(*p++ - CHAR_0);
1589 if (min > 65535)
1590 {
1591 *errorcodeptr = ERR5;
1592 return p;
1593 }
1594 }
1595
1596 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1597 {
1598 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1599 {
1600 max = 0;
1601 while(IS_DIGIT(*p))
1602 {
1603 max = max * 10 + (int)(*p++ - CHAR_0);
1604 if (max > 65535)
1605 {
1606 *errorcodeptr = ERR5;
1607 return p;
1608 }
1609 }
1610 if (max < min)
1611 {
1612 *errorcodeptr = ERR4;
1613 return p;
1614 }
1615 }
1616 }
1617
1618 *minp = min;
1619 *maxp = max;
1620 return p;
1621 }
1622
1623
1624
1625 /*************************************************
1626 * Find first significant op code *
1627 *************************************************/
1628
1629 /* This is called by several functions that scan a compiled expression looking
1630 for a fixed first character, or an anchoring op code etc. It skips over things
1631 that do not influence this. For some calls, it makes sense to skip negative
1632 forward and all backward assertions, and also the \b assertion; for others it
1633 does not.
1634
1635 Arguments:
1636 code pointer to the start of the group
1637 skipassert TRUE if certain assertions are to be skipped
1638
1639 Returns: pointer to the first significant opcode
1640 */
1641
1642 static const pcre_uchar*
1643 first_significant_code(const pcre_uchar *code, BOOL skipassert)
1644 {
1645 for (;;)
1646 {
1647 switch ((int)*code)
1648 {
1649 case OP_ASSERT_NOT:
1650 case OP_ASSERTBACK:
1651 case OP_ASSERTBACK_NOT:
1652 if (!skipassert) return code;
1653 do code += GET(code, 1); while (*code == OP_ALT);
1654 code += PRIV(OP_lengths)[*code];
1655 break;
1656
1657 case OP_WORD_BOUNDARY:
1658 case OP_NOT_WORD_BOUNDARY:
1659 if (!skipassert) return code;
1660 /* Fall through */
1661
1662 case OP_CALLOUT:
1663 case OP_CREF:
1664 case OP_DNCREF:
1665 case OP_RREF:
1666 case OP_DNRREF:
1667 case OP_DEF:
1668 code += PRIV(OP_lengths)[*code];
1669 break;
1670
1671 default:
1672 return code;
1673 }
1674 }
1675 /* Control never reaches here */
1676 }
1677
1678
1679
1680 /*************************************************
1681 * Find the fixed length of a branch *
1682 *************************************************/
1683
1684 /* Scan a branch and compute the fixed length of subject that will match it,
1685 if the length is fixed. This is needed for dealing with backward assertions.
1686 In UTF8 mode, the result is in characters rather than bytes. The branch is
1687 temporarily terminated with OP_END when this function is called.
1688
1689 This function is called when a backward assertion is encountered, so that if it
1690 fails, the error message can point to the correct place in the pattern.
1691 However, we cannot do this when the assertion contains subroutine calls,
1692 because they can be forward references. We solve this by remembering this case
1693 and doing the check at the end; a flag specifies which mode we are running in.
1694
1695 Arguments:
1696 code points to the start of the pattern (the bracket)
1697 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
1698 atend TRUE if called when the pattern is complete
1699 cd the "compile data" structure
1700
1701 Returns: the fixed length,
1702 or -1 if there is no fixed length,
1703 or -2 if \C was encountered (in UTF-8 mode only)
1704 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1705 or -4 if an unknown opcode was encountered (internal error)
1706 */
1707
1708 static int
1709 find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
1710 {
1711 int length = -1;
1712
1713 register int branchlength = 0;
1714 register pcre_uchar *cc = code + 1 + LINK_SIZE;
1715
1716 /* Scan along the opcodes for this branch. If we get to the end of the
1717 branch, check the length against that of the other branches. */
1718
1719 for (;;)
1720 {
1721 int d;
1722 pcre_uchar *ce, *cs;
1723 register pcre_uchar op = *cc;
1724
1725 switch (op)
1726 {
1727 /* We only need to continue for OP_CBRA (normal capturing bracket) and
1728 OP_BRA (normal non-capturing bracket) because the other variants of these
1729 opcodes are all concerned with unlimited repeated groups, which of course
1730 are not of fixed length. */
1731
1732 case OP_CBRA:
1733 case OP_BRA:
1734 case OP_ONCE:
1735 case OP_ONCE_NC:
1736 case OP_COND:
1737 d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
1738 if (d < 0) return d;
1739 branchlength += d;
1740 do cc += GET(cc, 1); while (*cc == OP_ALT);
1741 cc += 1 + LINK_SIZE;
1742 break;
1743
1744 /* Reached end of a branch; if it's a ket it is the end of a nested call.
1745 If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1746 an ALT. If it is END it's the end of the outer call. All can be handled by
1747 the same code. Note that we must not include the OP_KETRxxx opcodes here,
1748 because they all imply an unlimited repeat. */
1749
1750 case OP_ALT:
1751 case OP_KET:
1752 case OP_END:
1753 case OP_ACCEPT:
1754 case OP_ASSERT_ACCEPT:
1755 if (length < 0) length = branchlength;
1756 else if (length != branchlength) return -1;
1757 if (*cc != OP_ALT) return length;
1758 cc += 1 + LINK_SIZE;
1759 branchlength = 0;
1760 break;
1761
1762 /* A true recursion implies not fixed length, but a subroutine call may
1763 be OK. If the subroutine is a forward reference, we can't deal with
1764 it until the end of the pattern, so return -3. */
1765
1766 case OP_RECURSE:
1767 if (!atend) return -3;
1768 cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1769 do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1770 if (cc > cs && cc < ce) return -1; /* Recursion */
1771 d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
1772 if (d < 0) return d;
1773 branchlength += d;
1774 cc += 1 + LINK_SIZE;
1775 break;
1776
1777 /* Skip over assertive subpatterns */
1778
1779 case OP_ASSERT:
1780 case OP_ASSERT_NOT:
1781 case OP_ASSERTBACK:
1782 case OP_ASSERTBACK_NOT:
1783 do cc += GET(cc, 1); while (*cc == OP_ALT);
1784 cc += PRIV(OP_lengths)[*cc];
1785 break;
1786
1787 /* Skip over things that don't match chars */
1788
1789 case OP_MARK:
1790 case OP_PRUNE_ARG:
1791 case OP_SKIP_ARG:
1792 case OP_THEN_ARG:
1793 cc += cc[1] + PRIV(OP_lengths)[*cc];
1794 break;
1795
1796 case OP_CALLOUT:
1797 case OP_CIRC:
1798 case OP_CIRCM:
1799 case OP_CLOSE:
1800 case OP_COMMIT:
1801 case OP_CREF:
1802 case OP_DEF:
1803 case OP_DNCREF:
1804 case OP_DNRREF:
1805 case OP_DOLL:
1806 case OP_DOLLM:
1807 case OP_EOD:
1808 case OP_EODN:
1809 case OP_FAIL:
1810 case OP_NOT_WORD_BOUNDARY:
1811 case OP_PRUNE:
1812 case OP_REVERSE:
1813 case OP_RREF:
1814 case OP_SET_SOM:
1815 case OP_SKIP:
1816 case OP_SOD:
1817 case OP_SOM:
1818 case OP_THEN:
1819 case OP_WORD_BOUNDARY:
1820 cc += PRIV(OP_lengths)[*cc];
1821 break;
1822
1823 /* Handle literal characters */
1824
1825 case OP_CHAR:
1826 case OP_CHARI:
1827 case OP_NOT:
1828 case OP_NOTI:
1829 branchlength++;
1830 cc += 2;
1831 #ifdef SUPPORT_UTF
1832 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1833 #endif
1834 break;
1835
1836 /* Handle exact repetitions. The count is already in characters, but we
1837 need to skip over a multibyte character in UTF8 mode. */
1838
1839 case OP_EXACT:
1840 case OP_EXACTI:
1841 case OP_NOTEXACT:
1842 case OP_NOTEXACTI:
1843 branchlength += (int)GET2(cc,1);
1844 cc += 2 + IMM2_SIZE;
1845 #ifdef SUPPORT_UTF
1846 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1847 #endif
1848 break;
1849
1850 case OP_TYPEEXACT:
1851 branchlength += GET2(cc,1);
1852 if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1853 cc += 2;
1854 cc += 1 + IMM2_SIZE + 1;
1855 break;
1856
1857 /* Handle single-char matchers */
1858
1859 case OP_PROP:
1860 case OP_NOTPROP:
1861 cc += 2;
1862 /* Fall through */
1863
1864 case OP_HSPACE:
1865 case OP_VSPACE:
1866 case OP_NOT_HSPACE:
1867 case OP_NOT_VSPACE:
1868 case OP_NOT_DIGIT:
1869 case OP_DIGIT:
1870 case OP_NOT_WHITESPACE:
1871 case OP_WHITESPACE:
1872 case OP_NOT_WORDCHAR:
1873 case OP_WORDCHAR:
1874 case OP_ANY:
1875 case OP_ALLANY:
1876 branchlength++;
1877 cc++;
1878 break;
1879
1880 /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1881 otherwise \C is coded as OP_ALLANY. */
1882
1883 case OP_ANYBYTE:
1884 return -2;
1885
1886 /* Check a class for variable quantification */
1887
1888 case OP_CLASS:
1889 case OP_NCLASS:
1890 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1891 case OP_XCLASS:
1892 /* The original code caused an unsigned overflow in 64 bit systems,
1893 so now we use a conditional statement. */
1894 if (op == OP_XCLASS)
1895 cc += GET(cc, 1);
1896 else
1897 cc += PRIV(OP_lengths)[OP_CLASS];
1898 #else
1899 cc += PRIV(OP_lengths)[OP_CLASS];
1900 #endif
1901
1902 switch (*cc)
1903 {
1904 case OP_CRSTAR:
1905 case OP_CRMINSTAR:
1906 case OP_CRPLUS:
1907 case OP_CRMINPLUS:
1908 case OP_CRQUERY:
1909 case OP_CRMINQUERY:
1910 case OP_CRPOSSTAR:
1911 case OP_CRPOSPLUS:
1912 case OP_CRPOSQUERY:
1913 return -1;
1914
1915 case OP_CRRANGE:
1916 case OP_CRMINRANGE:
1917 case OP_CRPOSRANGE:
1918 if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1919 branchlength += (int)GET2(cc,1);
1920 cc += 1 + 2 * IMM2_SIZE;
1921 break;
1922
1923 default:
1924 branchlength++;
1925 }
1926 break;
1927
1928 /* Anything else is variable length */
1929
1930 case OP_ANYNL:
1931 case OP_BRAMINZERO:
1932 case OP_BRAPOS:
1933 case OP_BRAPOSZERO:
1934 case OP_BRAZERO:
1935 case OP_CBRAPOS:
1936 case OP_EXTUNI:
1937 case OP_KETRMAX:
1938 case OP_KETRMIN:
1939 case OP_KETRPOS:
1940 case OP_MINPLUS:
1941 case OP_MINPLUSI:
1942 case OP_MINQUERY:
1943 case OP_MINQUERYI:
1944 case OP_MINSTAR:
1945 case OP_MINSTARI:
1946 case OP_MINUPTO:
1947 case OP_MINUPTOI:
1948 case OP_NOTMINPLUS:
1949 case OP_NOTMINPLUSI:
1950 case OP_NOTMINQUERY:
1951 case OP_NOTMINQUERYI:
1952 case OP_NOTMINSTAR:
1953 case OP_NOTMINSTARI:
1954 case OP_NOTMINUPTO:
1955 case OP_NOTMINUPTOI:
1956 case OP_NOTPLUS:
1957 case OP_NOTPLUSI:
1958 case OP_NOTPOSPLUS:
1959 case OP_NOTPOSPLUSI:
1960 case OP_NOTPOSQUERY:
1961 case OP_NOTPOSQUERYI:
1962 case OP_NOTPOSSTAR:
1963 case OP_NOTPOSSTARI:
1964 case OP_NOTPOSUPTO:
1965 case OP_NOTPOSUPTOI:
1966 case OP_NOTQUERY:
1967 case OP_NOTQUERYI:
1968 case OP_NOTSTAR:
1969 case OP_NOTSTARI:
1970 case OP_NOTUPTO:
1971 case OP_NOTUPTOI:
1972 case OP_PLUS:
1973 case OP_PLUSI:
1974 case OP_POSPLUS:
1975 case OP_POSPLUSI:
1976 case OP_POSQUERY:
1977 case OP_POSQUERYI:
1978 case OP_POSSTAR:
1979 case OP_POSSTARI:
1980 case OP_POSUPTO:
1981 case OP_POSUPTOI:
1982 case OP_QUERY:
1983 case OP_QUERYI:
1984 case OP_REF:
1985 case OP_REFI:
1986 case OP_DNREF:
1987 case OP_DNREFI:
1988 case OP_SBRA:
1989 case OP_SBRAPOS:
1990 case OP_SCBRA:
1991 case OP_SCBRAPOS:
1992 case OP_SCOND:
1993 case OP_SKIPZERO:
1994 case OP_STAR:
1995 case OP_STARI:
1996 case OP_TYPEMINPLUS:
1997 case OP_TYPEMINQUERY:
1998 case OP_TYPEMINSTAR:
1999 case OP_TYPEMINUPTO:
2000 case OP_TYPEPLUS:
2001 case OP_TYPEPOSPLUS:
2002 case OP_TYPEPOSQUERY:
2003 case OP_TYPEPOSSTAR:
2004 case OP_TYPEPOSUPTO:
2005 case OP_TYPEQUERY:
2006 case OP_TYPESTAR:
2007 case OP_TYPEUPTO:
2008 case OP_UPTO:
2009 case OP_UPTOI:
2010 return -1;
2011
2012 /* Catch unrecognized opcodes so that when new ones are added they
2013 are not forgotten, as has happened in the past. */
2014
2015 default:
2016 return -4;
2017 }
2018 }
2019 /* Control never gets here */
2020 }
2021
2022
2023
2024 /*************************************************
2025 * Scan compiled regex for specific bracket *
2026 *************************************************/
2027
2028 /* This little function scans through a compiled pattern until it finds a
2029 capturing bracket with the given number, or, if the number is negative, an
2030 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
2031 so that it can be called from pcre_study() when finding the minimum matching
2032 length.
2033
2034 Arguments:
2035 code points to start of expression
2036 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2037 number the required bracket number or negative to find a lookbehind
2038
2039 Returns: pointer to the opcode for the bracket, or NULL if not found
2040 */
2041
2042 const pcre_uchar *
2043 PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2044 {
2045 for (;;)
2046 {
2047 register pcre_uchar c = *code;
2048
2049 if (c == OP_END) return NULL;
2050
2051 /* XCLASS is used for classes that cannot be represented just by a bit
2052 map. This includes negated single high-valued characters. The length in
2053 the table is zero; the actual length is stored in the compiled code. */
2054
2055 if (c == OP_XCLASS) code += GET(code, 1);
2056
2057 /* Handle recursion */
2058
2059 else if (c == OP_REVERSE)
2060 {
2061 if (number < 0) return (pcre_uchar *)code;
2062 code += PRIV(OP_lengths)[c];
2063 }
2064
2065 /* Handle capturing bracket */
2066
2067 else if (c == OP_CBRA || c == OP_SCBRA ||
2068 c == OP_CBRAPOS || c == OP_SCBRAPOS)
2069 {
2070 int n = (int)GET2(code, 1+LINK_SIZE);
2071 if (n == number) return (pcre_uchar *)code;
2072 code += PRIV(OP_lengths)[c];
2073 }
2074
2075 /* Otherwise, we can get the item's length from the table, except that for
2076 repeated character types, we have to test for \p and \P, which have an extra
2077 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2078 must add in its length. */
2079
2080 else
2081 {
2082 switch(c)
2083 {
2084 case OP_TYPESTAR:
2085 case OP_TYPEMINSTAR:
2086 case OP_TYPEPLUS:
2087 case OP_TYPEMINPLUS:
2088 case OP_TYPEQUERY:
2089 case OP_TYPEMINQUERY:
2090 case OP_TYPEPOSSTAR:
2091 case OP_TYPEPOSPLUS:
2092 case OP_TYPEPOSQUERY:
2093 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2094 break;
2095
2096 case OP_TYPEUPTO:
2097 case OP_TYPEMINUPTO:
2098 case OP_TYPEEXACT:
2099 case OP_TYPEPOSUPTO:
2100 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2101 code += 2;
2102 break;
2103
2104 case OP_MARK:
2105 case OP_PRUNE_ARG:
2106 case OP_SKIP_ARG:
2107 case OP_THEN_ARG:
2108 code += code[1];
2109 break;
2110 }
2111
2112 /* Add in the fixed length from the table */
2113
2114 code += PRIV(OP_lengths)[c];
2115
2116 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2117 a multi-byte character. The length in the table is a minimum, so we have to
2118 arrange to skip the extra bytes. */
2119
2120 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2121 if (utf) switch(c)
2122 {
2123 case OP_CHAR:
2124 case OP_CHARI:
2125 case OP_EXACT:
2126 case OP_EXACTI:
2127 case OP_UPTO:
2128 case OP_UPTOI:
2129 case OP_MINUPTO:
2130 case OP_MINUPTOI:
2131 case OP_POSUPTO:
2132 case OP_POSUPTOI:
2133 case OP_STAR:
2134 case OP_STARI:
2135 case OP_MINSTAR:
2136 case OP_MINSTARI:
2137 case OP_POSSTAR:
2138 case OP_POSSTARI:
2139 case OP_PLUS:
2140 case OP_PLUSI:
2141 case OP_MINPLUS:
2142 case OP_MINPLUSI:
2143 case OP_POSPLUS:
2144 case OP_POSPLUSI:
2145 case OP_QUERY:
2146 case OP_QUERYI:
2147 case OP_MINQUERY:
2148 case OP_MINQUERYI:
2149 case OP_POSQUERY:
2150 case OP_POSQUERYI:
2151 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2152 break;
2153 }
2154 #else
2155 (void)(utf); /* Keep compiler happy by referencing function argument */
2156 #endif
2157 }
2158 }
2159 }
2160
2161
2162
2163 /*************************************************
2164 * Scan compiled regex for recursion reference *
2165 *************************************************/
2166
2167 /* This little function scans through a compiled pattern until it finds an
2168 instance of OP_RECURSE.
2169
2170 Arguments:
2171 code points to start of expression
2172 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2173
2174 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
2175 */
2176
2177 static const pcre_uchar *
2178 find_recurse(const pcre_uchar *code, BOOL utf)
2179 {
2180 for (;;)
2181 {
2182 register pcre_uchar c = *code;
2183 if (c == OP_END) return NULL;
2184 if (c == OP_RECURSE) return code;
2185
2186 /* XCLASS is used for classes that cannot be represented just by a bit
2187 map. This includes negated single high-valued characters. The length in
2188 the table is zero; the actual length is stored in the compiled code. */
2189
2190 if (c == OP_XCLASS) code += GET(code, 1);
2191
2192 /* Otherwise, we can get the item's length from the table, except that for
2193 repeated character types, we have to test for \p and \P, which have an extra
2194 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2195 must add in its length. */
2196
2197 else
2198 {
2199 switch(c)
2200 {
2201 case OP_TYPESTAR:
2202 case OP_TYPEMINSTAR:
2203 case OP_TYPEPLUS:
2204 case OP_TYPEMINPLUS:
2205 case OP_TYPEQUERY:
2206 case OP_TYPEMINQUERY:
2207 case OP_TYPEPOSSTAR:
2208 case OP_TYPEPOSPLUS:
2209 case OP_TYPEPOSQUERY:
2210 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2211 break;
2212
2213 case OP_TYPEPOSUPTO:
2214 case OP_TYPEUPTO:
2215 case OP_TYPEMINUPTO:
2216 case OP_TYPEEXACT:
2217 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2218 code += 2;
2219 break;
2220
2221 case OP_MARK:
2222 case OP_PRUNE_ARG:
2223 case OP_SKIP_ARG:
2224 case OP_THEN_ARG:
2225 code += code[1];
2226 break;
2227 }
2228
2229 /* Add in the fixed length from the table */
2230
2231 code += PRIV(OP_lengths)[c];
2232
2233 /* In UTF-8 mode, opcodes that are followed by a character may be followed
2234 by a multi-byte character. The length in the table is a minimum, so we have
2235 to arrange to skip the extra bytes. */
2236
2237 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2238 if (utf) switch(c)
2239 {
2240 case OP_CHAR:
2241 case OP_CHARI:
2242 case OP_NOT:
2243 case OP_NOTI:
2244 case OP_EXACT:
2245 case OP_EXACTI:
2246 case OP_NOTEXACT:
2247 case OP_NOTEXACTI:
2248 case OP_UPTO:
2249 case OP_UPTOI:
2250 case OP_NOTUPTO:
2251 case OP_NOTUPTOI:
2252 case OP_MINUPTO:
2253 case OP_MINUPTOI:
2254 case OP_NOTMINUPTO:
2255 case OP_NOTMINUPTOI:
2256 case OP_POSUPTO:
2257 case OP_POSUPTOI:
2258 case OP_NOTPOSUPTO:
2259 case OP_NOTPOSUPTOI:
2260 case OP_STAR:
2261 case OP_STARI:
2262 case OP_NOTSTAR:
2263 case OP_NOTSTARI:
2264 case OP_MINSTAR:
2265 case OP_MINSTARI:
2266 case OP_NOTMINSTAR:
2267 case OP_NOTMINSTARI:
2268 case OP_POSSTAR:
2269 case OP_POSSTARI:
2270 case OP_NOTPOSSTAR:
2271 case OP_NOTPOSSTARI:
2272 case OP_PLUS:
2273 case OP_PLUSI:
2274 case OP_NOTPLUS:
2275 case OP_NOTPLUSI:
2276 case OP_MINPLUS:
2277 case OP_MINPLUSI:
2278 case OP_NOTMINPLUS:
2279 case OP_NOTMINPLUSI:
2280 case OP_POSPLUS:
2281 case OP_POSPLUSI:
2282 case OP_NOTPOSPLUS:
2283 case OP_NOTPOSPLUSI:
2284 case OP_QUERY:
2285 case OP_QUERYI:
2286 case OP_NOTQUERY:
2287 case OP_NOTQUERYI:
2288 case OP_MINQUERY:
2289 case OP_MINQUERYI:
2290 case OP_NOTMINQUERY:
2291 case OP_NOTMINQUERYI:
2292 case OP_POSQUERY:
2293 case OP_POSQUERYI:
2294 case OP_NOTPOSQUERY:
2295 case OP_NOTPOSQUERYI:
2296 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2297 break;
2298 }
2299 #else
2300 (void)(utf); /* Keep compiler happy by referencing function argument */
2301 #endif
2302 }
2303 }
2304 }
2305
2306
2307
2308 /*************************************************
2309 * Scan compiled branch for non-emptiness *
2310 *************************************************/
2311
2312 /* This function scans through a branch of a compiled pattern to see whether it
2313 can match the empty string or not. It is called from could_be_empty()
2314 below and from compile_branch() when checking for an unlimited repeat of a
2315 group that can match nothing. Note that first_significant_code() skips over
2316 backward and negative forward assertions when its final argument is TRUE. If we
2317 hit an unclosed bracket, we return "empty" - this means we've struck an inner
2318 bracket whose current branch will already have been scanned.
2319
2320 Arguments:
2321 code points to start of search
2322 endcode points to where to stop
2323 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2324 cd contains pointers to tables etc.
2325 recurses chain of recurse_check to catch mutual recursion
2326
2327 Returns: TRUE if what is matched could be empty
2328 */
2329
2330 typedef struct recurse_check {
2331 struct recurse_check *prev;
2332 const pcre_uchar *group;
2333 } recurse_check;
2334
2335 static BOOL
2336 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2337 BOOL utf, compile_data *cd, recurse_check *recurses)
2338 {
2339 register pcre_uchar c;
2340 recurse_check this_recurse;
2341
2342 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2343 code < endcode;
2344 code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2345 {
2346 const pcre_uchar *ccode;
2347
2348 c = *code;
2349
2350 /* Skip over forward assertions; the other assertions are skipped by
2351 first_significant_code() with a TRUE final argument. */
2352
2353 if (c == OP_ASSERT)
2354 {
2355 do code += GET(code, 1); while (*code == OP_ALT);
2356 c = *code;
2357 continue;
2358 }
2359
2360 /* For a recursion/subroutine call, if its end has been reached, which
2361 implies a backward reference subroutine call, we can scan it. If it's a
2362 forward reference subroutine call, we can't. To detect forward reference
2363 we have to scan up the list that is kept in the workspace. This function is
2364 called only when doing the real compile, not during the pre-compile that
2365 measures the size of the compiled pattern. */
2366
2367 if (c == OP_RECURSE)
2368 {
2369 const pcre_uchar *scode = cd->start_code + GET(code, 1);
2370 BOOL empty_branch;
2371
2372 /* Test for forward reference or uncompleted reference. This is disabled
2373 when called to scan a completed pattern by setting cd->start_workspace to
2374 NULL. */
2375
2376 if (cd->start_workspace != NULL)
2377 {
2378 const pcre_uchar *tcode;
2379 for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2380 if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2381 if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
2382 }
2383
2384 /* If we are scanning a completed pattern, there are no forward references
2385 and all groups are complete. We need to detect whether this is a recursive
2386 call, as otherwise there will be an infinite loop. If it is a recursion,
2387 just skip over it. Simple recursions are easily detected. For mutual
2388 recursions we keep a chain on the stack. */
2389
2390 else
2391 {
2392 recurse_check *r = recurses;
2393 const pcre_uchar *endgroup = scode;
2394
2395 do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2396 if (code >= scode && code <= endgroup) continue; /* Simple recursion */
2397
2398 for (r = recurses; r != NULL; r = r->prev)
2399 if (r->group == scode) break;
2400 if (r != NULL) continue; /* Mutual recursion */
2401 }
2402
2403 /* Completed reference; scan the referenced group, remembering it on the
2404 stack chain to detect mutual recursions. */
2405
2406 empty_branch = FALSE;
2407 this_recurse.prev = recurses;
2408 this_recurse.group = scode;
2409
2410 do
2411 {
2412 if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
2413 {
2414 empty_branch = TRUE;
2415 break;
2416 }
2417 scode += GET(scode, 1);
2418 }
2419 while (*scode == OP_ALT);
2420
2421 if (!empty_branch) return FALSE; /* All branches are non-empty */
2422 continue;
2423 }
2424
2425 /* Groups with zero repeats can of course be empty; skip them. */
2426
2427 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2428 c == OP_BRAPOSZERO)
2429 {
2430 code += PRIV(OP_lengths)[c];
2431 do code += GET(code, 1); while (*code == OP_ALT);
2432 c = *code;
2433 continue;
2434 }
2435
2436 /* A nested group that is already marked as "could be empty" can just be
2437 skipped. */
2438
2439 if (c == OP_SBRA || c == OP_SBRAPOS ||
2440 c == OP_SCBRA || c == OP_SCBRAPOS)
2441 {
2442 do code += GET(code, 1); while (*code == OP_ALT);
2443 c = *code;
2444 continue;
2445 }
2446
2447 /* For other groups, scan the branches. */
2448
2449 if (c == OP_BRA || c == OP_BRAPOS ||
2450 c == OP_CBRA || c == OP_CBRAPOS ||
2451 c == OP_ONCE || c == OP_ONCE_NC ||
2452 c == OP_COND)
2453 {
2454 BOOL empty_branch;
2455 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
2456
2457 /* If a conditional group has only one branch, there is a second, implied,
2458 empty branch, so just skip over the conditional, because it could be empty.
2459 Otherwise, scan the individual branches of the group. */
2460
2461 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2462 code += GET(code, 1);
2463 else
2464 {
2465 empty_branch = FALSE;
2466 do
2467 {
2468 if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd, NULL))
2469 empty_branch = TRUE;
2470 code += GET(code, 1);
2471 }
2472 while (*code == OP_ALT);
2473 if (!empty_branch) return FALSE; /* All branches are non-empty */
2474 }
2475
2476 c = *code;
2477 continue;
2478 }
2479
2480 /* Handle the other opcodes */
2481
2482 switch (c)
2483 {
2484 /* Check for quantifiers after a class. XCLASS is used for classes that
2485 cannot be represented just by a bit map. This includes negated single
2486 high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2487 actual length is stored in the compiled code, so we must update "code"
2488 here. */
2489
2490 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2491 case OP_XCLASS:
2492 ccode = code += GET(code, 1);
2493 goto CHECK_CLASS_REPEAT;
2494 #endif
2495
2496 case OP_CLASS:
2497 case OP_NCLASS:
2498 ccode = code + PRIV(OP_lengths)[OP_CLASS];
2499
2500 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2501 CHECK_CLASS_REPEAT:
2502 #endif
2503
2504 switch (*ccode)
2505 {
2506 case OP_CRSTAR: /* These could be empty; continue */
2507 case OP_CRMINSTAR:
2508 case OP_CRQUERY:
2509 case OP_CRMINQUERY:
2510 case OP_CRPOSSTAR:
2511 case OP_CRPOSQUERY:
2512 break;
2513
2514 default: /* Non-repeat => class must match */
2515 case OP_CRPLUS: /* These repeats aren't empty */
2516 case OP_CRMINPLUS:
2517 case OP_CRPOSPLUS:
2518 return FALSE;
2519
2520 case OP_CRRANGE:
2521 case OP_CRMINRANGE:
2522 case OP_CRPOSRANGE:
2523 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2524 break;
2525 }
2526 break;
2527
2528 /* Opcodes that must match a character */
2529
2530 case OP_ANY:
2531 case OP_ALLANY:
2532 case OP_ANYBYTE:
2533
2534 case OP_PROP:
2535 case OP_NOTPROP:
2536 case OP_ANYNL:
2537
2538 case OP_NOT_HSPACE:
2539 case OP_HSPACE:
2540 case OP_NOT_VSPACE:
2541 case OP_VSPACE:
2542 case OP_EXTUNI:
2543
2544 case OP_NOT_DIGIT:
2545 case OP_DIGIT:
2546 case OP_NOT_WHITESPACE:
2547 case OP_WHITESPACE:
2548 case OP_NOT_WORDCHAR:
2549 case OP_WORDCHAR:
2550
2551 case OP_CHAR:
2552 case OP_CHARI:
2553 case OP_NOT:
2554 case OP_NOTI:
2555
2556 case OP_PLUS:
2557 case OP_PLUSI:
2558 case OP_MINPLUS:
2559 case OP_MINPLUSI:
2560
2561 case OP_NOTPLUS:
2562 case OP_NOTPLUSI:
2563 case OP_NOTMINPLUS:
2564 case OP_NOTMINPLUSI:
2565
2566 case OP_POSPLUS:
2567 case OP_POSPLUSI:
2568 case OP_NOTPOSPLUS:
2569 case OP_NOTPOSPLUSI:
2570
2571 case OP_EXACT:
2572 case OP_EXACTI:
2573 case OP_NOTEXACT:
2574 case OP_NOTEXACTI:
2575
2576 case OP_TYPEPLUS:
2577 case OP_TYPEMINPLUS:
2578 case OP_TYPEPOSPLUS:
2579 case OP_TYPEEXACT:
2580
2581 return FALSE;
2582
2583 /* These are going to continue, as they may be empty, but we have to
2584 fudge the length for the \p and \P cases. */
2585
2586 case OP_TYPESTAR:
2587 case OP_TYPEMINSTAR:
2588 case OP_TYPEPOSSTAR:
2589 case OP_TYPEQUERY:
2590 case OP_TYPEMINQUERY:
2591 case OP_TYPEPOSQUERY:
2592 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2593 break;
2594
2595 /* Same for these */
2596
2597 case OP_TYPEUPTO:
2598 case OP_TYPEMINUPTO:
2599 case OP_TYPEPOSUPTO:
2600 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2601 code += 2;
2602 break;
2603
2604 /* End of branch */
2605
2606 case OP_KET:
2607 case OP_KETRMAX:
2608 case OP_KETRMIN:
2609 case OP_KETRPOS:
2610 case OP_ALT:
2611 return TRUE;
2612
2613 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2614 MINUPTO, and POSUPTO and their caseless and negative versions may be
2615 followed by a multibyte character. */
2616
2617 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2618 case OP_STAR:
2619 case OP_STARI:
2620 case OP_NOTSTAR:
2621 case OP_NOTSTARI:
2622
2623 case OP_MINSTAR:
2624 case OP_MINSTARI:
2625 case OP_NOTMINSTAR:
2626 case OP_NOTMINSTARI:
2627
2628 case OP_POSSTAR:
2629 case OP_POSSTARI:
2630 case OP_NOTPOSSTAR:
2631 case OP_NOTPOSSTARI:
2632
2633 case OP_QUERY:
2634 case OP_QUERYI:
2635 case OP_NOTQUERY:
2636 case OP_NOTQUERYI:
2637
2638 case OP_MINQUERY:
2639 case OP_MINQUERYI:
2640 case OP_NOTMINQUERY:
2641 case OP_NOTMINQUERYI:
2642
2643 case OP_POSQUERY:
2644 case OP_POSQUERYI:
2645 case OP_NOTPOSQUERY:
2646 case OP_NOTPOSQUERYI:
2647
2648 if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2649 break;
2650
2651 case OP_UPTO:
2652 case OP_UPTOI:
2653 case OP_NOTUPTO:
2654 case OP_NOTUPTOI:
2655
2656 case OP_MINUPTO:
2657 case OP_MINUPTOI:
2658 case OP_NOTMINUPTO:
2659 case OP_NOTMINUPTOI:
2660
2661 case OP_POSUPTO:
2662 case OP_POSUPTOI:
2663 case OP_NOTPOSUPTO:
2664 case OP_NOTPOSUPTOI:
2665
2666 if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2667 break;
2668 #endif
2669
2670 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2671 string. */
2672
2673 case OP_MARK:
2674 case OP_PRUNE_ARG:
2675 case OP_SKIP_ARG:
2676 case OP_THEN_ARG:
2677 code += code[1];
2678 break;
2679
2680 /* None of the remaining opcodes are required to match a character. */
2681
2682 default:
2683 break;
2684 }
2685 }
2686
2687 return TRUE;
2688 }
2689
2690
2691
2692 /*************************************************
2693 * Scan compiled regex for non-emptiness *
2694 *************************************************/
2695
2696 /* This function is called to check for left recursive calls. We want to check
2697 the current branch of the current pattern to see if it could match the empty
2698 string. If it could, we must look outwards for branches at other levels,
2699 stopping when we pass beyond the bracket which is the subject of the recursion.
2700 This function is called only during the real compile, not during the
2701 pre-compile.
2702
2703 Arguments:
2704 code points to start of the recursion
2705 endcode points to where to stop (current RECURSE item)
2706 bcptr points to the chain of current (unclosed) branch starts
2707 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2708 cd pointers to tables etc
2709
2710 Returns: TRUE if what is matched could be empty
2711 */
2712
2713 static BOOL
2714 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2715 branch_chain *bcptr, BOOL utf, compile_data *cd)
2716 {
2717 while (bcptr != NULL && bcptr->current_branch >= code)
2718 {
2719 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL))
2720 return FALSE;
2721 bcptr = bcptr->outer;
2722 }
2723 return TRUE;
2724 }
2725
2726
2727
2728 /*************************************************
2729 * Base opcode of repeated opcodes *
2730 *************************************************/
2731
2732 /* Returns the base opcode for repeated single character type opcodes. If the
2733 opcode is not a repeated character type, it returns with the original value.
2734
2735 Arguments: c opcode
2736 Returns: base opcode for the type
2737 */
2738
2739 static pcre_uchar
2740 get_repeat_base(pcre_uchar c)
2741 {
2742 return (c > OP_TYPEPOSUPTO)? c :
2743 (c >= OP_TYPESTAR)? OP_TYPESTAR :
2744 (c >= OP_NOTSTARI)? OP_NOTSTARI :
2745 (c >= OP_NOTSTAR)? OP_NOTSTAR :
2746 (c >= OP_STARI)? OP_STARI :
2747 OP_STAR;
2748 }
2749
2750
2751
2752 #ifdef SUPPORT_UCP
2753 /*************************************************
2754 * Check a character and a property *
2755 *************************************************/
2756
2757 /* This function is called by check_auto_possessive() when a property item
2758 is adjacent to a fixed character.
2759
2760 Arguments:
2761 c the character
2762 ptype the property type
2763 pdata the data for the type
2764 negated TRUE if it's a negated property (\P or \p{^)
2765
2766 Returns: TRUE if auto-possessifying is OK
2767 */
2768
2769 static BOOL
2770 check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
2771 BOOL negated)
2772 {
2773 const pcre_uint32 *p;
2774 const ucd_record *prop = GET_UCD(c);
2775
2776 switch(ptype)
2777 {
2778 case PT_LAMP:
2779 return (prop->chartype == ucp_Lu ||
2780 prop->chartype == ucp_Ll ||
2781 prop->chartype == ucp_Lt) == negated;
2782
2783 case PT_GC:
2784 return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2785
2786 case PT_PC:
2787 return (pdata == prop->chartype) == negated;
2788
2789 case PT_SC:
2790 return (pdata == prop->script) == negated;
2791
2792 /* These are specials */
2793
2794 case PT_ALNUM:
2795 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2796 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2797
2798 /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
2799 means that Perl space and POSIX space are now identical. PCRE was changed
2800 at release 8.34. */
2801
2802 case PT_SPACE: /* Perl space */
2803 case PT_PXSPACE: /* POSIX space */
2804 switch(c)
2805 {
2806 HSPACE_CASES:
2807 VSPACE_CASES:
2808 return negated;
2809
2810 default:
2811 return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
2812 }
2813 break; /* Control never reaches here */
2814
2815 case PT_WORD:
2816 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2817 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2818 c == CHAR_UNDERSCORE) == negated;
2819
2820 case PT_CLIST:
2821 p = PRIV(ucd_caseless_sets) + prop->caseset;
2822 for (;;)
2823 {
2824 if (c < *p) return !negated;
2825 if (c == *p++) return negated;
2826 }
2827 break; /* Control never reaches here */
2828 }
2829
2830 return FALSE;
2831 }
2832 #endif /* SUPPORT_UCP */
2833
2834
2835
2836 /*************************************************
2837 * Fill the character property list *
2838 *************************************************/
2839
2840 /* Checks whether the code points to an opcode that can take part in auto-
2841 possessification, and if so, fills a list with its properties.
2842
2843 Arguments:
2844 code points to start of expression
2845 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2846 fcc points to case-flipping table
2847 list points to output list
2848 list[0] will be filled with the opcode
2849 list[1] will be non-zero if this opcode
2850 can match an empty character string
2851 list[2..7] depends on the opcode
2852
2853 Returns: points to the start of the next opcode if *code is accepted
2854 NULL if *code is not accepted
2855 */
2856
2857 static const pcre_uchar *
2858 get_chr_property_list(const pcre_uchar *code, BOOL utf,
2859 const pcre_uint8 *fcc, pcre_uint32 *list)
2860 {
2861 pcre_uchar c = *code;
2862 pcre_uchar base;
2863 const pcre_uchar *end;
2864 pcre_uint32 chr;
2865
2866 #ifdef SUPPORT_UCP
2867 pcre_uint32 *clist_dest;
2868 const pcre_uint32 *clist_src;
2869 #else
2870 utf = utf; /* Suppress "unused parameter" compiler warning */
2871 #endif
2872
2873 list[0] = c;
2874 list[1] = FALSE;
2875 code++;
2876
2877 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
2878 {
2879 base = get_repeat_base(c);
2880 c -= (base - OP_STAR);
2881
2882 if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
2883 code += IMM2_SIZE;
2884
2885 list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
2886
2887 switch(base)
2888 {
2889 case OP_STAR:
2890 list[0] = OP_CHAR;
2891 break;
2892
2893 case OP_STARI:
2894 list[0] = OP_CHARI;
2895 break;
2896
2897 case OP_NOTSTAR:
2898 list[0] = OP_NOT;
2899 break;
2900
2901 case OP_NOTSTARI:
2902 list[0] = OP_NOTI;
2903 break;
2904
2905 case OP_TYPESTAR:
2906 list[0] = *code;
2907 code++;
2908 break;
2909 }
2910 c = list[0];
2911 }
2912
2913 switch(c)
2914 {
2915 case OP_NOT_DIGIT:
2916 case OP_DIGIT:
2917 case OP_NOT_WHITESPACE:
2918 case OP_WHITESPACE:
2919 case OP_NOT_WORDCHAR:
2920 case OP_WORDCHAR:
2921 case OP_ANY:
2922 case OP_ALLANY:
2923 case OP_ANYNL:
2924 case OP_NOT_HSPACE:
2925 case OP_HSPACE:
2926 case OP_NOT_VSPACE:
2927 case OP_VSPACE:
2928 case OP_EXTUNI:
2929 case OP_EODN:
2930 case OP_EOD:
2931 case OP_DOLL:
2932 case OP_DOLLM:
2933 return code;
2934
2935 case OP_CHAR:
2936 case OP_NOT:
2937 GETCHARINCTEST(chr, code);
2938 list[2] = chr;
2939 list[3] = NOTACHAR;
2940 return code;
2941
2942 case OP_CHARI:
2943 case OP_NOTI:
2944 list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
2945 GETCHARINCTEST(chr, code);
2946 list[2] = chr;
2947
2948 #ifdef SUPPORT_UCP
2949 if (chr < 128 || (chr < 256 && !utf))
2950 list[3] = fcc[chr];
2951 else
2952 list[3] = UCD_OTHERCASE(chr);
2953 #elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
2954 list[3] = (chr < 256) ? fcc[chr] : chr;
2955 #else
2956 list[3] = fcc[chr];
2957 #endif
2958
2959 /* The othercase might be the same value. */
2960
2961 if (chr == list[3])
2962 list[3] = NOTACHAR;
2963 else
2964 list[4] = NOTACHAR;
2965 return code;
2966
2967 #ifdef SUPPORT_UCP
2968 case OP_PROP:
2969 case OP_NOTPROP:
2970 if (code[0] != PT_CLIST)
2971 {
2972 list[2] = code[0];
2973 list[3] = code[1];
2974 return code + 2;
2975 }
2976
2977 /* Convert only if we have enough space. */
2978
2979 clist_src = PRIV(ucd_caseless_sets) + code[1];
2980 clist_dest = list + 2;
2981 code += 2;
2982
2983 do {
2984 if (clist_dest >= list + 8)
2985 {
2986 /* Early return if there is not enough space. This should never
2987 happen, since all clists are shorter than 5 character now. */
2988 list[2] = code[0];
2989 list[3] = code[1];
2990 return code;
2991 }
2992 *clist_dest++ = *clist_src;
2993 }
2994 while(*clist_src++ != NOTACHAR);
2995
2996 /* All characters are stored. The terminating NOTACHAR
2997 is copied form the clist itself. */
2998
2999 list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
3000 return code;
3001 #endif
3002
3003 case OP_NCLASS:
3004 case OP_CLASS:
3005 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3006 case OP_XCLASS:
3007 if (c == OP_XCLASS)
3008 end = code + GET(code, 0) - 1;
3009 else
3010 #endif
3011 end = code + 32 / sizeof(pcre_uchar);
3012
3013 switch(*end)
3014 {
3015 case OP_CRSTAR:
3016 case OP_CRMINSTAR:
3017 case OP_CRQUERY:
3018 case OP_CRMINQUERY:
3019 case OP_CRPOSSTAR:
3020 case OP_CRPOSQUERY:
3021 list[1] = TRUE;
3022 end++;
3023 break;
3024
3025 case OP_CRPLUS:
3026 case OP_CRMINPLUS:
3027 case OP_CRPOSPLUS:
3028 end++;
3029 break;
3030
3031 case OP_CRRANGE:
3032 case OP_CRMINRANGE:
3033 case OP_CRPOSRANGE:
3034 list[1] = (GET2(end, 1) == 0);
3035 end += 1 + 2 * IMM2_SIZE;
3036 break;
3037 }
3038 list[2] = (pcre_uint32)(end - code);
3039 return end;
3040 }
3041 return NULL; /* Opcode not accepted */
3042 }
3043
3044
3045
3046 /*************************************************
3047 * Scan further character sets for match *
3048 *************************************************/
3049
3050 /* Checks whether the base and the current opcode have a common character, in
3051 which case the base cannot be possessified.
3052
3053 Arguments:
3054 code points to the byte code
3055 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3056 cd static compile data
3057 base_list the data list of the base opcode
3058
3059 Returns: TRUE if the auto-possessification is possible
3060 */
3061
3062 static BOOL
3063 compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
3064 const pcre_uint32 *base_list, const pcre_uchar *base_end)
3065 {
3066 pcre_uchar c;
3067 pcre_uint32 list[8];
3068 const pcre_uint32 *chr_ptr;
3069 const pcre_uint32 *ochr_ptr;
3070 const pcre_uint32 *list_ptr;
3071 const pcre_uchar *next_code;
3072 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3073 const pcre_uchar *xclass_flags;
3074 #endif
3075 const pcre_uint8 *class_bitset;
3076 const pcre_uint8 *set1, *set2, *set_end;
3077 pcre_uint32 chr;
3078 BOOL accepted, invert_bits;
3079 BOOL entered_a_group = FALSE;
3080
3081 /* Note: the base_list[1] contains whether the current opcode has greedy
3082 (represented by a non-zero value) quantifier. This is a different from
3083 other character type lists, which stores here that the character iterator
3084 matches to an empty string (also represented by a non-zero value). */
3085
3086 for(;;)
3087 {
3088 /* All operations move the code pointer forward.
3089 Therefore infinite recursions are not possible. */
3090
3091 c = *code;
3092
3093 /* Skip over callouts */
3094
3095 if (c == OP_CALLOUT)
3096 {
3097 code += PRIV(OP_lengths)[c];
3098 continue;
3099 }
3100
3101 if (c == OP_ALT)
3102 {
3103 do code += GET(code, 1); while (*code == OP_ALT);
3104 c = *code;
3105 }
3106
3107 switch(c)
3108 {
3109 case OP_END:
3110 case OP_KETRPOS:
3111 /* TRUE only in greedy case. The non-greedy case could be replaced by
3112 an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
3113 uses more memory, which we cannot get at this stage.) */
3114
3115 return base_list[1] != 0;
3116
3117 case OP_KET:
3118 /* If the bracket is capturing, and referenced by an OP_RECURSE, or
3119 it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3120 cannot be converted to a possessive form. */
3121
3122 if (base_list[1] == 0) return FALSE;
3123
3124 switch(*(code - GET(code, 1)))
3125 {
3126 case OP_ASSERT:
3127 case OP_ASSERT_NOT:
3128 case OP_ASSERTBACK:
3129 case OP_ASSERTBACK_NOT:
3130 case OP_ONCE:
3131 case OP_ONCE_NC:
3132 /* Atomic sub-patterns and assertions can always auto-possessify their
3133 last iterator. However, if the group was entered as a result of checking
3134 a previous iterator, this is not possible. */
3135
3136 return !entered_a_group;
3137 }
3138
3139 code += PRIV(OP_lengths)[c];
3140 continue;
3141
3142 case OP_ONCE:
3143 case OP_ONCE_NC:
3144 case OP_BRA:
3145 case OP_CBRA:
3146 next_code = code + GET(code, 1);
3147 code += PRIV(OP_lengths)[c];
3148
3149 while (*next_code == OP_ALT)
3150 {
3151 if (!compare_opcodes(code, utf, cd, base_list, base_end)) return FALSE;
3152 code = next_code + 1 + LINK_SIZE;
3153 next_code += GET(next_code, 1);
3154 }
3155
3156 entered_a_group = TRUE;
3157 continue;
3158
3159 case OP_BRAZERO:
3160 case OP_BRAMINZERO:
3161
3162 next_code = code + 1;
3163 if (*next_code != OP_BRA && *next_code != OP_CBRA
3164 && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
3165
3166 do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3167
3168 /* The bracket content will be checked by the
3169 OP_BRA/OP_CBRA case above. */
3170 next_code += 1 + LINK_SIZE;
3171 if (!compare_opcodes(next_code, utf, cd, base_list, base_end))
3172 return FALSE;
3173
3174 code += PRIV(OP_lengths)[c];
3175 continue;
3176
3177 default:
3178 break;
3179 }
3180
3181 /* Check for a supported opcode, and load its properties. */
3182
3183 code = get_chr_property_list(code, utf, cd->fcc, list);
3184 if (code == NULL) return FALSE; /* Unsupported */
3185
3186 /* If either opcode is a small character list, set pointers for comparing
3187 characters from that list with another list, or with a property. */
3188
3189 if (base_list[0] == OP_CHAR)
3190 {
3191 chr_ptr = base_list + 2;
3192 list_ptr = list;
3193 }
3194 else if (list[0] == OP_CHAR)
3195 {
3196 chr_ptr = list + 2;
3197 list_ptr = base_list;
3198 }
3199
3200 /* Character bitsets can also be compared to certain opcodes. */
3201
3202 else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
3203 #ifdef COMPILE_PCRE8
3204 /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
3205 || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
3206 #endif
3207 )
3208 {
3209 #ifdef COMPILE_PCRE8
3210 if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
3211 #else
3212 if (base_list[0] == OP_CLASS)
3213 #endif
3214 {
3215 set1 = (pcre_uint8 *)(base_end - base_list[2]);
3216 list_ptr = list;
3217 }
3218 else
3219 {
3220 set1 = (pcre_uint8 *)(code - list[2]);
3221 list_ptr = base_list;
3222 }
3223
3224 invert_bits = FALSE;
3225 switch(list_ptr[0])
3226 {
3227 case OP_CLASS:
3228 case OP_NCLASS:
3229 set2 = (pcre_uint8 *)
3230 ((list_ptr == list ? code : base_end) - list_ptr[2]);
3231 break;
3232
3233 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3234 case OP_XCLASS:
3235 xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE;
3236 if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
3237 if ((*xclass_flags & XCL_MAP) == 0)
3238 {
3239 /* No bits are set for characters < 256. */
3240 if (list[1] == 0) return TRUE;
3241 /* Might be an empty repeat. */
3242 continue;
3243 }
3244 set2 = (pcre_uint8 *)(xclass_flags + 1);
3245 break;
3246 #endif
3247
3248 case OP_NOT_DIGIT:
3249 invert_bits = TRUE;
3250 /* Fall through */
3251 case OP_DIGIT:
3252 set2 = (pcre_uint8 *)(cd->cbits + cbit_digit);
3253 break;
3254
3255 case OP_NOT_WHITESPACE:
3256 invert_bits = TRUE;
3257 /* Fall through */
3258 case OP_WHITESPACE:
3259 set2 = (pcre_uint8 *)(cd->cbits + cbit_space);
3260 break;
3261
3262 case OP_NOT_WORDCHAR:
3263 invert_bits = TRUE;
3264 /* Fall through */
3265 case OP_WORDCHAR:
3266 set2 = (pcre_uint8 *)(cd->cbits + cbit_word);
3267 break;
3268
3269 default:
3270 return FALSE;
3271 }
3272
3273 /* Because the sets are unaligned, we need
3274 to perform byte comparison here. */
3275 set_end = set1 + 32;
3276 if (invert_bits)
3277 {
3278 do
3279 {
3280 if ((*set1++ & ~(*set2++)) != 0) return FALSE;
3281 }
3282 while (set1 < set_end);
3283 }
3284 else
3285 {
3286 do
3287 {
3288 if ((*set1++ & *set2++) != 0) return FALSE;
3289 }
3290 while (set1 < set_end);
3291 }
3292
3293 if (list[1] == 0) return TRUE;
3294 /* Might be an empty repeat. */
3295 continue;
3296 }
3297
3298 /* Some property combinations also acceptable. Unicode property opcodes are
3299 processed specially; the rest can be handled with a lookup table. */
3300
3301 else
3302 {
3303 pcre_uint32 leftop, rightop;
3304
3305 leftop = base_list[0];
3306 rightop = list[0];
3307
3308 #ifdef SUPPORT_UCP
3309 accepted = FALSE; /* Always set in non-unicode case. */
3310 if (leftop == OP_PROP || leftop == OP_NOTPROP)
3311 {
3312 if (rightop == OP_EOD)
3313 accepted = TRUE;
3314 else if (rightop == OP_PROP || rightop == OP_NOTPROP)
3315 {
3316 int n;
3317 const pcre_uint8 *p;
3318 BOOL same = leftop == rightop;
3319 BOOL lisprop = leftop == OP_PROP;
3320 BOOL risprop = rightop == OP_PROP;
3321 BOOL bothprop = lisprop && risprop;
3322
3323 /* There's a table that specifies how each combination is to be
3324 processed:
3325 0 Always return FALSE (never auto-possessify)
3326 1 Character groups are distinct (possessify if both are OP_PROP)
3327 2 Check character categories in the same group (general or particular)
3328 3 Return TRUE if the two opcodes are not the same
3329 ... see comments below
3330 */
3331
3332 n = propposstab[base_list[2]][list[2]];
3333 switch(n)
3334 {
3335 case 0: break;
3336 case 1: accepted = bothprop; break;
3337 case 2: accepted = (base_list[3] == list[3]) != same; break;
3338 case 3: accepted = !same; break;
3339
3340 case 4: /* Left general category, right particular category */
3341 accepted = risprop && catposstab[base_list[3]][list[3]] == same;
3342 break;
3343
3344 case 5: /* Right general category, left particular category */
3345 accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
3346 break;
3347
3348 /* This code is logically tricky. Think hard before fiddling with it.
3349 The posspropstab table has four entries per row. Each row relates to
3350 one of PCRE's special properties such as ALNUM or SPACE or WORD.
3351 Only WORD actually needs all four entries, but using repeats for the
3352 others means they can all use the same code below.
3353
3354 The first two entries in each row are Unicode general categories, and
3355 apply always, because all the characters they include are part of the
3356 PCRE character set. The third and fourth entries are a general and a
3357 particular category, respectively, that include one or more relevant
3358 characters. One or the other is used, depending on whether the check
3359 is for a general or a particular category. However, in both cases the
3360 category contains more characters than the specials that are defined
3361 for the property being tested against. Therefore, it cannot be used
3362 in a NOTPROP case.
3363
3364 Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3365 Underscore is covered by ucp_P or ucp_Po. */
3366
3367 case 6: /* Left alphanum vs right general category */
3368 case 7: /* Left space vs right general category */
3369 case 8: /* Left word vs right general category */
3370 p = posspropstab[n-6];
3371 accepted = risprop && lisprop ==
3372 (list[3] != p[0] &&
3373 list[3] != p[1] &&
3374 (list[3] != p[2] || !lisprop));
3375 break;
3376
3377 case 9: /* Right alphanum vs left general category */
3378 case 10: /* Right space vs left general category */
3379 case 11: /* Right word vs left general category */
3380 p = posspropstab[n-9];
3381 accepted = lisprop && risprop ==
3382 (base_list[3] != p[0] &&
3383 base_list[3] != p[1] &&
3384 (base_list[3] != p[2] || !risprop));
3385 break;
3386
3387 case 12: /* Left alphanum vs right particular category */
3388 case 13: /* Left space vs right particular category */
3389 case 14: /* Left word vs right particular category */
3390 p = posspropstab[n-12];
3391 accepted = risprop && lisprop ==
3392 (catposstab[p[0]][list[3]] &&
3393 catposstab[p[1]][list[3]] &&
3394 (list[3] != p[3] || !lisprop));
3395 break;
3396
3397 case 15: /* Right alphanum vs left particular category */
3398 case 16: /* Right space vs left particular category */
3399 case 17: /* Right word vs left particular category */
3400 p = posspropstab[n-15];
3401 accepted = lisprop && risprop ==
3402 (catposstab[p[0]][base_list[3]] &&
3403 catposstab[p[1]][base_list[3]] &&
3404 (base_list[3] != p[3] || !risprop));
3405 break;
3406 }
3407 }
3408 }
3409
3410 else
3411 #endif /* SUPPORT_UCP */
3412
3413 accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3414 rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3415 autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3416
3417 if (!accepted) return FALSE;
3418
3419 if (list[1] == 0) return TRUE;
3420 /* Might be an empty repeat. */
3421 continue;
3422 }
3423
3424 /* Control reaches here only if one of the items is a small character list.
3425 All characters are checked against the other side. */
3426
3427 do
3428 {
3429 chr = *chr_ptr;
3430
3431 switch(list_ptr[0])
3432 {
3433 case OP_CHAR:
3434 ochr_ptr = list_ptr + 2;
3435 do
3436 {
3437 if (chr == *ochr_ptr) return FALSE;
3438 ochr_ptr++;
3439 }
3440 while(*ochr_ptr != NOTACHAR);
3441 break;
3442
3443 case OP_NOT:
3444 ochr_ptr = list_ptr + 2;
3445 do
3446 {
3447 if (chr == *ochr_ptr)
3448 break;
3449 ochr_ptr++;
3450 }
3451 while(*ochr_ptr != NOTACHAR);
3452 if (*ochr_ptr == NOTACHAR) return FALSE; /* Not found */
3453 break;
3454
3455 /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
3456 set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3457
3458 case OP_DIGIT:
3459 if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
3460 break;
3461
3462 case OP_NOT_DIGIT:
3463 if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
3464 break;
3465
3466 case OP_WHITESPACE:
3467 if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
3468 break;
3469
3470 case OP_NOT_WHITESPACE:
3471 if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
3472 break;
3473
3474 case OP_WORDCHAR:
3475 if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
3476 break;
3477
3478 case OP_NOT_WORDCHAR:
3479 if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
3480 break;
3481
3482 case OP_HSPACE:
3483 switch(chr)
3484 {
3485 HSPACE_CASES: return FALSE;
3486 default: break;
3487 }
3488 break;
3489
3490 case OP_NOT_HSPACE:
3491 switch(chr)
3492 {
3493 HSPACE_CASES: break;
3494 default: return FALSE;
3495 }
3496 break;
3497
3498 case OP_ANYNL:
3499 case OP_VSPACE:
3500 switch(chr)
3501 {
3502 VSPACE_CASES: return FALSE;
3503 default: break;
3504 }
3505 break;
3506
3507 case OP_NOT_VSPACE:
3508 switch(chr)
3509 {
3510 VSPACE_CASES: break;
3511 default: return FALSE;
3512 }
3513 break;
3514
3515 case OP_DOLL:
3516 case OP_EODN:
3517 switch (chr)
3518 {
3519 case CHAR_CR:
3520 case CHAR_LF:
3521 case CHAR_VT:
3522 case CHAR_FF:
3523 case CHAR_NEL:
3524 #ifndef EBCDIC
3525 case 0x2028:
3526 case 0x2029:
3527 #endif /* Not EBCDIC */
3528 return FALSE;
3529 }
3530 break;
3531
3532 case OP_EOD: /* Can always possessify before \z */
3533 break;
3534
3535 #ifdef SUPPORT_UCP
3536 case OP_PROP:
3537 case OP_NOTPROP:
3538 if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3539 list_ptr[0] == OP_NOTPROP))
3540 return FALSE;
3541 break;
3542 #endif
3543
3544 case OP_NCLASS:
3545 if (chr > 255) return FALSE;
3546 /* Fall through */
3547
3548 case OP_CLASS:
3549 if (chr > 255) break;
3550 class_bitset = (pcre_uint8 *)
3551 ((list_ptr == list ? code : base_end) - list_ptr[2]);
3552 if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
3553 break;
3554
3555 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3556 case OP_XCLASS:
3557 if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
3558 list_ptr[2] + LINK_SIZE, utf)) return FALSE;
3559 break;
3560 #endif
3561
3562 default:
3563 return FALSE;
3564 }
3565
3566 chr_ptr++;
3567 }
3568 while(*chr_ptr != NOTACHAR);
3569
3570 /* At least one character must be matched from this opcode. */
3571
3572 if (list[1] == 0) return TRUE;
3573 }
3574
3575 /* Control never reaches here. There used to be a fail-save return FALSE; here,
3576 but some compilers complain about an unreachable statement. */
3577
3578 }
3579
3580
3581
3582 /*************************************************
3583 * Scan compiled regex for auto-possession *
3584 *************************************************/
3585
3586 /* Replaces single character iterations with their possessive alternatives
3587 if appropriate. This function modifies the compiled opcode!
3588
3589 Arguments:
3590 code points to start of the byte code
3591 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3592 cd static compile data
3593
3594 Returns: nothing
3595 */
3596
3597 static void
3598 auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
3599 {
3600 register pcre_uchar c;
3601 const pcre_uchar *end;
3602 pcre_uchar *repeat_opcode;
3603 pcre_uint32 list[8];
3604
3605 for (;;)
3606 {
3607 c = *code;
3608
3609 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
3610 {
3611 c -= get_repeat_base(c) - OP_STAR;
3612 end = (c <= OP_MINUPTO) ?
3613 get_chr_property_list(code, utf, cd->fcc, list) : NULL;
3614 list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3615
3616 if (end != NULL && compare_opcodes(end, utf, cd, list, end))
3617 {
3618 switch(c)
3619 {
3620 case OP_STAR:
3621 *code += OP_POSSTAR - OP_STAR;
3622 break;
3623
3624 case OP_MINSTAR:
3625 *code += OP_POSSTAR - OP_MINSTAR;
3626 break;
3627
3628 case OP_PLUS:
3629 *code += OP_POSPLUS - OP_PLUS;
3630 break;
3631
3632 case OP_MINPLUS:
3633 *code += OP_POSPLUS - OP_MINPLUS;
3634 break;
3635
3636 case OP_QUERY:
3637 *code += OP_POSQUERY - OP_QUERY;
3638 break;
3639
3640 case OP_MINQUERY:
3641 *code += OP_POSQUERY - OP_MINQUERY;
3642 break;
3643
3644 case OP_UPTO:
3645 *code += OP_POSUPTO - OP_UPTO;
3646 break;
3647
3648 case OP_MINUPTO:
3649 *code += OP_POSUPTO - OP_MINUPTO;
3650 break;
3651 }
3652 }
3653 c = *code;
3654 }
3655 else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
3656 {
3657 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3658 if (c == OP_XCLASS)
3659 repeat_opcode = code + GET(code, 1);
3660 else
3661 #endif
3662 repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar));
3663
3664 c = *repeat_opcode;
3665 if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
3666 {
3667 /* end must not be NULL. */
3668 end = get_chr_property_list(code, utf, cd->fcc, list);
3669
3670 list[1] = (c & 1) == 0;
3671
3672 if (compare_opcodes(end, utf, cd, list, end))
3673 {
3674 switch (c)
3675 {
3676 case OP_CRSTAR:
3677 case OP_CRMINSTAR:
3678 *repeat_opcode = OP_CRPOSSTAR;
3679 break;
3680
3681 case OP_CRPLUS:
3682 case OP_CRMINPLUS:
3683 *repeat_opcode = OP_CRPOSPLUS;
3684 break;
3685
3686 case OP_CRQUERY:
3687 case OP_CRMINQUERY:
3688 *repeat_opcode = OP_CRPOSQUERY;
3689 break;
3690
3691 case OP_CRRANGE:
3692 case OP_CRMINRANGE:
3693 *repeat_opcode = OP_CRPOSRANGE;
3694 break;
3695 }
3696 }
3697 }
3698 c = *code;
3699 }
3700
3701 switch(c)
3702 {
3703 case OP_END:
3704 return;
3705
3706 case OP_TYPESTAR:
3707 case OP_TYPEMINSTAR:
3708 case OP_TYPEPLUS:
3709 case OP_TYPEMINPLUS:
3710 case OP_TYPEQUERY:
3711 case OP_TYPEMINQUERY:
3712 case OP_TYPEPOSSTAR:
3713 case OP_TYPEPOSPLUS:
3714 case OP_TYPEPOSQUERY:
3715 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
3716 break;
3717
3718 case OP_TYPEUPTO:
3719 case OP_TYPEMINUPTO:
3720 case OP_TYPEEXACT:
3721 case OP_TYPEPOSUPTO:
3722 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
3723 code += 2;
3724 break;
3725
3726 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3727 case OP_XCLASS:
3728 code += GET(code, 1);
3729 break;
3730 #endif
3731
3732 case OP_MARK:
3733 case OP_PRUNE_ARG:
3734 case OP_SKIP_ARG:
3735 case OP_THEN_ARG:
3736 code += code[1];
3737 break;
3738 }
3739
3740 /* Add in the fixed length from the table */
3741
3742 code += PRIV(OP_lengths)[c];
3743
3744 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
3745 a multi-byte character. The length in the table is a minimum, so we have to
3746 arrange to skip the extra bytes. */
3747
3748 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
3749 if (utf) switch(c)
3750 {
3751 case OP_CHAR:
3752 case OP_CHARI:
3753 case OP_NOT:
3754 case OP_NOTI:
3755 case OP_STAR:
3756 case OP_MINSTAR:
3757 case OP_PLUS:
3758 case OP_MINPLUS:
3759 case OP_QUERY:
3760 case OP_MINQUERY:
3761 case OP_UPTO:
3762 case OP_MINUPTO:
3763 case OP_EXACT:
3764 case OP_POSSTAR:
3765 case OP_POSPLUS:
3766 case OP_POSQUERY:
3767 case OP_POSUPTO:
3768 case OP_STARI:
3769 case OP_MINSTARI:
3770 case OP_PLUSI:
3771 case OP_MINPLUSI:
3772 case OP_QUERYI:
3773 case OP_MINQUERYI:
3774 case OP_UPTOI:
3775 case OP_MINUPTOI:
3776 case OP_EXACTI:
3777 case OP_POSSTARI:
3778 case OP_POSPLUSI:
3779 case OP_POSQUERYI:
3780 case OP_POSUPTOI:
3781 case OP_NOTSTAR:
3782 case OP_NOTMINSTAR:
3783 case OP_NOTPLUS:
3784 case OP_NOTMINPLUS:
3785 case OP_NOTQUERY:
3786 case OP_NOTMINQUERY:
3787 case OP_NOTUPTO:
3788 case OP_NOTMINUPTO:
3789 case OP_NOTEXACT:
3790 case OP_NOTPOSSTAR:
3791 case OP_NOTPOSPLUS:
3792 case OP_NOTPOSQUERY:
3793 case OP_NOTPOSUPTO:
3794 case OP_NOTSTARI:
3795 case OP_NOTMINSTARI:
3796 case OP_NOTPLUSI:
3797 case OP_NOTMINPLUSI:
3798 case OP_NOTQUERYI:
3799 case OP_NOTMINQUERYI:
3800 case OP_NOTUPTOI:
3801 case OP_NOTMINUPTOI:
3802 case OP_NOTEXACTI:
3803 case OP_NOTPOSSTARI:
3804 case OP_NOTPOSPLUSI:
3805 case OP_NOTPOSQUERYI:
3806 case OP_NOTPOSUPTOI:
3807 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
3808 break;
3809 }
3810 #else
3811 (void)(utf); /* Keep compiler happy by referencing function argument */
3812 #endif
3813 }
3814 }
3815
3816
3817
3818 /*************************************************
3819 * Check for POSIX class syntax *
3820 *************************************************/
3821
3822 /* This function is called when the sequence "[:" or "[." or "[=" is
3823 encountered in a character class. It checks whether this is followed by a
3824 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
3825 reach an unescaped ']' without the special preceding character, return FALSE.
3826
3827 Originally, this function only recognized a sequence of letters between the
3828 terminators, but it seems that Perl recognizes any sequence of characters,
3829 though of course unknown POSIX names are subsequently rejected. Perl gives an
3830 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
3831 didn't consider this to be a POSIX class. Likewise for [:1234:].
3832
3833 The problem in trying to be exactly like Perl is in the handling of escapes. We
3834 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
3835 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
3836 below handles the special case of \], but does not try to do any other escape
3837 processing. This makes it different from Perl for cases such as [:l\ower:]
3838 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
3839 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
3840 I think.
3841
3842 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
3843 It seems that the appearance of a nested POSIX class supersedes an apparent
3844 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
3845 a digit.
3846
3847 In Perl, unescaped square brackets may also appear as part of class names. For
3848 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
3849 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
3850 seem right at all. PCRE does not allow closing square brackets in POSIX class
3851 names.
3852
3853 Arguments:
3854 ptr pointer to the initial [
3855 endptr where to return the end pointer
3856
3857 Returns: TRUE or FALSE
3858 */
3859
3860 static BOOL
3861 check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
3862 {
3863 pcre_uchar terminator; /* Don't combine these lines; the Solaris cc */
3864 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
3865 for (++ptr; *ptr != CHAR_NULL; ptr++)
3866 {
3867 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3868 ptr++;
3869 else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
3870 else
3871 {
3872 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3873 {
3874 *endptr = ptr;
3875 return TRUE;
3876 }
3877 if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
3878 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3879 ptr[1] == CHAR_EQUALS_SIGN) &&
3880 check_posix_syntax(ptr, endptr))
3881 return FALSE;
3882 }
3883 }
3884 return FALSE;
3885 }
3886
3887
3888
3889
3890 /*************************************************
3891 * Check POSIX class name *
3892 *************************************************/
3893
3894 /* This function is called to check the name given in a POSIX-style class entry
3895 such as [:alnum:].
3896
3897 Arguments:
3898 ptr points to the first letter
3899 len the length of the name
3900
3901 Returns: a value representing the name, or -1 if unknown
3902 */
3903
3904 static int
3905 check_posix_name(const pcre_uchar *ptr, int len)
3906 {
3907 const char *pn = posix_names;
3908 register int yield = 0;
3909 while (posix_name_lengths[yield] != 0)
3910 {
3911 if (len == posix_name_lengths[yield] &&
3912 STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
3913 pn += posix_name_lengths[yield] + 1;
3914 yield++;
3915 }
3916 return -1;
3917 }
3918
3919
3920 /*************************************************
3921 * Adjust OP_RECURSE items in repeated group *
3922 *************************************************/
3923
3924 /* OP_RECURSE items contain an offset from the start of the regex to the group
3925 that is referenced. This means that groups can be replicated for fixed
3926 repetition simply by copying (because the recursion is allowed to refer to
3927 earlier groups that are outside the current group). However, when a group is
3928 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
3929 inserted before it, after it has been compiled. This means that any OP_RECURSE
3930 items within it that refer to the group itself or any contained groups have to
3931 have their offsets adjusted. That is one of the jobs of this function. Before it
3932 is called, the partially compiled regex must be temporarily terminated with
3933 OP_END.
3934
3935 This function has been extended with the possibility of forward references for
3936 recursions and subroutine calls. It must also check the list of such references
3937 for the group we are dealing with. If it finds that one of the recursions in
3938 the current group is on this list, it adjusts the offset in the list, not the
3939 value in the reference (which is a group number).
3940
3941 Arguments:
3942 group points to the start of the group
3943 adjust the amount by which the group is to be moved
3944 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3945 cd contains pointers to tables etc.
3946 save_hwm the hwm forward reference pointer at the start of the group
3947
3948 Returns: nothing
3949 */
3950
3951 static void
3952 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
3953 pcre_uchar *save_hwm)
3954 {
3955 pcre_uchar *ptr = group;
3956
3957 while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
3958 {
3959 int offset;
3960 pcre_uchar *hc;
3961
3962 /* See if this recursion is on the forward reference list. If so, adjust the
3963 reference. */
3964
3965 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
3966 {
3967 offset = (int)GET(hc, 0);
3968 if (cd->start_code + offset == ptr + 1)
3969 {
3970 PUT(hc, 0, offset + adjust);
3971 break;
3972 }
3973 }
3974
3975 /* Otherwise, adjust the recursion offset if it's after the start of this
3976 group. */
3977
3978 if (hc >= cd->hwm)
3979 {
3980 offset = (int)GET(ptr, 1);
3981 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
3982 }
3983
3984 ptr += 1 + LINK_SIZE;
3985 }
3986 }
3987
3988
3989
3990 /*************************************************
3991 * Insert an automatic callout point *
3992 *************************************************/
3993
3994 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
3995 callout points before each pattern item.
3996
3997 Arguments:
3998 code current code pointer
3999 ptr current pattern pointer
4000 cd pointers to tables etc
4001
4002 Returns: new code pointer
4003 */
4004
4005 static pcre_uchar *
4006 auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
4007 {
4008 *code++ = OP_CALLOUT;
4009 *code++ = 255;
4010 PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
4011 PUT(code, LINK_SIZE, 0); /* Default length */
4012 return code + 2 * LINK_SIZE;
4013 }
4014
4015
4016
4017 /*************************************************
4018 * Complete a callout item *
4019 *************************************************/
4020
4021 /* A callout item contains the length of the next item in the pattern, which
4022 we can't fill in till after we have reached the relevant point. This is used
4023 for both automatic and manual callouts.
4024
4025 Arguments:
4026 previous_callout points to previous callout item
4027 ptr current pattern pointer
4028 cd pointers to tables etc
4029
4030 Returns: nothing
4031 */
4032
4033 static void
4034 complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
4035 {
4036 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
4037 PUT(previous_callout, 2 + LINK_SIZE, length);
4038 }
4039
4040
4041
4042 #ifdef SUPPORT_UCP
4043 /*************************************************
4044 * Get othercase range *
4045 *************************************************/
4046
4047 /* This function is passed the start and end of a class range, in UTF-8 mode
4048 with UCP support. It searches up the characters, looking for ranges of
4049 characters in the "other" case. Each call returns the next one, updating the
4050 start address. A character with multiple other cases is returned on its own
4051 with a special return value.
4052
4053 Arguments:
4054 cptr points to starting character value; updated
4055 d end value
4056 ocptr where to put start of othercase range
4057 odptr where to put end of othercase range
4058
4059 Yield: -1 when no more
4060 0 when a range is returned
4061 >0 the CASESET offset for char with multiple other cases
4062 in this case, ocptr contains the original
4063 */
4064
4065 static int
4066 get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
4067 pcre_uint32 *odptr)
4068 {
4069 pcre_uint32 c, othercase, next;
4070 unsigned int co;
4071
4072 /* Find the first character that has an other case. If it has multiple other
4073 cases, return its case offset value. */
4074
4075 for (c = *cptr; c <= d; c++)
4076 {
4077 if ((co = UCD_CASESET(c)) != 0)
4078 {
4079 *ocptr = c++; /* Character that has the set */
4080 *cptr = c; /* Rest of input range */
4081 return (int)co;
4082 }
4083 if ((othercase = UCD_OTHERCASE(c)) != c) break;
4084 }
4085
4086 if (c > d) return -1; /* Reached end of range */
4087
4088 /* Found a character that has a single other case. Search for the end of the
4089 range, which is either the end of the input range, or a character that has zero
4090 or more than one other cases. */
4091
4092 *ocptr = othercase;
4093 next = othercase + 1;
4094
4095 for (++c; c <= d; c++)
4096 {
4097 if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
4098 next++;
4099 }
4100
4101 *odptr = next - 1; /* End of othercase range */
4102 *cptr = c; /* Rest of input range */
4103 return 0;
4104 }
4105 #endif /* SUPPORT_UCP */
4106
4107
4108
4109 /*************************************************
4110 * Add a character or range to a class *
4111 *************************************************/
4112
4113 /* This function packages up the logic of adding a character or range of
4114 characters to a class. The character values in the arguments will be within the
4115 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4116 mutually recursive with the function immediately below.
4117
4118 Arguments:
4119 classbits the bit map for characters < 256
4120 uchardptr points to the pointer for extra data
4121 options the options word
4122 cd contains pointers to tables etc.
4123 start start of range character
4124 end end of range character
4125
4126 Returns: the number of < 256 characters added
4127 the pointer to extra data is updated
4128 */
4129
4130 static int
4131 add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4132 compile_data *cd, pcre_uint32 start, pcre_uint32 end)
4133 {
4134 pcre_uint32 c;
4135 pcre_uint32 classbits_end = (end <= 0xff ? end : 0xff);
4136 int n8 = 0;
4137
4138 /* If caseless matching is required, scan the range and process alternate
4139 cases. In Unicode, there are 8-bit characters that have alternate cases that
4140 are greater than 255 and vice-versa. Sometimes we can just extend the original
4141 range. */
4142
4143 if ((options & PCRE_CASELESS) != 0)
4144 {
4145 #ifdef SUPPORT_UCP
4146 if ((options & PCRE_UTF8) != 0)
4147 {
4148 int rc;
4149 pcre_uint32 oc, od;
4150
4151 options &= ~PCRE_CASELESS; /* Remove for recursive calls */
4152 c = start;
4153
4154 while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
4155 {
4156 /* Handle a single character that has more than one other case. */
4157
4158 if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
4159 PRIV(ucd_caseless_sets) + rc, oc);
4160
4161 /* Do nothing if the other case range is within the original range. */
4162
4163 else if (oc >= start && od <= end) continue;
4164
4165 /* Extend the original range if there is overlap, noting that if oc < c, we
4166 can't have od > end because a subrange is always shorter than the basic
4167 range. Otherwise, use a recursive call to add the additional range. */
4168
4169 else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
4170 else if (od > end && oc <= end + 1) end = od; /* Extend upwards */
4171 else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
4172 }
4173 }
4174 else
4175 #endif /* SUPPORT_UCP */
4176
4177 /* Not UTF-mode, or no UCP */
4178
4179 for (c = start; c <= classbits_end; c++)
4180 {
4181 SETBIT(classbits, cd->fcc[c]);
4182 n8++;
4183 }
4184 }
4185
4186 /* Now handle the original range. Adjust the final value according to the bit
4187 length - this means that the same lists of (e.g.) horizontal spaces can be used
4188 in all cases. */
4189
4190 #if defined COMPILE_PCRE8
4191 #ifdef SUPPORT_UTF
4192 if ((options & PCRE_UTF8) == 0)
4193 #endif
4194 if (end > 0xff) end = 0xff;
4195
4196 #elif defined COMPILE_PCRE16
4197 #ifdef SUPPORT_UTF
4198 if ((options & PCRE_UTF16) == 0)
4199 #endif
4200 if (end > 0xffff) end = 0xffff;
4201
4202 #endif /* COMPILE_PCRE[8|16] */
4203
4204 /* Use the bitmap for characters < 256. Otherwise use extra data.*/
4205
4206 for (c = start; c <= classbits_end; c++)
4207 {
4208 /* Regardless of start, c will always be <= 255. */
4209 SETBIT(classbits, c);
4210 n8++;
4211 }
4212
4213 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4214 if (start <= 0xff) start = 0xff + 1;
4215
4216 if (end >= start)
4217 {
4218 pcre_uchar *uchardata = *uchardptr;
4219 #ifdef SUPPORT_UTF
4220 if ((options & PCRE_UTF8) != 0) /* All UTFs use the same flag bit */
4221 {
4222 if (start < end)
4223 {
4224 *uchardata++ = XCL_RANGE;
4225 uchardata += PRIV(ord2utf)(start, uchardata);
4226 uchardata += PRIV(ord2utf)(end, uchardata);
4227 }
4228 else if (start == end)
4229 {
4230 *uchardata++ = XCL_SINGLE;
4231 uchardata += PRIV(ord2utf)(start, uchardata);
4232 }
4233 }
4234 else
4235 #endif /* SUPPORT_UTF */
4236
4237 /* Without UTF support, character values are constrained by the bit length,
4238 and can only be > 256 for 16-bit and 32-bit libraries. */
4239
4240 #ifdef COMPILE_PCRE8
4241 {}
4242 #else
4243 if (start < end)
4244 {
4245 *uchardata++ = XCL_RANGE;
4246 *uchardata++ = start;
4247 *uchardata++ = end;
4248 }
4249 else if (start == end)
4250 {
4251 *uchardata++ = XCL_SINGLE;
4252 *uchardata++ = start;
4253 }
4254 #endif
4255
4256 *uchardptr = uchardata; /* Updata extra data pointer */
4257 }
4258 #endif /* SUPPORT_UTF || !COMPILE_PCRE8 */
4259
4260 return n8; /* Number of 8-bit characters */
4261 }
4262
4263
4264
4265
4266 /*************************************************
4267 * Add a list of characters to a class *
4268 *************************************************/
4269
4270 /* This function is used for adding a list of case-equivalent characters to a
4271 class, and also for adding a list of horizontal or vertical whitespace. If the
4272 list is in order (which it should be), ranges of characters are detected and
4273 handled appropriately. This function is mutually recursive with the function
4274 above.
4275
4276 Arguments:
4277 classbits the bit map for characters < 256
4278 uchardptr points to the pointer for extra data
4279 options the options word
4280 cd contains pointers to tables etc.
4281 p points to row of 32-bit values, terminated by NOTACHAR
4282 except character to omit; this is used when adding lists of
4283 case-equivalent characters to avoid including the one we
4284 already know about
4285
4286 Returns: the number of < 256 characters added
4287 the pointer to extra data is updated
4288 */
4289
4290 static int
4291 add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4292 compile_data *cd, const pcre_uint32 *p, unsigned int except)
4293 {
4294 int n8 = 0;
4295 while (p[0] < NOTACHAR)
4296 {
4297 int n = 0;
4298 if (p[0] != except)
4299 {
4300 while(p[n+1] == p[0] + n + 1) n++;
4301 n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
4302 }
4303 p += n + 1;
4304 }
4305 return n8;
4306 }
4307
4308
4309
4310 /*************************************************
4311 * Add characters not in a list to a class *
4312 *************************************************/
4313
4314 /* This function is used for adding the complement of a list of horizontal or
4315 vertical whitespace to a class. The list must be in order.
4316
4317 Arguments:
4318 classbits the bit map for characters < 256
4319 uchardptr points to the pointer for extra data
4320 options the options word
4321 cd contains pointers to tables etc.
4322 p points to row of 32-bit values, terminated by NOTACHAR
4323
4324 Returns: the number of < 256 characters added
4325 the pointer to extra data is updated
4326 */
4327
4328 static int
4329 add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
4330 int options, compile_data *cd, const pcre_uint32 *p)
4331 {
4332 BOOL utf = (options & PCRE_UTF8) != 0;
4333 int n8 = 0;
4334 if (p[0] > 0)
4335 n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
4336 while (p[0] < NOTACHAR)
4337 {
4338 while (p[1] == p[0] + 1) p++;
4339 n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
4340 (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
4341 p++;
4342 }
4343 return n8;
4344 }
4345
4346
4347
4348 /*************************************************
4349 * Compile one branch *
4350 *************************************************/
4351
4352 /* Scan the pattern, compiling it into a vector. If the options are
4353 changed during the branch, the pointer is used to change the external options
4354 bits. This function is used during the pre-compile phase when we are trying
4355 to find out the amount of memory needed, as well as during the real compile
4356 phase. The value of lengthptr distinguishes the two phases.
4357
4358 Arguments:
4359 optionsptr pointer to the option bits
4360 codeptr points to the pointer to the current code point
4361 ptrptr points to the current pattern pointer
4362 errorcodeptr points to error code variable
4363 firstcharptr place to put the first required character
4364 firstcharflagsptr place to put the first character flags, or a negative number
4365 reqcharptr place to put the last required character
4366 reqcharflagsptr place to put the last required character flags, or a negative number
4367 bcptr points to current branch chain
4368 cond_depth conditional nesting depth
4369 cd contains pointers to tables etc.
4370 lengthptr NULL during the real compile phase
4371 points to length accumulator during pre-compile phase
4372
4373 Returns: TRUE on success
4374 FALSE, with *errorcodeptr set non-zero on error
4375 */
4376
4377 static BOOL
4378 compile_branch(int *optionsptr, pcre_uchar **codeptr,
4379 const pcre_uchar **ptrptr, int *errorcodeptr,
4380 pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
4381 pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
4382 branch_chain *bcptr, int cond_depth,
4383 compile_data *cd, int *lengthptr)
4384 {
4385 int repeat_type, op_type;
4386 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
4387 int bravalue = 0;
4388 int greedy_default, greedy_non_default;
4389 pcre_uint32 firstchar, reqchar;
4390 pcre_int32 firstcharflags, reqcharflags;
4391 pcre_uint32 zeroreqchar, zerofirstchar;
4392 pcre_int32 zeroreqcharflags, zerofirstcharflags;
4393 pcre_int32 req_caseopt, reqvary, tempreqvary;
4394 int options = *optionsptr; /* May change dynamically */
4395 int after_manual_callout = 0;
4396 int length_prevgroup = 0;
4397 register pcre_uint32 c;
4398 int escape;
4399 register pcre_uchar *code = *codeptr;
4400 pcre_uchar *last_code = code;
4401 pcre_uchar *orig_code = code;
4402 pcre_uchar *tempcode;
4403 BOOL inescq = FALSE;
4404 BOOL groupsetfirstchar = FALSE;
4405 const pcre_uchar *ptr = *ptrptr;
4406 const pcre_uchar *tempptr;
4407 const pcre_uchar *nestptr = NULL;
4408 pcre_uchar *previous = NULL;
4409 pcre_uchar *previous_callout = NULL;
4410 pcre_uchar *save_hwm = NULL;
4411 pcre_uint8 classbits[32];
4412
4413 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
4414 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
4415 dynamically as we process the pattern. */
4416
4417 #ifdef SUPPORT_UTF
4418 /* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
4419 BOOL utf = (options & PCRE_UTF8) != 0;
4420 #ifndef COMPILE_PCRE32
4421 pcre_uchar utf_chars[6];
4422 #endif
4423 #else
4424 BOOL utf = FALSE;
4425 #endif
4426
4427 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
4428 class_uchardata always so that it can be passed to add_to_class() always,
4429 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
4430 alternative calls for the different cases. */
4431
4432 pcre_uchar *class_uchardata;
4433 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4434 BOOL xclass;
4435 pcre_uchar *class_uchardata_base;
4436 #endif
4437
4438 #ifdef PCRE_DEBUG
4439 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
4440 #endif
4441
4442 /* Set up the default and non-default settings for greediness */
4443
4444 greedy_default = ((options & PCRE_UNGREEDY) != 0);
4445 greedy_non_default = greedy_default ^ 1;
4446
4447 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
4448 matching encountered yet". It gets changed to REQ_NONE if we hit something that
4449 matches a non-fixed char first char; reqchar just remains unset if we never
4450 find one.
4451
4452 When we hit a repeat whose minimum is zero, we may have to adjust these values
4453 to take the zero repeat into account. This is implemented by setting them to
4454 zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
4455 item types that can be repeated set these backoff variables appropriately. */
4456
4457 firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
4458 firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
4459
4460 /* The variable req_caseopt contains either the REQ_CASELESS value
4461 or zero, according to the current setting of the caseless flag. The
4462 REQ_CASELESS leaves the lower 28 bit empty. It is added into the
4463 firstchar or reqchar variables to record the case status of the
4464 value. This is used only for ASCII characters. */
4465
4466 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
4467
4468 /* Switch on next character until the end of the branch */
4469
4470 for (;; ptr++)
4471 {
4472 BOOL negate_class;
4473 BOOL should_flip_negation;
4474 BOOL possessive_quantifier;
4475 BOOL is_quantifier;
4476 BOOL is_recurse;
4477 BOOL reset_bracount;
4478 int class_has_8bitchar;
4479 int class_one_char;
4480 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4481 BOOL xclass_has_prop;
4482 #endif
4483 int newoptions;
4484 int recno;
4485 int refsign;
4486 int skipbytes;
4487 pcre_uint32 subreqchar, subfirstchar;
4488 pcre_int32 subreqcharflags, subfirstcharflags;
4489 int terminator;
4490 unsigned int mclength;
4491 unsigned int tempbracount;
4492 pcre_uint32 ec;
4493 pcre_uchar mcbuffer[8];
4494
4495 /* Get next character in the pattern */
4496
4497 c = *ptr;
4498
4499 /* If we are at the end of a nested substitution, revert to the outer level
4500 string. Nesting only happens one level deep. */
4501
4502 if (c == CHAR_NULL && nestptr != NULL)
4503 {
4504 ptr = nestptr;
4505 nestptr = NULL;
4506 c = *ptr;
4507 }
4508
4509 /* If we are in the pre-compile phase, accumulate the length used for the
4510 previous cycle of this loop. */
4511
4512 if (lengthptr != NULL)
4513 {
4514 #ifdef PCRE_DEBUG
4515 if (code > cd->hwm) cd->hwm = code; /* High water info */
4516 #endif
4517 if (code > cd->start_workspace + cd->workspace_size -
4518 WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
4519 {
4520 *errorcodeptr = ERR52;
4521 goto FAILED;
4522 }
4523
4524 /* There is at least one situation where code goes backwards: this is the
4525 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
4526 the class is simply eliminated. However, it is created first, so we have to
4527 allow memory for it. Therefore, don't ever reduce the length at this point.
4528 */
4529
4530 if (code < last_code) code = last_code;
4531
4532 /* Paranoid check for integer overflow */
4533
4534 if (OFLOW_MAX - *lengthptr < code - last_code)
4535 {
4536 *errorcodeptr = ERR20;
4537 goto FAILED;
4538 }
4539
4540 *lengthptr += (int)(code - last_code);
4541 DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
4542 (int)(code - last_code), c, c));
4543
4544 /* If "previous" is set and it is not at the start of the work space, move
4545 it back to there, in order to avoid filling up the work space. Otherwise,
4546 if "previous" is NULL, reset the current code pointer to the start. */
4547
4548 if (previous != NULL)
4549 {
4550 if (previous > orig_code)
4551 {
4552 memmove(orig_code, previous, IN_UCHARS(code - previous));
4553 code -= previous - orig_code;
4554 previous = orig_code;
4555 }
4556 }
4557 else code = orig_code;
4558
4559 /* Remember where this code item starts so we can pick up the length
4560 next time round. */
4561
4562 last_code = code;
4563 }
4564
4565 /* In the real compile phase, just check the workspace used by the forward
4566 reference list. */
4567
4568 else if (cd->hwm > cd->start_workspace + cd->workspace_size -
4569 WORK_SIZE_SAFETY_MARGIN)
4570 {
4571 *errorcodeptr = ERR52;
4572 goto FAILED;
4573 }
4574
4575 /* If in \Q...\E, check for the end; if not, we have a literal */
4576
4577 if (inescq && c != CHAR_NULL)
4578 {
4579 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4580 {
4581 inescq = FALSE;
4582 ptr++;
4583 continue;
4584 }
4585 else
4586 {
4587 if (previous_callout != NULL)
4588 {
4589 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
4590 complete_callout(previous_callout, ptr, cd);
4591 previous_callout = NULL;
4592 }
4593 if ((options & PCRE_AUTO_CALLOUT) != 0)
4594 {
4595 previous_callout = code;
4596 code = auto_callout(code, ptr, cd);
4597 }
4598 goto NORMAL_CHAR;
4599 }
4600 /* Control does not reach here. */
4601 }
4602
4603 /* In extended mode, skip white space and comments. We need a loop in order
4604 to check for more white space and more comments after a comment. */
4605
4606 if ((options & PCRE_EXTENDED) != 0)
4607 {
4608 for (;;)
4609 {
4610 while (MAX_255(c) && (cd->ctypes[c] & ctype_space) != 0) c = *(++ptr);
4611 if (c != CHAR_NUMBER_SIGN) break;
4612 ptr++;
4613 while (*ptr != CHAR_NULL)
4614 {
4615 if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */
4616 { /* IS_NEWLINE sets cd->nllen. */
4617 ptr += cd->nllen;
4618 break;
4619 }
4620 ptr++;
4621 #ifdef SUPPORT_UTF
4622 if (utf) FORWARDCHAR(ptr);
4623 #endif
4624 }
4625 c = *ptr; /* Either NULL or the char after a newline */
4626 }
4627 }
4628
4629 /* See if the next thing is a quantifier. */
4630
4631 is_quantifier =
4632 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4633 (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4634
4635 /* Fill in length of a previous callout, except when the next thing is a
4636 quantifier or when processing a property substitution string in UCP mode. */
4637
4638 if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4639 after_manual_callout-- <= 0)
4640 {
4641 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
4642 complete_callout(previous_callout, ptr, cd);
4643 previous_callout = NULL;
4644 }
4645
4646 /* Create auto callout, except for quantifiers, or while processing property
4647 strings that are substituted for \w etc in UCP mode. */
4648
4649 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
4650 {
4651 previous_callout = code;
4652 code = auto_callout(code, ptr, cd);
4653 }
4654
4655 /* Process the next pattern item. */
4656
4657 switch(c)
4658 {
4659 /* ===================================================================*/
4660 case CHAR_NULL: /* The branch terminates at string end */
4661 case CHAR_VERTICAL_LINE: /* or | or ) */
4662 case CHAR_RIGHT_PARENTHESIS:
4663 *firstcharptr = firstchar;
4664 *firstcharflagsptr = firstcharflags;
4665 *reqcharptr = reqchar;
4666 *reqcharflagsptr = reqcharflags;
4667 *codeptr = code;
4668 *ptrptr = ptr;
4669 if (lengthptr != NULL)
4670 {
4671 if (OFLOW_MAX - *lengthptr < code - last_code)
4672 {
4673 *errorcodeptr = ERR20;
4674 goto FAILED;
4675 }
4676 *lengthptr += (int)(code - last_code); /* To include callout length */
4677 DPRINTF((">> end branch\n"));
4678 }
4679 return TRUE;
4680
4681
4682 /* ===================================================================*/
4683 /* Handle single-character metacharacters. In multiline mode, ^ disables
4684 the setting of any following char as a first character. */
4685
4686 case CHAR_CIRCUMFLEX_ACCENT:
4687 previous = NULL;
4688 if ((options & PCRE_MULTILINE) != 0)
4689 {
4690 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4691 *code++ = OP_CIRCM;
4692 }
4693 else *code++ = OP_CIRC;
4694 break;
4695
4696 case CHAR_DOLLAR_SIGN:
4697 previous = NULL;
4698 *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
4699 break;
4700
4701 /* There can never be a first char if '.' is first, whatever happens about
4702 repeats. The value of reqchar doesn't change either. */
4703
4704 case CHAR_DOT:
4705 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4706 zerofirstchar = firstchar;
4707 zerofirstcharflags = firstcharflags;
4708 zeroreqchar = reqchar;
4709 zeroreqcharflags = reqcharflags;
4710 previous = code;
4711 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
4712 break;
4713
4714
4715 /* ===================================================================*/
4716 /* Character classes. If the included characters are all < 256, we build a
4717 32-byte bitmap of the permitted characters, except in the special case
4718 where there is only one such character. For negated classes, we build the
4719 map as usual, then invert it at the end. However, we use a different opcode
4720 so that data characters > 255 can be handled correctly.
4721
4722 If the class contains characters outside the 0-255 range, a different
4723 opcode is compiled. It may optionally have a bit map for characters < 256,
4724 but those above are explicitly listed afterwards. A flag byte tells
4725 whether the bitmap is present, and whether this is a negated class or not.
4726
4727 In JavaScript compatibility mode, an isolated ']' causes an error. In
4728 default (Perl) mode, it is treated as a data character. */
4729
4730 case CHAR_RIGHT_SQUARE_BRACKET:
4731 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4732 {
4733 *errorcodeptr = ERR64;
4734 goto FAILED;
4735 }
4736 goto NORMAL_CHAR;
4737
4738 /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
4739 used for "start of word" and "end of word". As these are otherwise illegal
4740 sequences, we don't break anything by recognizing them. They are replaced
4741 by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
4742 erroneous and are handled by the normal code below. */
4743
4744 case CHAR_LEFT_SQUARE_BRACKET:
4745 if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
4746 {
4747 nestptr = ptr + 7;
4748 ptr = sub_start_of_word - 1;
4749 continue;
4750 }
4751
4752 if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
4753 {
4754 nestptr = ptr + 7;
4755 ptr = sub_end_of_word - 1;
4756 continue;
4757 }
4758
4759 /* Handle a real character class. */
4760
4761 previous = code;
4762
4763 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
4764 they are encountered at the top level, so we'll do that too. */
4765
4766 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4767 ptr[1] == CHAR_EQUALS_SIGN) &&
4768 check_posix_syntax(ptr, &tempptr))
4769 {
4770 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
4771 goto FAILED;
4772 }
4773
4774 /* If the first character is '^', set the negation flag and skip it. Also,
4775 if the first few characters (either before or after ^) are \Q\E or \E we
4776 skip them too. This makes for compatibility with Perl. */
4777
4778 negate_class = FALSE;
4779 for (;;)
4780 {
4781 c = *(++ptr);
4782 if (c == CHAR_BACKSLASH)
4783 {
4784 if (ptr[1] == CHAR_E)
4785 ptr++;
4786 else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4787 ptr += 3;
4788 else
4789 break;
4790 }
4791 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4792 negate_class = TRUE;
4793 else break;
4794 }
4795
4796 /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
4797 an initial ']' is taken as a data character -- the code below handles
4798 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
4799 [^] must match any character, so generate OP_ALLANY. */
4800
4801 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4802 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4803 {
4804 *code++ = negate_class? OP_ALLANY : OP_FAIL;
4805 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4806 zerofirstchar = firstchar;
4807 zerofirstcharflags = firstcharflags;
4808 break;
4809 }
4810
4811 /* If a class contains a negative special such as \S, we need to flip the
4812 negation flag at the end, so that support for characters > 255 works
4813 correctly (they are all included in the class). */
4814
4815 should_flip_negation = FALSE;
4816
4817 /* Extended class (xclass) will be used when characters > 255
4818 might match. */
4819
4820 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4821 xclass = FALSE;
4822 class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
4823 class_uchardata_base = class_uchardata; /* Save the start */
4824 #endif
4825
4826 /* For optimization purposes, we track some properties of the class:
4827 class_has_8bitchar will be non-zero if the class contains at least one <
4828 256 character; class_one_char will be 1 if the class contains just one
4829 character; xclass_has_prop will be TRUE if unicode property checks
4830 are present in the class. */
4831
4832 class_has_8bitchar = 0;
4833 class_one_char = 0;
4834 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4835 xclass_has_prop = FALSE;
4836 #endif
4837
4838 /* Initialize the 32-char bit map to all zeros. We build the map in a
4839 temporary bit of memory, in case the class contains fewer than two
4840 8-bit characters because in that case the compiled code doesn't use the bit
4841 map. */
4842
4843 memset(classbits, 0, 32 * sizeof(pcre_uint8));
4844
4845 /* Process characters until ] is reached. By writing this as a "do" it
4846 means that an initial ] is taken as a data character. At the start of the
4847 loop, c contains the first byte of the character. */
4848
4849 if (c != CHAR_NULL) do
4850 {
4851 const pcre_uchar *oldptr;
4852
4853 #ifdef SUPPORT_UTF
4854 if (utf && HAS_EXTRALEN(c))
4855 { /* Braces are required because the */
4856 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
4857 }
4858 #endif
4859
4860 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4861 /* In the pre-compile phase, accumulate the length of any extra
4862 data and reset the pointer. This is so that very large classes that
4863 contain a zillion > 255 characters no longer overwrite the work space
4864 (which is on the stack). We have to remember that there was XCLASS data,
4865 however. */
4866
4867 if (lengthptr != NULL && class_uchardata > class_uchardata_base)
4868 {
4869 xclass = TRUE;
4870 *lengthptr += (int)(class_uchardata - class_uchardata_base);
4871 class_uchardata = class_uchardata_base;
4872 }
4873 #endif
4874
4875 /* Inside \Q...\E everything is literal except \E */
4876
4877 if (inescq)
4878 {
4879 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
4880 {
4881 inescq = FALSE; /* Reset literal state */
4882 ptr++; /* Skip the 'E' */
4883 continue; /* Carry on with next */
4884 }
4885 goto CHECK_RANGE; /* Could be range if \E follows */
4886 }
4887
4888 /* Handle POSIX class names. Perl allows a negation extension of the
4889 form [:^name:]. A square bracket that doesn't match the syntax is
4890 treated as a literal. We also recognize the POSIX constructions
4891 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
4892 5.6 and 5.8 do. */
4893
4894 if (c == CHAR_LEFT_SQUARE_BRACKET &&
4895 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4896 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
4897 {
4898 BOOL local_negate = FALSE;
4899 int posix_class, taboffset, tabopt;
4900 register const pcre_uint8 *cbits = cd->cbits;
4901 pcre_uint8 pbits[32];
4902
4903 if (ptr[1] != CHAR_COLON)
4904 {
4905 *errorcodeptr = ERR31;
4906 goto FAILED;
4907 }
4908
4909 ptr += 2;
4910 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
4911 {
4912 local_negate = TRUE;
4913 should_flip_negation = TRUE; /* Note negative special */
4914 ptr++;
4915 }
4916
4917 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
4918 if (posix_class < 0)
4919 {
4920 *errorcodeptr = ERR30;
4921 goto FAILED;
4922 }
4923
4924 /* If matching is caseless, upper and lower are converted to
4925 alpha. This relies on the fact that the class table starts with
4926 alpha, lower, upper as the first 3 entries. */
4927
4928 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
4929 posix_class = 0;
4930
4931 /* When PCRE_UCP is set, some of the POSIX classes are converted to
4932 different escape sequences that use Unicode properties \p or \P. Others
4933 that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
4934 directly. */
4935
4936 #ifdef SUPPORT_UCP
4937 if ((options & PCRE_UCP) != 0)
4938 {
4939 unsigned int ptype = 0;
4940 int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
4941
4942 /* The posix_substitutes table specifies which POSIX classes can be
4943 converted to \p or \P items. */
4944
4945 if (posix_substitutes[pc] != NULL)
4946 {
4947 nestptr = tempptr + 1;
4948 ptr = posix_substitutes[pc] - 1;
4949 continue;
4950 }
4951
4952 /* There are three other classes that generate special property calls
4953 that are recognized only in an XCLASS. */
4954
4955 else switch(posix_class)
4956 {
4957 case PC_GRAPH:
4958 ptype = PT_PXGRAPH;
4959 /* Fall through */
4960 case PC_PRINT:
4961 if (ptype == 0) ptype = PT_PXPRINT;
4962 /* Fall through */
4963 case PC_PUNCT:
4964 if (ptype == 0) ptype = PT_PXPUNCT;
4965 *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
4966 *class_uchardata++ = ptype;
4967 *class_uchardata++ = 0;
4968 xclass_has_prop = TRUE;
4969 ptr = tempptr + 1;
4970 continue;
4971
4972 /* For all other POSIX classes, no special action is taken in UCP
4973 mode. Fall through to the non_UCP case. */
4974
4975 default:
4976 break;
4977 }
4978 }
4979 #endif
4980 /* In the non-UCP case, or when UCP makes no difference, we build the
4981 bit map for the POSIX class in a chunk of local store because we may be
4982 adding and subtracting from it, and we don't want to subtract bits that
4983 may be in the main map already. At the end we or the result into the
4984 bit map that is being built. */
4985
4986 posix_class *= 3;
4987
4988 /* Copy in the first table (always present) */
4989
4990 memcpy(pbits, cbits + posix_class_maps[posix_class],
4991 32 * sizeof(pcre_uint8));
4992
4993 /* If there is a second table, add or remove it as required. */
4994
4995 taboffset = posix_class_maps[posix_class + 1];
4996 tabopt = posix_class_maps[posix_class + 2];
4997
4998 if (taboffset >= 0)
4999 {
5000 if (tabopt >= 0)
5001 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
5002 else
5003 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
5004 }
5005
5006 /* Now see if we need to remove any special characters. An option
5007 value of 1 removes vertical space and 2 removes underscore. */
5008
5009 if (tabopt < 0) tabopt = -tabopt;
5010 if (tabopt == 1) pbits[1] &= ~0x3c;
5011 else if (tabopt == 2) pbits[11] &= 0x7f;
5012
5013 /* Add the POSIX table or its complement into the main table that is
5014 being built and we are done. */
5015
5016 if (local_negate)
5017 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
5018 else
5019 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
5020
5021 ptr = tempptr + 1;
5022 /* Every class contains at least one < 256 character. */
5023 class_has_8bitchar = 1;
5024 /* Every class contains at least two characters. */
5025 class_one_char = 2;
5026 continue; /* End of POSIX syntax handling */
5027 }
5028
5029 /* Backslash may introduce a single character, or it may introduce one
5030 of the specials, which just set a flag. The sequence \b is a special
5031 case. Inside a class (and only there) it is treated as backspace. We
5032 assume that other escapes have more than one character in them, so
5033 speculatively set both class_has_8bitchar and class_one_char bigger
5034 than one. Unrecognized escapes fall through and are either treated
5035 as literal characters (by default), or are faulted if
5036 PCRE_EXTRA is set. */
5037
5038 if (c == CHAR_BACKSLASH)
5039 {
5040 escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
5041 TRUE);
5042 if (*errorcodeptr != 0) goto FAILED;
5043 if (escape == 0) c = ec;
5044 else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
5045 else if (escape == ESC_N) /* \N is not supported in a class */
5046 {
5047 *errorcodeptr = ERR71;
5048 goto FAILED;
5049 }
5050 else if (escape == ESC_Q) /* Handle start of quoted string */
5051 {
5052 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5053 {
5054 ptr += 2; /* avoid empty string */
5055 }
5056 else inescq = TRUE;
5057 continue;
5058 }
5059 else if (escape == ESC_E) continue; /* Ignore orphan \E */
5060
5061 else
5062 {
5063 register const pcre_uint8 *cbits = cd->cbits;
5064 /* Every class contains at least two < 256 characters. */
5065 class_has_8bitchar++;
5066 /* Every class contains at least two characters. */
5067 class_one_char += 2;
5068
5069 switch (escape)
5070 {
5071 #ifdef SUPPORT_UCP
5072 case ESC_du: /* These are the values given for \d etc */
5073 case ESC_DU: /* when PCRE_UCP is set. We replace the */
5074 case ESC_wu: /* escape sequence with an appropriate \p */
5075 case ESC_WU: /* or \P to test Unicode properties instead */
5076 case ESC_su: /* of the default ASCII testing. */
5077 case ESC_SU:
5078 nestptr = ptr;
5079 ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */
5080 class_has_8bitchar--; /* Undo! */
5081 continue;
5082 #endif
5083 case ESC_d:
5084 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
5085 continue;
5086
5087 case ESC_D:
5088 should_flip_negation = TRUE;
5089 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
5090 continue;
5091
5092 case ESC_w:
5093 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
5094 continue;
5095
5096 case ESC_W:
5097 should_flip_negation = TRUE;
5098 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
5099 continue;
5100
5101 /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5102 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5103 previously set by something earlier in the character class.
5104 Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5105 we could just adjust the appropriate bit. From PCRE 8.34 we no
5106 longer treat \s and \S specially. */
5107
5108 case ESC_s:
5109 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
5110 continue;
5111
5112 case ESC_S:
5113 should_flip_negation = TRUE;
5114 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
5115 continue;
5116
5117 /* The rest apply in both UCP and non-UCP cases. */
5118
5119 case ESC_h:
5120 (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5121 PRIV(hspace_list), NOTACHAR);
5122 continue;
5123
5124 case ESC_H:
5125 (void)add_not_list_to_class(classbits, &class_uchardata, options,
5126 cd, PRIV(hspace_list));
5127 continue;
5128
5129 case ESC_v:
5130 (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5131 PRIV(vspace_list), NOTACHAR);
5132 continue;
5133
5134 case ESC_V:
5135 (void)add_not_list_to_class(classbits, &class_uchardata, options,
5136 cd, PRIV(vspace_list));
5137 continue;
5138
5139 #ifdef SUPPORT_UCP
5140 case ESC_p:
5141 case ESC_P:
5142 {
5143 BOOL negated;
5144 unsigned int ptype = 0, pdata = 0;
5145 if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
5146 goto FAILED;
5147 *class_uchardata++ = ((escape == ESC_p) != negated)?
5148 XCL_PROP : XCL_NOTPROP;
5149 *class_uchardata++ = ptype;
5150 *class_uchardata++ = pdata;
5151 xclass_has_prop = TRUE;
5152 class_has_8bitchar--; /* Undo! */
5153 continue;
5154 }
5155 #endif
5156 /* Unrecognized escapes are faulted if PCRE is running in its
5157 strict mode. By default, for compatibility with Perl, they are
5158 treated as literals. */
5159
5160 default:
5161 if ((options & PCRE_EXTRA) != 0)
5162 {
5163 *errorcodeptr = ERR7;
5164 goto FAILED;
5165 }
5166 class_has_8bitchar--; /* Undo the speculative increase. */
5167 class_one_char -= 2; /* Undo the speculative increase. */
5168 c = *ptr; /* Get the final character and fall through */
5169 break;
5170 }
5171 }
5172
5173 /* Fall through if the escape just defined a single character (c >= 0).
5174 This may be greater than 256. */
5175
5176 escape = 0;
5177
5178 } /* End of backslash handling */
5179
5180 /* A character may be followed by '-' to form a range. However, Perl does
5181 not permit ']' to be the end of the range. A '-' character at the end is
5182 treated as a literal. Perl ignores orphaned \E sequences entirely. The
5183 code for handling \Q and \E is messy. */
5184
5185 CHECK_RANGE:
5186 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5187 {
5188 inescq = FALSE;
5189 ptr += 2;
5190 }
5191 oldptr = ptr;
5192
5193 /* Remember if \r or \n were explicitly used */
5194
5195 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5196
5197 /* Check for range */
5198
5199 if (!inescq && ptr[1] == CHAR_MINUS)
5200 {
5201 pcre_uint32 d;
5202 ptr += 2;
5203 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
5204
5205 /* If we hit \Q (not followed by \E) at this point, go into escaped
5206 mode. */
5207
5208 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
5209 {
5210 ptr += 2;
5211 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
5212 { ptr += 2; continue; }
5213 inescq = TRUE;
5214 break;
5215 }
5216
5217 /* Minus (hyphen) at the end of a class is treated as a literal, so put
5218 back the pointer and jump to handle the character that preceded it. */
5219
5220 if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
5221 {
5222 ptr = oldptr;
5223 goto CLASS_SINGLE_CHARACTER;
5224 }
5225
5226 /* Otherwise, we have a potential range; pick up the next character */
5227
5228 #ifdef SUPPORT_UTF
5229 if (utf)
5230 { /* Braces are required because the */
5231 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
5232 }
5233 else
5234 #endif
5235 d = *ptr; /* Not UTF-8 mode */
5236
5237 /* The second part of a range can be a single-character escape
5238 sequence, but not any of the other escapes. Perl treats a hyphen as a
5239 literal in such circumstances. However, in Perl's warning mode, a
5240 warning is given, so PCRE now faults it as it is almost certainly a
5241 mistake on the user's part. */
5242
5243 if (!inescq)
5244 {
5245 if (d == CHAR_BACKSLASH)
5246 {
5247 int descape;
5248 descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
5249 if (*errorcodeptr != 0) goto FAILED;
5250
5251 /* 0 means a character was put into d; \b is backspace; any other
5252 special causes an error. */
5253
5254 if (descape != 0)
5255 {
5256 if (descape == ESC_b) d = CHAR_BS; else
5257 {
5258 *errorcodeptr = ERR83;
5259 goto FAILED;
5260 }
5261 }
5262 }
5263
5264 /* A hyphen followed by a POSIX class is treated in the same way. */
5265
5266 else if (d == CHAR_LEFT_SQUARE_BRACKET &&
5267 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
5268 ptr[1] == CHAR_EQUALS_SIGN) &&
5269 check_posix_syntax(ptr, &tempptr))
5270 {
5271 *errorcodeptr = ERR83;
5272 goto FAILED;
5273 }
5274 }
5275
5276 /* Check that the two values are in the correct order. Optimize
5277 one-character ranges. */
5278
5279 if (d < c)
5280 {
5281 *errorcodeptr = ERR8;
5282 goto FAILED;
5283 }
5284 if (d == c) goto CLASS_SINGLE_CHARACTER; /* A few lines below */
5285
5286 /* We have found a character range, so single character optimizations
5287 cannot be done anymore. Any value greater than 1 indicates that there
5288 is more than one character. */
5289
5290 class_one_char = 2;
5291
5292 /* Remember an explicit \r or \n, and add the range to the class. */
5293
5294 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5295
5296 class_has_8bitchar +=
5297 add_to_class(classbits, &class_uchardata, options, cd, c, d);
5298
5299 continue; /* Go get the next char in the class */
5300 }
5301
5302 /* Handle a single character - we can get here for a normal non-escape
5303 char, or after \ that introduces a single character or for an apparent
5304 range that isn't. Only the value 1 matters for class_one_char, so don't
5305 increase it if it is already 2 or more ... just in case there's a class
5306 with a zillion characters in it. */
5307
5308 CLASS_SINGLE_CHARACTER:
5309 if (class_one_char < 2) class_one_char++;
5310
5311 /* If class_one_char is 1, we have the first single character in the
5312 class, and there have been no prior ranges, or XCLASS items generated by
5313 escapes. If this is the final character in the class, we can optimize by
5314 turning the item into a 1-character OP_CHAR[I] if it's positive, or
5315 OP_NOT[I] if it's negative. In the positive case, it can cause firstchar
5316 to be set. Otherwise, there can be no first char if this item is first,
5317 whatever repeat count may follow. In the case of reqchar, save the
5318 previous value for reinstating. */
5319
5320 if (class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
5321 {
5322 ptr++;
5323 zeroreqchar = reqchar;
5324 zeroreqcharflags = reqcharflags;
5325
5326 if (negate_class)
5327 {
5328 #ifdef SUPPORT_UCP
5329 int d;
5330 #endif
5331 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5332 zerofirstchar = firstchar;
5333 zerofirstcharflags = firstcharflags;
5334
5335 /* For caseless UTF-8 mode when UCP support is available, check
5336 whether this character has more than one other case. If so, generate
5337 a special OP_NOTPROP item instead of OP_NOTI. */
5338
5339 #ifdef SUPPORT_UCP
5340 if (utf && (options & PCRE_CASELESS) != 0 &&
5341 (d = UCD_CASESET(c)) != 0)
5342 {
5343 *code++ = OP_NOTPROP;
5344 *code++ = PT_CLIST;
5345 *code++ = d;
5346 }
5347 else
5348 #endif
5349 /* Char has only one other case, or UCP not available */
5350
5351 {
5352 *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
5353 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5354 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5355 code += PRIV(ord2utf)(c, code);
5356 else
5357 #endif
5358 *code++ = c;
5359 }
5360
5361 /* We are finished with this character class */
5362
5363 goto END_CLASS;
5364 }
5365
5366 /* For a single, positive character, get the value into mcbuffer, and
5367 then we can handle this with the normal one-character code. */
5368
5369 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5370 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5371 mclength = PRIV(ord2utf)(c, mcbuffer);
5372 else
5373 #endif
5374 {
5375 mcbuffer[0] = c;
5376 mclength = 1;
5377 }
5378 goto ONE_CHAR;
5379 } /* End of 1-char optimization */
5380
5381 /* There is more than one character in the class, or an XCLASS item
5382 has been generated. Add this character to the class. */
5383
5384 class_has_8bitchar +=
5385 add_to_class(classbits, &class_uchardata, options, cd, c, c);
5386 }
5387
5388 /* Loop until ']' reached. This "while" is the end of the "do" far above.
5389 If we are at the end of an internal nested string, revert to the outer
5390 string. */
5391
5392 while (((c = *(++ptr)) != CHAR_NULL ||
5393 (nestptr != NULL &&
5394 (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
5395 (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
5396
5397 /* Check for missing terminating ']' */
5398
5399 if (c == CHAR_NULL)
5400 {
5401 *errorcodeptr = ERR6;
5402 goto FAILED;
5403 }
5404
5405 /* We will need an XCLASS if data has been placed in class_uchardata. In
5406 the second phase this is a sufficient test. However, in the pre-compile
5407 phase, class_uchardata gets emptied to prevent workspace overflow, so
5408 only if the very last character in the class needs XCLASS will it contain
5409 anything at this point. For this reason, xclass gets set TRUE above when
5410 class_uchardata is emptied, and that's why this code is the way it is here
5411 instead of just doing a test on class_uchardata below. */
5412
5413 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5414 if (class_uchardata > class_uchardata_base) xclass = TRUE;
5415 #endif
5416
5417 /* If this is the first thing in the branch, there can be no first char
5418 setting, whatever the repeat count. Any reqchar setting must remain
5419 unchanged after any kind of repeat. */
5420
5421 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5422 zerofirstchar = firstchar;
5423 zerofirstcharflags = firstcharflags;
5424 zeroreqchar = reqchar;
5425 zeroreqcharflags = reqcharflags;
5426
5427 /* If there are characters with values > 255, we have to compile an
5428 extended class, with its own opcode, unless there was a negated special
5429 such as \S in the class, and PCRE_UCP is not set, because in that case all
5430 characters > 255 are in the class, so any that were explicitly given as
5431 well can be ignored. If (when there are explicit characters > 255 that must
5432 be listed) there are no characters < 256, we can omit the bitmap in the
5433 actual compiled code. */
5434
5435 #ifdef SUPPORT_UTF
5436 if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
5437 #elif !defined COMPILE_PCRE8
5438 if (xclass && !should_flip_negation)
5439 #endif
5440 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5441 {
5442 *class_uchardata++ = XCL_END; /* Marks the end of extra data */
5443 *code++ = OP_XCLASS;
5444 code += LINK_SIZE;
5445 *code = negate_class? XCL_NOT:0;
5446 if (xclass_has_prop) *code |= XCL_HASPROP;
5447
5448 /* If the map is required, move up the extra data to make room for it;
5449 otherwise just move the code pointer to the end of the extra data. */
5450
5451 if (class_has_8bitchar > 0)
5452 {
5453 *code++ |= XCL_MAP;
5454 memmove(code + (32 / sizeof(pcre_uchar)), code,
5455 IN_UCHARS(class_uchardata - code));
5456 if (negate_class && !xclass_has_prop)
5457 for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5458 memcpy(code, classbits, 32);
5459 code = class_uchardata + (32 / sizeof(pcre_uchar));
5460 }
5461 else code = class_uchardata;
5462
5463 /* Now fill in the complete length of the item */
5464
5465 PUT(previous, 1, (int)(code - previous));
5466 break; /* End of class handling */
5467 }
5468 #endif
5469
5470 /* If there are no characters > 255, or they are all to be included or
5471 excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5472 whole class was negated and whether there were negative specials such as \S
5473 (non-UCP) in the class. Then copy the 32-byte map into the code vector,
5474 negating it if necessary. */
5475
5476 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5477 if (lengthptr == NULL) /* Save time in the pre-compile phase */
5478 {
5479 if (negate_class)
5480 for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5481 memcpy(code, classbits, 32);
5482 }
5483 code += 32 / sizeof(pcre_uchar);
5484
5485 END_CLASS:
5486 break;
5487
5488
5489 /* ===================================================================*/
5490 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
5491 has been tested above. */
5492
5493 case CHAR_LEFT_CURLY_BRACKET:
5494 if (!is_quantifier) goto NORMAL_CHAR;
5495 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
5496 if (*errorcodeptr != 0) goto FAILED;
5497 goto REPEAT;
5498
5499 case CHAR_ASTERISK:
5500 repeat_min = 0;
5501 repeat_max = -1;
5502 goto REPEAT;
5503
5504 case CHAR_PLUS:
5505 repeat_min = 1;
5506 repeat_max = -1;
5507 goto REPEAT;
5508
5509 case CHAR_QUESTION_MARK:
5510 repeat_min = 0;
5511 repeat_max = 1;
5512
5513 REPEAT:
5514 if (previous == NULL)
5515 {
5516 *errorcodeptr = ERR9;
5517 goto FAILED;
5518 }
5519
5520 if (repeat_min == 0)
5521 {
5522 firstchar = zerofirstchar; /* Adjust for zero repeat */
5523 firstcharflags = zerofirstcharflags;
5524 reqchar = zeroreqchar; /* Ditto */
5525 reqcharflags = zeroreqcharflags;
5526 }
5527
5528 /* Remember whether this is a variable length repeat */
5529
5530 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
5531
5532 op_type = 0; /* Default single-char op codes */
5533 possessive_quantifier = FALSE; /* Default not possessive quantifier */
5534
5535 /* Save start of previous item, in case we have to move it up in order to
5536 insert something before it. */
5537
5538 tempcode = previous;
5539
5540 /* Before checking for a possessive quantifier, we must skip over
5541 whitespace and comments in extended mode because Perl allows white space at
5542 this point. */
5543
5544 if ((options & PCRE_EXTENDED) != 0)
5545 {
5546 const pcre_uchar *p = ptr + 1;
5547 for (;;)
5548 {
5549 while (MAX_255(*p) && (cd->ctypes[*p] & ctype_space) != 0) p++;
5550 if (*p != CHAR_NUMBER_SIGN) break;
5551 p++;
5552 while (*p != CHAR_NULL)
5553 {
5554 if (IS_NEWLINE(p)) /* For non-fixed-length newline cases, */
5555 { /* IS_NEWLINE sets cd->nllen. */
5556 p += cd->nllen;
5557 break;
5558 }
5559 p++;
5560 #ifdef SUPPORT_UTF
5561 if (utf) FORWARDCHAR(p);
5562 #endif
5563 } /* Loop for comment characters */
5564 } /* Loop for multiple comments */
5565 ptr = p - 1; /* Character before the next significant one. */
5566 }
5567
5568 /* If the next character is '+', we have a possessive quantifier. This
5569 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
5570 If the next character is '?' this is a minimizing repeat, by default,
5571 but if PCRE_UNGREEDY is set, it works the other way round. We change the
5572 repeat type to the non-default. */
5573
5574 if (ptr[1] == CHAR_PLUS)
5575 {
5576 repeat_type = 0; /* Force greedy */
5577 possessive_quantifier = TRUE;
5578 ptr++;
5579 }
5580 else if (ptr[1] == CHAR_QUESTION_MARK)
5581 {
5582 repeat_type = greedy_non_default;
5583 ptr++;
5584 }
5585 else repeat_type = greedy_default;
5586
5587 /* If previous was a recursion call, wrap it in atomic brackets so that
5588 previous becomes the atomic group. All recursions were so wrapped in the
5589 past, but it no longer happens for non-repeated recursions. In fact, the
5590 repeated ones could be re-implemented independently so as not to need this,
5591 but for the moment we rely on the code for repeating groups. */
5592
5593 if (*previous == OP_RECURSE)
5594 {
5595 memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
5596 *previous = OP_ONCE;
5597 PUT(previous, 1, 2 + 2*LINK_SIZE);
5598 previous[2 + 2*LINK_SIZE] = OP_KET;
5599 PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
5600 code += 2 + 2 * LINK_SIZE;
5601 length_prevgroup = 3 + 3*LINK_SIZE;
5602
5603 /* When actually compiling, we need to check whether this was a forward
5604 reference, and if so, adjust the offset. */
5605
5606 if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
5607 {
5608 int offset = GET(cd->hwm, -LINK_SIZE);
5609 if (offset == previous + 1 - cd->start_code)
5610 PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
5611 }
5612 }
5613
5614 /* Now handle repetition for the different types of item. */
5615
5616 /* If previous was a character or negated character match, abolish the item
5617 and generate a repeat item instead. If a char item has a minimum of more
5618 than one, ensure that it is set in reqchar - it might not be if a sequence
5619 such as x{3} is the first thing in a branch because the x will have gone
5620 into firstchar instead. */
5621
5622 if (*previous == OP_CHAR || *previous == OP_CHARI
5623 || *previous == OP_NOT || *previous == OP_NOTI)
5624 {
5625 switch (*previous)
5626 {
5627 default: /* Make compiler happy. */
5628 case OP_CHAR: op_type = OP_STAR - OP_STAR; break;
5629 case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
5630 case OP_NOT: op_type = OP_NOTSTAR - OP_STAR; break;
5631 case OP_NOTI: op_type = OP_NOTSTARI - OP_STAR; break;
5632 }
5633
5634 /* Deal with UTF characters that take up more than one character. It's
5635 easier to write this out separately than try to macrify it. Use c to
5636 hold the length of the character in bytes, plus UTF_LENGTH to flag that
5637 it's a length rather than a small character. */
5638
5639 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5640 if (utf && NOT_FIRSTCHAR(code[-1]))
5641 {
5642 pcre_uchar *lastchar = code - 1;
5643 BACKCHAR(lastchar);
5644 c = (int)(code - lastchar); /* Length of UTF-8 character */
5645 memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
5646 c |= UTF_LENGTH; /* Flag c as a length */
5647 }
5648 else
5649 #endif /* SUPPORT_UTF */
5650
5651 /* Handle the case of a single character - either with no UTF support, or
5652 with UTF disabled, or for a single character UTF character. */
5653 {
5654 c = code[-1];
5655 if (*previous <= OP_CHARI && repeat_min > 1)
5656 {
5657 reqchar = c;
5658 reqcharflags = req_caseopt | cd->req_varyopt;
5659 }
5660 }
5661
5662 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
5663 }
5664
5665 /* If previous was a character type match (\d or similar), abolish it and
5666 create a suitable repeat item. The code is shared with single-character
5667 repeats by setting op_type to add a suitable offset into repeat_type. Note
5668 that the Unicode property types will be present only when SUPPORT_UCP is
5669 defined, but we don't wrap the little bits of code here because it just
5670 makes it horribly messy. */
5671
5672 else if (*previous < OP_EODN)
5673 {
5674 pcre_uchar *oldcode;
5675 int prop_type, prop_value;
5676 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
5677 c = *previous;
5678
5679 OUTPUT_SINGLE_REPEAT:
5680 if (*previous == OP_PROP || *previous == OP_NOTPROP)
5681 {
5682 prop_type = previous[1];
5683 prop_value = previous[2];
5684 }
5685 else prop_type = prop_value = -1;
5686
5687 oldcode = code;
5688 code = previous; /* Usually overwrite previous item */
5689
5690 /* If the maximum is zero then the minimum must also be zero; Perl allows
5691 this case, so we do too - by simply omitting the item altogether. */
5692
5693 if (repeat_max == 0) goto END_REPEAT;
5694
5695 /* Combine the op_type with the repeat_type */
5696
5697 repeat_type += op_type;
5698
5699 /* A minimum of zero is handled either as the special case * or ?, or as
5700 an UPTO, with the maximum given. */
5701
5702 if (repeat_min == 0)
5703 {
5704 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
5705 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
5706 else
5707 {
5708 *code++ = OP_UPTO + repeat_type;
5709 PUT2INC(code, 0, repeat_max);
5710 }
5711 }
5712
5713 /* A repeat minimum of 1 is optimized into some special cases. If the
5714 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
5715 left in place and, if the maximum is greater than 1, we use OP_UPTO with
5716 one less than the maximum. */
5717
5718 else if (repeat_min == 1)
5719 {
5720 if (repeat_max == -1)
5721 *code++ = OP_PLUS + repeat_type;
5722 else
5723 {
5724 code = oldcode; /* leave previous item in place */
5725 if (repeat_max == 1) goto END_REPEAT;
5726 *code++ = OP_UPTO + repeat_type;
5727 PUT2INC(code, 0, repeat_max - 1);
5728 }
5729 }
5730
5731 /* The case {n,n} is just an EXACT, while the general case {n,m} is
5732 handled as an EXACT followed by an UPTO. */
5733
5734 else
5735 {
5736 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
5737 PUT2INC(code, 0, repeat_min);
5738
5739 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
5740 we have to insert the character for the previous code. For a repeated
5741 Unicode property match, there are two extra bytes that define the
5742 required property. In UTF-8 mode, long characters have their length in
5743 c, with the UTF_LENGTH bit as a flag. */
5744
5745 if (repeat_max < 0)
5746 {
5747 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5748 if (utf && (c & UTF_LENGTH) != 0)
5749 {
5750 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5751 code += c & 7;
5752 }
5753 else
5754 #endif
5755 {
5756 *code++ = c;
5757 if (prop_type >= 0)
5758 {
5759 *code++ = prop_type;
5760 *code++ = prop_value;
5761 }
5762 }
5763 *code++ = OP_STAR + repeat_type;
5764 }
5765
5766 /* Else insert an UPTO if the max is greater than the min, again
5767 preceded by the character, for the previously inserted code. If the
5768 UPTO is just for 1 instance, we can use QUERY instead. */
5769
5770 else if (repeat_max != repeat_min)
5771 {
5772 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5773 if (utf && (c & UTF_LENGTH) != 0)
5774 {
5775 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5776 code += c & 7;
5777 }
5778 else
5779 #endif
5780 *code++ = c;
5781 if (prop_type >= 0)
5782 {
5783 *code++ = prop_type;
5784 *code++ = prop_value;
5785 }
5786 repeat_max -= repeat_min;
5787
5788 if (repeat_max == 1)
5789 {
5790 *code++ = OP_QUERY + repeat_type;
5791 }
5792 else
5793 {
5794 *code++ = OP_UPTO + repeat_type;
5795 PUT2INC(code, 0, repeat_max);
5796 }
5797 }
5798 }
5799
5800 /* The character or character type itself comes last in all cases. */
5801
5802 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5803 if (utf && (c & UTF_LENGTH) != 0)
5804 {
5805 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5806 code += c & 7;
5807 }
5808 else
5809 #endif
5810 *code++ = c;
5811
5812 /* For a repeated Unicode property match, there are two extra bytes that
5813 define the required property. */
5814
5815 #ifdef SUPPORT_UCP
5816 if (prop_type >= 0)
5817 {
5818 *code++ = prop_type;
5819 *code++ = prop_value;
5820 }
5821 #endif
5822 }
5823
5824 /* If previous was a character class or a back reference, we put the repeat
5825 stuff after it, but just skip the item if the repeat was {0,0}. */
5826
5827 else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
5828 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5829 *previous == OP_XCLASS ||
5830 #endif
5831 *previous == OP_REF || *previous == OP_REFI ||
5832 *previous == OP_DNREF || *previous == OP_DNREFI)
5833 {
5834 if (repeat_max == 0)
5835 {
5836 code = previous;
5837 goto END_REPEAT;
5838 }
5839
5840 if (repeat_min == 0 && repeat_max == -1)
5841 *code++ = OP_CRSTAR + repeat_type;
5842 else if (repeat_min == 1 && repeat_max == -1)
5843 *code++ = OP_CRPLUS + repeat_type;
5844 else if (repeat_min == 0 && repeat_max == 1)
5845 *code++ = OP_CRQUERY + repeat_type;
5846 else
5847 {
5848 *code++ = OP_CRRANGE + repeat_type;
5849 PUT2INC(code, 0, repeat_min);
5850 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
5851 PUT2INC(code, 0, repeat_max);
5852 }
5853 }
5854
5855 /* If previous was a bracket group, we may have to replicate it in certain
5856 cases. Note that at this point we can encounter only the "basic" bracket
5857 opcodes such as BRA and CBRA, as this is the place where they get converted
5858 into the more special varieties such as BRAPOS and SBRA. A test for >=
5859 OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5860 ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND.
5861 Originally, PCRE did not allow repetition of assertions, but now it does,
5862 for Perl compatibility. */
5863
5864 else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5865 {
5866 register int i;
5867 int len = (int)(code - previous);
5868 pcre_uchar *bralink = NULL;
5869 pcre_uchar *brazeroptr = NULL;
5870
5871 /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
5872 we just ignore the repeat. */
5873
5874 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
5875 goto END_REPEAT;
5876
5877 /* There is no sense in actually repeating assertions. The only potential
5878 use of repetition is in cases when the assertion is optional. Therefore,
5879 if the minimum is greater than zero, just ignore the repeat. If the
5880 maximum is not zero or one, set it to 1. */
5881
5882 if (*previous < OP_ONCE) /* Assertion */
5883 {
5884 if (repeat_min > 0) goto END_REPEAT;
5885 if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
5886 }
5887
5888 /* The case of a zero minimum is special because of the need to stick
5889 OP_BRAZERO in front of it, and because the group appears once in the
5890 data, whereas in other cases it appears the minimum number of times. For
5891 this reason, it is simplest to treat this case separately, as otherwise
5892 the code gets far too messy. There are several special subcases when the
5893 minimum is zero. */
5894
5895 if (repeat_min == 0)
5896 {
5897 /* If the maximum is also zero, we used to just omit the group from the
5898 output altogether, like this:
5899
5900 ** if (repeat_max == 0)
5901 ** {
5902 ** code = previous;
5903 ** goto END_REPEAT;
5904 ** }
5905
5906 However, that fails when a group or a subgroup within it is referenced
5907 as a subroutine from elsewhere in the pattern, so now we stick in
5908 OP_SKIPZERO in front of it so that it is skipped on execution. As we
5909 don't have a list of which groups are referenced, we cannot do this
5910 selectively.
5911
5912 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
5913 and do no more at this point. However, we do need to adjust any
5914 OP_RECURSE calls inside the group that refer to the group itself or any
5915 internal or forward referenced group, because the offset is from the
5916 start of the whole regex. Temporarily terminate the pattern while doing
5917 this. */
5918
5919 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
5920 {
5921 *code = OP_END;
5922 adjust_recurse(previous, 1, utf, cd, save_hwm);
5923 memmove(previous + 1, previous, IN_UCHARS(len));
5924 code++;
5925 if (repeat_max == 0)
5926 {
5927 *previous++ = OP_SKIPZERO;
5928 goto END_REPEAT;
5929 }
5930 brazeroptr = previous; /* Save for possessive optimizing */
5931 *previous++ = OP_BRAZERO + repeat_type;
5932 }
5933
5934 /* If the maximum is greater than 1 and limited, we have to replicate
5935 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
5936 The first one has to be handled carefully because it's the original
5937 copy, which has to be moved up. The remainder can be handled by code
5938 that is common with the non-zero minimum case below. We have to
5939 adjust the value of repeat_max, since one less copy is required. Once
5940 again, we may have to adjust any OP_RECURSE calls inside the group. */
5941
5942 else
5943 {
5944 int offset;
5945 *code = OP_END;
5946 adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);
5947 memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
5948 code += 2 + LINK_SIZE;
5949 *previous++ = OP_BRAZERO + repeat_type;
5950 *previous++ = OP_BRA;
5951
5952 /* We chain together the bracket offset fields that have to be
5953 filled in later when the ends of the brackets are reached. */
5954
5955 offset = (bralink == NULL)? 0 : (int)(previous - bralink);
5956 bralink = previous;
5957 PUTINC(previous, 0, offset);
5958 }
5959
5960 repeat_max--;
5961 }
5962
5963 /* If the minimum is greater than zero, replicate the group as many
5964 times as necessary, and adjust the maximum to the number of subsequent
5965 copies that we need. If we set a first char from the group, and didn't
5966 set a required char, copy the latter from the former. If there are any
5967 forward reference subroutine calls in the group, there will be entries on
5968 the workspace list; replicate these with an appropriate increment. */
5969
5970 else
5971 {
5972 if (repeat_min > 1)
5973 {
5974 /* In the pre-compile phase, we don't actually do the replication. We
5975 just adjust the length as if we had. Do some paranoid checks for
5976 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
5977 integer type when available, otherwise double. */
5978
5979 if (lengthptr != NULL)
5980 {
5981 int delta = (repeat_min - 1)*length_prevgroup;
5982 if ((INT64_OR_DOUBLE)(repeat_min - 1)*
5983 (INT64_OR_DOUBLE)length_prevgroup >
5984 (INT64_OR_DOUBLE)INT_MAX ||
5985 OFLOW_MAX - *lengthptr < delta)
5986 {
5987 *errorcodeptr = ERR20;
5988 goto FAILED;
5989 }
5990 *lengthptr += delta;
5991 }
5992
5993 /* This is compiling for real. If there is a set first byte for
5994 the group, and we have not yet set a "required byte", set it. Make
5995 sure there is enough workspace for copying forward references before
5996 doing the copy. */
5997
5998 else
5999 {
6000 if (groupsetfirstchar && reqcharflags < 0)
6001 {
6002 reqchar = firstchar;
6003 reqcharflags = firstcharflags;
6004 }
6005
6006 for (i = 1; i < repeat_min; i++)
6007 {
6008 pcre_uchar *hc;
6009 pcre_uchar *this_hwm = cd->hwm;
6010 memcpy(code, previous, IN_UCHARS(len));
6011
6012 while (cd->hwm > cd->start_workspace + cd->workspace_size -
6013 WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
6014 {
6015 size_t save_offset = save_hwm - cd->start_workspace;
6016 size_t this_offset = this_hwm - cd->start_workspace;
6017 *errorcodeptr = expand_workspace(cd);
6018 if (*errorcodeptr != 0) goto FAILED;
6019 save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
6020 this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
6021 }
6022
6023 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
6024 {
6025 PUT(cd->hwm, 0, GET(hc, 0) + len);
6026 cd->hwm += LINK_SIZE;
6027 }
6028 save_hwm = this_hwm;
6029 code += len;
6030 }
6031 }
6032 }
6033
6034 if (repeat_max > 0) repeat_max -= repeat_min;
6035 }
6036
6037 /* This code is common to both the zero and non-zero minimum cases. If
6038 the maximum is limited, it replicates the group in a nested fashion,
6039 remembering the bracket starts on a stack. In the case of a zero minimum,
6040 the first one was set up above. In all cases the repeat_max now specifies
6041 the number of additional copies needed. Again, we must remember to
6042 replicate entries on the forward reference list. */
6043
6044 if (repeat_max >= 0)
6045 {
6046 /* In the pre-compile phase, we don't actually do the replication. We
6047 just adjust the length as if we had. For each repetition we must add 1
6048 to the length for BRAZERO and for all but the last repetition we must
6049 add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
6050 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
6051 a 64-bit integer type when available, otherwise double. */
6052
6053 if (lengthptr != NULL && repeat_max > 0)
6054 {
6055 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
6056 2 - 2*LINK_SIZE; /* Last one doesn't nest */
6057 if ((INT64_OR_DOUBLE)repeat_max *
6058 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
6059 > (INT64_OR_DOUBLE)INT_MAX ||
6060 OFLOW_MAX - *lengthptr < delta)
6061 {
6062 *errorcodeptr = ERR20;
6063 goto FAILED;
6064 }
6065 *lengthptr += delta;
6066 }
6067
6068 /* This is compiling for real */
6069
6070 else for (i = repeat_max - 1; i >= 0; i--)
6071 {
6072 pcre_uchar *hc;
6073 pcre_uchar *this_hwm = cd->hwm;
6074
6075 *code++ = OP_BRAZERO + repeat_type;
6076
6077 /* All but the final copy start a new nesting, maintaining the
6078 chain of brackets outstanding. */
6079
6080 if (i != 0)
6081 {
6082 int offset;
6083 *code++ = OP_BRA;
6084 offset = (bralink == NULL)? 0 : (int)(code - bralink);
6085 bralink = code;
6086 PUTINC(code, 0, offset);
6087 }
6088
6089 memcpy(code, previous, IN_UCHARS(len));
6090
6091 /* Ensure there is enough workspace for forward references before
6092 copying them. */
6093
6094 while (cd->hwm > cd->start_workspace + cd->workspace_size -
6095 WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
6096 {
6097 size_t save_offset = save_hwm - cd->start_workspace;
6098 size_t this_offset = this_hwm - cd->start_workspace;
6099 *errorcodeptr = expand_workspace(cd);
6100 if (*errorcodeptr != 0) goto FAILED;
6101 save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
6102 this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
6103 }
6104
6105 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
6106 {
6107 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
6108 cd->hwm += LINK_SIZE;
6109 }
6110 save_hwm = this_hwm;
6111 code += len;
6112 }
6113
6114 /* Now chain through the pending brackets, and fill in their length
6115 fields (which are holding the chain links pro tem). */
6116
6117 while (bralink != NULL)
6118 {
6119 int oldlinkoffset;
6120 int offset = (int)(code - bralink + 1);
6121 pcre_uchar *bra = code - offset;
6122 oldlinkoffset = GET(bra, 1);
6123 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
6124 *code++ = OP_KET;
6125 PUTINC(code, 0, offset);
6126 PUT(bra, 1, offset);
6127 }
6128 }
6129
6130 /* If the maximum is unlimited, set a repeater in the final copy. For
6131 ONCE brackets, that's all we need to do. However, possessively repeated
6132 ONCE brackets can be converted into non-capturing brackets, as the
6133 behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
6134 deal with possessive ONCEs specially.
6135
6136 Otherwise, when we are doing the actual compile phase, check to see
6137 whether this group is one that could match an empty string. If so,
6138 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
6139 that runtime checking can be done. [This check is also applied to ONCE
6140 groups at runtime, but in a different way.]
6141
6142 Then, if the quantifier was possessive and the bracket is not a
6143 conditional, we convert the BRA code to the POS form, and the KET code to
6144 KETRPOS. (It turns out to be convenient at runtime to detect this kind of
6145 subpattern at both the start and at the end.) The use of special opcodes
6146 makes it possible to reduce greatly the stack usage in pcre_exec(). If
6147 the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
6148
6149 Then, if the minimum number of matches is 1 or 0, cancel the possessive
6150 flag so that the default action below, of wrapping everything inside
6151 atomic brackets, does not happen. When the minimum is greater than 1,
6152 there will be earlier copies of the group, and so we still have to wrap
6153 the whole thing. */
6154
6155 else
6156 {
6157 pcre_uchar *ketcode = code - 1 - LINK_SIZE;
6158 pcre_uchar *bracode = ketcode - GET(ketcode, 1);
6159
6160 /* Convert possessive ONCE brackets to non-capturing */
6161
6162 if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
6163 possessive_quantifier) *bracode = OP_BRA;
6164
6165 /* For non-possessive ONCE brackets, all we need to do is to
6166 set the KET. */
6167
6168 if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
6169 *ketcode = OP_KETRMAX + repeat_type;
6170
6171 /* Handle non-ONCE brackets and possessive ONCEs (which have been
6172 converted to non-capturing above). */
6173
6174 else
6175 {
6176 /* In the compile phase, check for empty string matching. */
6177
6178 if (lengthptr == NULL)
6179 {
6180 pcre_uchar *scode = bracode;
6181 do
6182 {
6183 if (could_be_empty_branch(scode, ketcode, utf, cd, NULL))
6184 {
6185 *bracode += OP_SBRA - OP_BRA;
6186 break;
6187 }
6188 scode += GET(scode, 1);
6189 }
6190 while (*scode == OP_ALT);
6191 }
6192
6193 /* Handle possessive quantifiers. */
6194
6195 if (possessive_quantifier)
6196 {
6197 /* For COND brackets, we wrap the whole thing in a possessively
6198 repeated non-capturing bracket, because we have not invented POS
6199 versions of the COND opcodes. Because we are moving code along, we
6200 must ensure that any pending recursive references are updated. */
6201
6202 if (*bracode == OP_COND || *bracode == OP_SCOND)
6203 {
6204 int nlen = (int)(code - bracode);
6205 *code = OP_END;
6206 adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm);
6207 memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
6208 code += 1 + LINK_SIZE;
6209 nlen += 1 + LINK_SIZE;
6210 *bracode = OP_BRAPOS;
6211 *code++ = OP_KETRPOS;
6212 PUTINC(code, 0, nlen);
6213 PUT(bracode, 1, nlen);
6214 }
6215
6216 /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
6217
6218 else
6219 {
6220 *bracode += 1; /* Switch to xxxPOS opcodes */
6221 *ketcode = OP_KETRPOS;
6222 }
6223
6224 /* If the minimum is zero, mark it as possessive, then unset the
6225 possessive flag when the minimum is 0 or 1. */
6226
6227 if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
6228 if (repeat_min < 2) possessive_quantifier = FALSE;
6229 }
6230
6231 /* Non-possessive quantifier */
6232
6233 else *ketcode = OP_KETRMAX + repeat_type;
6234 }
6235 }
6236 }
6237
6238 /* If previous is OP_FAIL, it was generated by an empty class [] in
6239 JavaScript mode. The other ways in which OP_FAIL can be generated, that is
6240 by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
6241 error above. We can just ignore the repeat in JS case. */
6242
6243 else if (*previous == OP_FAIL) goto END_REPEAT;
6244
6245 /* Else there's some kind of shambles */
6246
6247 else
6248 {
6249 *errorcodeptr = ERR11;
6250 goto FAILED;
6251 }
6252
6253 /* If the character following a repeat is '+', possessive_quantifier is
6254 TRUE. For some opcodes, there are special alternative opcodes for this
6255 case. For anything else, we wrap the entire repeated item inside OP_ONCE
6256 brackets. Logically, the '+' notation is just syntactic sugar, taken from
6257 Sun's Java package, but the special opcodes can optimize it.
6258
6259 Some (but not all) possessively repeated subpatterns have already been
6260 completely handled in the code just above. For them, possessive_quantifier
6261 is always FALSE at this stage. Note that the repeated item starts at
6262 tempcode, not at previous, which might be the first part of a string whose
6263 (former) last char we repeated. */
6264
6265 if (possessive_quantifier)
6266 {
6267 int len;
6268
6269 /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
6270 However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
6271 {5,}, or {5,10}). We skip over an EXACT item; if the length of what
6272 remains is greater than zero, there's a further opcode that can be
6273 handled. If not, do nothing, leaving the EXACT alone. */
6274
6275 switch(*tempcode)
6276 {
6277 case OP_TYPEEXACT:
6278 tempcode += PRIV(OP_lengths)[*tempcode] +
6279 ((tempcode[1 + IMM2_SIZE] == OP_PROP
6280 || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
6281 break;
6282
6283 /* CHAR opcodes are used for exacts whose count is 1. */
6284
6285 case OP_CHAR:
6286 case OP_CHARI:
6287 case OP_NOT:
6288 case OP_NOTI:
6289 case OP_EXACT:
6290 case OP_EXACTI:
6291 case OP_NOTEXACT:
6292 case OP_NOTEXACTI:
6293 tempcode += PRIV(OP_lengths)[*tempcode];
6294 #ifdef SUPPORT_UTF
6295 if (utf && HAS_EXTRALEN(tempcode[-1]))
6296 tempcode += GET_EXTRALEN(tempcode[-1]);
6297 #endif
6298 break;
6299
6300 /* For the class opcodes, the repeat operator appears at the end;
6301 adjust tempcode to point to it. */
6302
6303 case OP_CLASS:
6304 case OP_NCLASS:
6305 tempcode += 1 + 32/sizeof(pcre_uchar);
6306 break;
6307
6308 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6309 case OP_XCLASS:
6310 tempcode += GET(tempcode, 1);
6311 break;
6312 #endif
6313 }
6314
6315 /* If tempcode is equal to code (which points to the end of the repeated
6316 item), it means we have skipped an EXACT item but there is no following
6317 QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
6318 all other cases, tempcode will be pointing to the repeat opcode, and will
6319 be less than code, so the value of len will be greater than 0. */
6320
6321 len = (int)(code - tempcode);
6322 if (len > 0)
6323 {
6324 unsigned int repcode = *tempcode;
6325
6326 /* There is a table for possessifying opcodes, all of which are less
6327 than OP_CALLOUT. A zero entry means there is no possessified version.
6328 */
6329
6330 if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
6331 *tempcode = opcode_possessify[repcode];
6332
6333 /* For opcodes without a special possessified version, wrap the item in
6334 ONCE brackets. Because we are moving code along, we must ensure that any
6335 pending recursive references are updated. */
6336
6337 else
6338 {
6339 *code = OP_END;
6340 adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
6341 memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6342 code += 1 + LINK_SIZE;
6343 len += 1 + LINK_SIZE;
6344 tempcode[0] = OP_ONCE;
6345 *code++ = OP_KET;
6346 PUTINC(code, 0, len);
6347 PUT(tempcode, 1, len);
6348 }
6349 }
6350
6351 #ifdef NEVER
6352 if (len > 0) switch (*tempcode)
6353 {
6354 case OP_STAR: *tempcode = OP_POSSTAR; break;
6355 case OP_PLUS: *tempcode = OP_POSPLUS; break;
6356 case OP_QUERY: *tempcode = OP_POSQUERY; break;
6357 case OP_UPTO: *tempcode = OP_POSUPTO; break;
6358
6359 case OP_STARI: *tempcode = OP_POSSTARI; break;
6360 case OP_PLUSI: *tempcode = OP_POSPLUSI; break;
6361 case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
6362 case OP_UPTOI: *tempcode = OP_POSUPTOI; break;
6363
6364 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
6365 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
6366 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
6367 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
6368
6369 case OP_NOTSTARI: *tempcode = OP_NOTPOSSTARI; break;
6370 case OP_NOTPLUSI: *tempcode = OP_NOTPOSPLUSI; break;
6371 case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
6372 case OP_NOTUPTOI: *tempcode = OP_NOTPOSUPTOI; break;
6373
6374 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
6375 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
6376 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
6377 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
6378
6379 case OP_CRSTAR: *tempcode = OP_CRPOSSTAR; break;
6380 case OP_CRPLUS: *tempcode = OP_CRPOSPLUS; break;
6381 case OP_CRQUERY: *tempcode = OP_CRPOSQUERY; break;
6382 case OP_CRRANGE: *tempcode = OP_CRPOSRANGE; break;
6383
6384 /* Because we are moving code along, we must ensure that any
6385 pending recursive references are updated. */
6386
6387 default:
6388 *code = OP_END;
6389 adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
6390 memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6391 code += 1 + LINK_SIZE;
6392 len += 1 + LINK_SIZE;
6393 tempcode[0] = OP_ONCE;
6394 *code++ = OP_KET;
6395 PUTINC(code, 0, len);
6396 PUT(tempcode, 1, len);
6397 break;
6398 }
6399 #endif
6400 }
6401
6402 /* In all cases we no longer have a previous item. We also set the
6403 "follows varying string" flag for subsequently encountered reqchars if
6404 it isn't already set and we have just passed a varying length item. */
6405
6406 END_REPEAT:
6407 previous = NULL;
6408 cd->req_varyopt |= reqvary;
6409 break;
6410
6411
6412 /* ===================================================================*/
6413 /* Start of nested parenthesized sub-expression, or comment or lookahead or
6414 lookbehind or option setting or condition or all the other extended
6415 parenthesis forms. */
6416
6417 case CHAR_LEFT_PARENTHESIS:
6418 newoptions = options;
6419 skipbytes = 0;
6420 bravalue = OP_CBRA;
6421 save_hwm = cd->hwm;
6422 reset_bracount = FALSE;
6423
6424 /* First deal with various "verbs" that can be introduced by '*'. */
6425
6426 ptr++;
6427 if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
6428 || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
6429 {
6430 int i, namelen;
6431 int arglen = 0;
6432 const char *vn = verbnames;
6433 const pcre_uchar *name = ptr + 1;
6434 const pcre_uchar *arg = NULL;
6435 previous = NULL;
6436 ptr++;
6437 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
6438 namelen = (int)(ptr - name);
6439
6440 /* It appears that Perl allows any characters whatsoever, other than
6441 a closing parenthesis, to appear in arguments, so we no longer insist on
6442 letters, digits, and underscores. */
6443
6444 if (*ptr == CHAR_COLON)
6445 {
6446 arg = ++ptr;
6447 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6448 arglen = (int)(ptr - arg);
6449 if ((unsigned int)arglen > MAX_MARK)
6450 {
6451 *errorcodeptr = ERR75;
6452 goto FAILED;
6453 }
6454 }
6455
6456 if (*ptr != CHAR_RIGHT_PARENTHESIS)
6457 {
6458 *errorcodeptr = ERR60;
6459 goto FAILED;
6460 }
6461
6462 /* Scan the table of verb names */
6463
6464 for (i = 0; i < verbcount; i++)
6465 {
6466 if (namelen == verbs[i].len &&
6467 STRNCMP_UC_C8(name, vn, namelen) == 0)
6468 {
6469 int setverb;
6470
6471 /* Check for open captures before ACCEPT and convert it to
6472 ASSERT_ACCEPT if in an assertion. */
6473
6474 if (verbs[i].op == OP_ACCEPT)
6475 {
6476 open_capitem *oc;
6477 if (arglen != 0)
6478 {
6479 *errorcodeptr = ERR59;
6480 goto FAILED;
6481 }
6482 cd->had_accept = TRUE;
6483 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6484 {
6485 *code++ = OP_CLOSE;
6486 PUT2INC(code, 0, oc->number);
6487 }
6488 setverb = *code++ =
6489 (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6490
6491 /* Do not set firstchar after *ACCEPT */
6492 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
6493 }
6494
6495 /* Handle other cases with/without an argument */
6496
6497 else if (arglen == 0)
6498 {
6499 if (verbs[i].op < 0) /* Argument is mandatory */
6500 {
6501 *errorcodeptr = ERR66;
6502 goto FAILED;
6503 }
6504 setverb = *code++ = verbs[i].op;
6505 }
6506
6507 else
6508 {
6509 if (verbs[i].op_arg < 0) /* Argument is forbidden */
6510 {
6511 *errorcodeptr = ERR59;
6512 goto FAILED;
6513 }
6514 setverb = *code++ = verbs[i].op_arg;
6515 *code++ = arglen;
6516 memcpy(code, arg, IN_UCHARS(arglen));
6517 code += arglen;
6518 *code++ = 0;
6519 }
6520
6521 switch (setverb)
6522 {
6523 case OP_THEN:
6524 case OP_THEN_ARG:
6525 cd->external_flags |= PCRE_HASTHEN;
6526 break;
6527
6528 case OP_PRUNE:
6529 case OP_PRUNE_ARG:
6530 case OP_SKIP:
6531 case OP_SKIP_ARG:
6532 cd->had_pruneorskip = TRUE;
6533 break;
6534 }
6535
6536 break; /* Found verb, exit loop */
6537 }
6538
6539 vn += verbs[i].len + 1;
6540 }
6541
6542 if (i < verbcount) continue; /* Successfully handled a verb */
6543 *errorcodeptr = ERR60; /* Verb not recognized */
6544 goto FAILED;
6545 }
6546
6547 /* Deal with the extended parentheses; all are introduced by '?', and the
6548 appearance of any of them means that this is not a capturing group. */
6549
6550 else if (*ptr == CHAR_QUESTION_MARK)
6551 {
6552 int i, set, unset, namelen;
6553 int *optset;
6554 const pcre_uchar *name;
6555 pcre_uchar *slot;
6556
6557 switch (*(++ptr))
6558 {
6559 case CHAR_NUMBER_SIGN: /* Comment; skip to ket */
6560 ptr++;
6561 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6562 if (*ptr == CHAR_NULL)
6563 {
6564 *errorcodeptr = ERR18;
6565 goto FAILED;
6566 }
6567 continue;
6568
6569
6570 /* ------------------------------------------------------------ */
6571 case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
6572 reset_bracount = TRUE;
6573 /* Fall through */
6574
6575 /* ------------------------------------------------------------ */
6576 case CHAR_COLON: /* Non-capturing bracket */
6577 bravalue = OP_BRA;
6578 ptr++;
6579 break;
6580
6581
6582 /* ------------------------------------------------------------ */
6583 case CHAR_LEFT_PARENTHESIS:
6584 bravalue = OP_COND; /* Conditional group */
6585 tempptr = ptr;
6586
6587 /* A condition can be an assertion, a number (referring to a numbered
6588 group's having been set), a name (referring to a named group), or 'R',
6589 referring to recursion. R<digits> and R&name are also permitted for
6590 recursion tests.
6591
6592 There are several ways of testing a named group: (?(name)) is used by Python;
6593 Perl 5.10 onwards uses (?(<name>) or (?('name')).
6594
6595 There is one unfortunate ambiguity, caused by history. 'R' can be the
6596 recursive thing or the name 'R' (and similarly for 'R' followed by
6597 digits). We look for a name first; if not found, we try the other case.
6598
6599 For compatibility with auto-callouts, we allow a callout to be
6600 specified before a condition that is an assertion. First, check for the
6601 syntax of a callout; if found, adjust the temporary pointer that is
6602 used to check for an assertion condition. That's all that is needed! */
6603
6604 if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
6605 {
6606 for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
6607 if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
6608 tempptr += i + 1;
6609 }
6610
6611 /* For conditions that are assertions, check the syntax, and then exit
6612 the switch. This will take control down to where bracketed groups,
6613 including assertions, are processed. */
6614
6615 if (tempptr[1] == CHAR_QUESTION_MARK &&
6616 (tempptr[2] == CHAR_EQUALS_SIGN ||
6617 tempptr[2] == CHAR_EXCLAMATION_MARK ||
6618 tempptr[2] == CHAR_LESS_THAN_SIGN))
6619 break;
6620
6621 /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6622 need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6623
6624 code[1+LINK_SIZE] = OP_CREF;
6625 skipbytes = 1+IMM2_SIZE;
6626 refsign = -1; /* => not a number */
6627 namelen = -1; /* => not a name; must set to avoid warning */
6628 name = NULL; /* Always set to avoid warning */
6629 recno = 0; /* Always set to avoid warning */
6630
6631 /* Check for a test for recursion in a named group. */
6632
6633 ptr++;
6634 if (*ptr == CHAR_R && ptr[1] == CHAR_AMPERSAND)
6635 {
6636 terminator = -1;
6637 ptr += 2;
6638 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
6639 }
6640
6641 /* Check for a test for a named group's having been set, using the Perl
6642 syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6643 syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). */
6644
6645 else if (*ptr == CHAR_LESS_THAN_SIGN)
6646 {
6647 terminator = CHAR_GREATER_THAN_SIGN;
6648 ptr++;
6649 }
6650 else if (*ptr == CHAR_APOSTROPHE)
6651 {
6652 terminator = CHAR_APOSTROPHE;
6653 ptr++;
6654 }
6655 else
6656 {
6657 terminator = CHAR_NULL;
6658 if (*ptr == CHAR_MINUS || *ptr == CHAR_PLUS) refsign = *ptr++;
6659 else if (IS_DIGIT(*ptr)) refsign = 0;