/[pcre]/code/branches/pcre16/pcre_compile.c
ViewVC logotype

Diff of /code/branches/pcre16/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 87 by nigel, Sat Feb 24 21:41:21 2007 UTC | revision 406 by ph10, Mon Mar 23 12:05:43 2009 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2006 University of Cambridge             Copyright (c) 1997-2009 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45    #ifdef HAVE_CONFIG_H
46    #include "config.h"
47    #endif
48    
49    #define NLBLOCK cd             /* Block containing newline information */
50    #define PSSTART start_pattern  /* Field containing processed string start */
51    #define PSEND   end_pattern    /* Field containing processed string end */
52    
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
# Line 53  used by pcretest. DEBUG is not defined w Line 61  used by pcretest. DEBUG is not defined w
61  #endif  #endif
62    
63    
64    /* Macro for setting individual bits in class bitmaps. */
65    
66    #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
67    
68    /* Maximum length value to check against when making sure that the integer that
69    holds the compiled pattern length does not overflow. We make it a bit less than
70    INT_MAX to allow for adding in group terminating bytes, so that we don't have
71    to check them every time. */
72    
73    #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76  /*************************************************  /*************************************************
77  *      Code parameters and static tables         *  *      Code parameters and static tables         *
78  *************************************************/  *************************************************/
79    
80  /* Maximum number of items on the nested bracket stacks at compile time. This  /* This value specifies the size of stack workspace that is used during the
81  applies to the nesting of all kinds of parentheses. It does not limit  first pre-compile phase that determines how much memory is required. The regex
82  un-nested, non-capturing parentheses. This number can be made bigger if  is partly compiled into this space, but the compiled parts are discarded as
83  necessary - it is used to dimension one int and one unsigned char vector at  soon as they can be, so that hopefully there will never be an overrun. The code
84  compile time. */  does, however, check for an overrun. The largest amount I've seen used is 218,
85    so this number is very generous.
86    
87    The same workspace is used during the second, actual compile phase for
88    remembering forward references to groups so that they can be filled in at the
89    end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90    is 4 there is plenty of room. */
91    
92  #define BRASTACK_SIZE 200  #define COMPILE_WORK_SIZE (4096)
93    
94    
95  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 72  are simple data values; negative values Line 97  are simple data values; negative values
97  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
98  is invalid. */  is invalid. */
99    
100  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC
101    
102    /* This is the "normal" table for ASCII systems or for EBCDIC systems running
103    in UTF-8 mode. */
104    
105  static const short int escapes[] = {  static const short int escapes[] = {
106       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,                       0,
107       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,                       0,
108     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */       0,                       0,
109       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */       0,                       0,
110  -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */       0,                       0,
111  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */       CHAR_COLON,              CHAR_SEMICOLON,
112     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */       CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
113       0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */       CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
114  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */       CHAR_COMMERCIAL_AT,      -ESC_A,
115       0,      0, -ESC_z                                            /* x - z */       -ESC_B,                  -ESC_C,
116         -ESC_D,                  -ESC_E,
117         0,                       -ESC_G,
118         -ESC_H,                  0,
119         0,                       -ESC_K,
120         0,                       0,
121         0,                       0,
122         -ESC_P,                  -ESC_Q,
123         -ESC_R,                  -ESC_S,
124         0,                       0,
125         -ESC_V,                  -ESC_W,
126         -ESC_X,                  0,
127         -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
128         CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
129         CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
130         CHAR_GRAVE_ACCENT,       7,
131         -ESC_b,                  0,
132         -ESC_d,                  ESC_e,
133         ESC_f,                   0,
134         -ESC_h,                  0,
135         0,                       -ESC_k,
136         0,                       0,
137         ESC_n,                   0,
138         -ESC_p,                  0,
139         ESC_r,                   -ESC_s,
140         ESC_tee,                 0,
141         -ESC_v,                  -ESC_w,
142         0,                       0,
143         -ESC_z
144  };  };
145    
146  #else         /* This is the "abnormal" table for EBCDIC systems */  #else
147    
148    /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
149    
150  static const short int escapes[] = {  static const short int escapes[] = {
151  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
152  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 96  static const short int escapes[] = { Line 156  static const short int escapes[] = {
156  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
157  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
158  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
159  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
160  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
161  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
162  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
163  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
164  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
165  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
166  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
167  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
168  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
169  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
170  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
171  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
172  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
173  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
# Line 115  static const short int escapes[] = { Line 175  static const short int escapes[] = {
175  #endif  #endif
176    
177    
178  /* Tables of names of POSIX character classes and their lengths. The list is  /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
179  terminated by a zero length entry. The first three must be alpha, lower, upper,  searched linearly. Put all the names into a single string, in order to reduce
180  as this is assumed for handling case independence. */  the number of relocations when a shared library is dynamically linked. The
181    string is built from string macros so that it works in UTF-8 mode on EBCDIC
182  static const char *const posix_names[] = {  platforms. */
183    "alpha", "lower", "upper",  
184    "alnum", "ascii", "blank", "cntrl", "digit", "graph",  typedef struct verbitem {
185    "print", "punct", "space", "word",  "xdigit" };    int   len;
186      int   op;
187    } verbitem;
188    
189    static const char verbnames[] =
190      STRING_ACCEPT0
191      STRING_COMMIT0
192      STRING_F0
193      STRING_FAIL0
194      STRING_PRUNE0
195      STRING_SKIP0
196      STRING_THEN;
197    
198    static const verbitem verbs[] = {
199      { 6, OP_ACCEPT },
200      { 6, OP_COMMIT },
201      { 1, OP_FAIL },
202      { 4, OP_FAIL },
203      { 5, OP_PRUNE },
204      { 4, OP_SKIP  },
205      { 4, OP_THEN  }
206    };
207    
208    static const int verbcount = sizeof(verbs)/sizeof(verbitem);
209    
210    
211    /* Tables of names of POSIX character classes and their lengths. The names are
212    now all in a single string, to reduce the number of relocations when a shared
213    library is dynamically loaded. The list of lengths is terminated by a zero
214    length entry. The first three must be alpha, lower, upper, as this is assumed
215    for handling case independence. */
216    
217    static const char posix_names[] =
218      STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
219      STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
220      STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
221      STRING_word0  STRING_xdigit;
222    
223  static const uschar posix_name_lengths[] = {  static const uschar posix_name_lengths[] = {
224    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
# Line 155  static const int posix_class_maps[] = { Line 251  static const int posix_class_maps[] = {
251  };  };
252    
253    
254  /* The texts of compile-time error messages. These are "char *" because they  #define STRING(a)  # a
255  are passed to the outside world. */  #define XSTRING(s) STRING(s)
256    
257  static const char *error_texts[] = {  /* The texts of compile-time error messages. These are "char *" because they
258    "no error",  are passed to the outside world. Do not ever re-use any error number, because
259    "\\ at end of pattern",  they are documented. Always add a new error instead. Messages marked DEAD below
260    "\\c at end of pattern",  are no longer used. This used to be a table of strings, but in order to reduce
261    "unrecognized character follows \\",  the number of relocations needed when a shared library is loaded dynamically,
262    "numbers out of order in {} quantifier",  it is now one long string. We cannot use a table of offsets, because the
263    lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
264    simply count through to the one we want - this isn't a performance issue
265    because these strings are used only when there is a compilation error. */
266    
267    static const char error_texts[] =
268      "no error\0"
269      "\\ at end of pattern\0"
270      "\\c at end of pattern\0"
271      "unrecognized character follows \\\0"
272      "numbers out of order in {} quantifier\0"
273    /* 5 */    /* 5 */
274    "number too big in {} quantifier",    "number too big in {} quantifier\0"
275    "missing terminating ] for character class",    "missing terminating ] for character class\0"
276    "invalid escape sequence in character class",    "invalid escape sequence in character class\0"
277    "range out of order in character class",    "range out of order in character class\0"
278    "nothing to repeat",    "nothing to repeat\0"
279    /* 10 */    /* 10 */
280    "operand of unlimited repeat could match the empty string",    "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
281    "internal error: unexpected repeat",    "internal error: unexpected repeat\0"
282    "unrecognized character after (?",    "unrecognized character after (? or (?-\0"
283    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class\0"
284    "missing )",    "missing )\0"
285    /* 15 */    /* 15 */
286    "reference to non-existent subpattern",    "reference to non-existent subpattern\0"
287    "erroffset passed as NULL",    "erroffset passed as NULL\0"
288    "unknown option bit(s) set",    "unknown option bit(s) set\0"
289    "missing ) after comment",    "missing ) after comment\0"
290    "parentheses nested too deeply",    "parentheses nested too deeply\0"  /** DEAD **/
291    /* 20 */    /* 20 */
292    "regular expression too large",    "regular expression is too large\0"
293    "failed to get memory",    "failed to get memory\0"
294    "unmatched parentheses",    "unmatched parentheses\0"
295    "internal error: code overflow",    "internal error: code overflow\0"
296    "unrecognized character after (?<",    "unrecognized character after (?<\0"
297    /* 25 */    /* 25 */
298    "lookbehind assertion is not fixed length",    "lookbehind assertion is not fixed length\0"
299    "malformed number after (?(",    "malformed number or name after (?(\0"
300    "conditional group contains more than two branches",    "conditional group contains more than two branches\0"
301    "assertion expected after (?(",    "assertion expected after (?(\0"
302    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )\0"
303    /* 30 */    /* 30 */
304    "unknown POSIX class name",    "unknown POSIX class name\0"
305    "POSIX collating elements are not supported",    "POSIX collating elements are not supported\0"
306    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support\0"
307    "spare error",    "spare error\0"  /** DEAD **/
308    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large\0"
309    /* 35 */    /* 35 */
310    "invalid condition (?(0)",    "invalid condition (?(0)\0"
311    "\\C not allowed in lookbehind assertion",    "\\C not allowed in lookbehind assertion\0"
312    "PCRE does not support \\L, \\l, \\N, \\U, or \\u",    "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
313    "number after (?C is > 255",    "number after (?C is > 255\0"
314    "closing ) for (?C expected",    "closing ) for (?C expected\0"
315    /* 40 */    /* 40 */
316    "recursive call could loop indefinitely",    "recursive call could loop indefinitely\0"
317    "unrecognized character after (?P",    "unrecognized character after (?P\0"
318    "syntax error after (?P",    "syntax error in subpattern name (missing terminator)\0"
319    "two named groups have the same name",    "two named subpatterns have the same name\0"
320    "invalid UTF-8 string",    "invalid UTF-8 string\0"
321    /* 45 */    /* 45 */
322    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled\0"
323    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence\0"
324    "unknown property name after \\P or \\p"    "unknown property name after \\P or \\p\0"
325  };    "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
326      "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
327      /* 50 */
328      "repeated subpattern is too long\0"    /** DEAD **/
329      "octal value is greater than \\377 (not in UTF-8 mode)\0"
330      "internal error: overran compiling workspace\0"
331      "internal error: previously-checked referenced subpattern not found\0"
332      "DEFINE group contains more than one branch\0"
333      /* 55 */
334      "repeating a DEFINE group is not allowed\0"
335      "inconsistent NEWLINE options\0"
336      "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
337      "a numbered reference must not be zero\0"
338      "(*VERB) with an argument is not supported\0"
339      /* 60 */
340      "(*VERB) not recognized\0"
341      "number is too big\0"
342      "subpattern name expected\0"
343      "digit expected after (?+\0"
344      "] is an invalid data character in JavaScript compatibility mode";
345    
346    
347  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 235  For convenience, we use the same bit def Line 360  For convenience, we use the same bit def
360    
361  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
362    
363  #if !EBCDIC    /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC
364    
365    /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
366    UTF-8 mode. */
367    
368  static const unsigned char digitab[] =  static const unsigned char digitab[] =
369    {    {
370    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 271  static const unsigned char digitab[] = Line 400  static const unsigned char digitab[] =
400    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
401    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
402    
403  #else          /* This is the "abnormal" case, for EBCDIC systems */  #else
404    
405    /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
406    
407  static const unsigned char digitab[] =  static const unsigned char digitab[] =
408    {    {
409    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 285  static const unsigned char digitab[] = Line 417  static const unsigned char digitab[] =
417    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
418    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
419    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
420    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88-     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
421    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
422    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
423    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
# Line 319  static const unsigned char ebcdic_charta Line 451  static const unsigned char ebcdic_charta
451    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
452    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
453    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
454    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88-  */    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
455    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
456    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
457    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
# Line 346  static const unsigned char ebcdic_charta Line 478  static const unsigned char ebcdic_charta
478  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
479    
480  static BOOL  static BOOL
481    compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
482      int *, int *, branch_chain *, compile_data *);      int *, int *, branch_chain *, compile_data *, int *);
483    
484    
485    
486    /*************************************************
487    *            Find an error text                  *
488    *************************************************/
489    
490    /* The error texts are now all in one long string, to save on relocations. As
491    some of the text is of unknown length, we can't use a table of offsets.
492    Instead, just count through the strings. This is not a performance issue
493    because it happens only when there has been a compilation error.
494    
495    Argument:   the error number
496    Returns:    pointer to the error string
497    */
498    
499    static const char *
500    find_error_text(int n)
501    {
502    const char *s = error_texts;
503    for (; n > 0; n--) while (*s++ != 0) {};
504    return s;
505    }
506    
507    
508  /*************************************************  /*************************************************
# Line 357  static BOOL Line 511  static BOOL
511    
512  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
513  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
514  encodes one of the more complicated things such as \d. When UTF-8 is enabled,  encodes one of the more complicated things such as \d. A backreference to group
515  a positive value greater than 255 may be returned. On entry, ptr is pointing at  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
516  the \. On exit, it is on the final character of the escape sequence.  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
517    ptr is pointing at the \. On exit, it is on the final character of the escape
518    sequence.
519    
520  Arguments:  Arguments:
521    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
# Line 370  Arguments: Line 526  Arguments:
526    
527  Returns:         zero or positive => a data character  Returns:         zero or positive => a data character
528                   negative => a special escape sequence                   negative => a special escape sequence
529                   on error, errorptr is set                   on error, errorcodeptr is set
530  */  */
531    
532  static int  static int
# Line 388  ptr--;                            /* Set Line 544  ptr--;                            /* Set
544    
545  if (c == 0) *errorcodeptr = ERR1;  if (c == 0) *errorcodeptr = ERR1;
546    
547  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in  /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
548  a table. A non-zero result is something that can be returned immediately.  in a table. A non-zero result is something that can be returned immediately.
549  Otherwise further processing may be required. */  Otherwise further processing may be required. */
550    
551  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
552  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < CHAR_0 || c > CHAR_z) {}                     /* Not alphanumeric */
553  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - CHAR_0]) != 0) c = i;
554    
555  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
556  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
557  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
558  #endif  #endif
559    
# Line 406  else if ((i = escapes[c - 0x48]) != 0) Line 562  else if ((i = escapes[c - 0x48]) != 0)
562  else  else
563    {    {
564    const uschar *oldptr;    const uschar *oldptr;
565      BOOL braced, negated;
566    
567    switch (c)    switch (c)
568      {      {
569      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
570      error. */      error. */
571    
572      case 'l':      case CHAR_l:
573      case 'L':      case CHAR_L:
574      case 'N':      case CHAR_N:
575      case 'u':      case CHAR_u:
576      case 'U':      case CHAR_U:
577      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
578      break;      break;
579    
580        /* \g must be followed by one of a number of specific things:
581    
582        (1) A number, either plain or braced. If positive, it is an absolute
583        backreference. If negative, it is a relative backreference. This is a Perl
584        5.10 feature.
585    
586        (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
587        is part of Perl's movement towards a unified syntax for back references. As
588        this is synonymous with \k{name}, we fudge it up by pretending it really
589        was \k.
590    
591        (3) For Oniguruma compatibility we also support \g followed by a name or a
592        number either in angle brackets or in single quotes. However, these are
593        (possibly recursive) subroutine calls, _not_ backreferences. Just return
594        the -ESC_g code (cf \k). */
595    
596        case CHAR_g:
597        if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
598          {
599          c = -ESC_g;
600          break;
601          }
602    
603        /* Handle the Perl-compatible cases */
604    
605        if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
606          {
607          const uschar *p;
608          for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
609            if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
610          if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
611            {
612            c = -ESC_k;
613            break;
614            }
615          braced = TRUE;
616          ptr++;
617          }
618        else braced = FALSE;
619    
620        if (ptr[1] == CHAR_MINUS)
621          {
622          negated = TRUE;
623          ptr++;
624          }
625        else negated = FALSE;
626    
627        c = 0;
628        while ((digitab[ptr[1]] & ctype_digit) != 0)
629          c = c * 10 + *(++ptr) - CHAR_0;
630    
631        if (c < 0)   /* Integer overflow */
632          {
633          *errorcodeptr = ERR61;
634          break;
635          }
636    
637        if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
638          {
639          *errorcodeptr = ERR57;
640          break;
641          }
642    
643        if (c == 0)
644          {
645          *errorcodeptr = ERR58;
646          break;
647          }
648    
649        if (negated)
650          {
651          if (c > bracount)
652            {
653            *errorcodeptr = ERR15;
654            break;
655            }
656          c = bracount - (c - 1);
657          }
658    
659        c = -(ESC_REF + c);
660        break;
661    
662      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
663      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
664      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 431  else Line 671  else
671      value is greater than 377, the least significant 8 bits are taken. Inside a      value is greater than 377, the least significant 8 bits are taken. Inside a
672      character class, \ followed by a digit is always an octal number. */      character class, \ followed by a digit is always an octal number. */
673    
674      case '1': case '2': case '3': case '4': case '5':      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
675      case '6': case '7': case '8': case '9':      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
676    
677      if (!isclass)      if (!isclass)
678        {        {
679        oldptr = ptr;        oldptr = ptr;
680        c -= '0';        c -= CHAR_0;
681        while ((digitab[ptr[1]] & ctype_digit) != 0)        while ((digitab[ptr[1]] & ctype_digit) != 0)
682          c = c * 10 + *(++ptr) - '0';          c = c * 10 + *(++ptr) - CHAR_0;
683          if (c < 0)    /* Integer overflow */
684            {
685            *errorcodeptr = ERR61;
686            break;
687            }
688        if (c < 10 || c <= bracount)        if (c < 10 || c <= bracount)
689          {          {
690          c = -(ESC_REF + c);          c = -(ESC_REF + c);
# Line 452  else Line 697  else
697      generates a binary zero byte and treats the digit as a following literal.      generates a binary zero byte and treats the digit as a following literal.
698      Thus we have to pull back the pointer by one. */      Thus we have to pull back the pointer by one. */
699    
700      if ((c = *ptr) >= '8')      if ((c = *ptr) >= CHAR_8)
701        {        {
702        ptr--;        ptr--;
703        c = 0;        c = 0;
# Line 460  else Line 705  else
705        }        }
706    
707      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
708      larger first octal digit. */      larger first octal digit. The original code used just to take the least
709        significant 8 bits of octal numbers (I think this is what early Perls used
710      case '0':      to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
711      c -= '0';      than 3 octal digits. */
712      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')  
713          c = c * 8 + *(++ptr) - '0';      case CHAR_0:
714      c &= 255;     /* Take least significant 8 bits */      c -= CHAR_0;
715        while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
716            c = c * 8 + *(++ptr) - CHAR_0;
717        if (!utf8 && c > 255) *errorcodeptr = ERR51;
718      break;      break;
719    
720      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \x is complicated. \x{ddd} is a character number which can be greater
721      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
722      treated as a data character. */      treated as a data character. */
723    
724      case 'x':      case CHAR_x:
725      if (ptr[1] == '{')      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
726        {        {
727        const uschar *pt = ptr + 2;        const uschar *pt = ptr + 2;
728        int count = 0;        int count = 0;
# Line 483  else Line 731  else
731        while ((digitab[*pt] & ctype_xdigit) != 0)        while ((digitab[*pt] & ctype_xdigit) != 0)
732          {          {
733          register int cc = *pt++;          register int cc = *pt++;
734          if (c == 0 && cc == '0') continue;     /* Leading zeroes */          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
735          count++;          count++;
736    
737  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
738          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
739          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
740  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
741          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
742          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
743  #endif  #endif
744          }          }
745    
746        if (*pt == '}')        if (*pt == CHAR_RIGHT_CURLY_BRACKET)
747          {          {
748          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
749          ptr = pt;          ptr = pt;
# Line 511  else Line 759  else
759      c = 0;      c = 0;
760      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
761        {        {
762        int cc;                               /* Some compilers don't like ++ */        int cc;                                  /* Some compilers don't like */
763        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                           /* ++ in initializers */
764  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
765        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
766        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
767  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
768        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
769        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
770  #endif  #endif
771        }        }
772      break;      break;
773    
774      /* Other special escapes not starting with a digit are straightforward */      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
775        This coding is ASCII-specific, but then the whole concept of \cx is
776        ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
777    
778      case 'c':      case CHAR_c:
779      c = *(++ptr);      c = *(++ptr);
780      if (c == 0)      if (c == 0)
781        {        {
782        *errorcodeptr = ERR2;        *errorcodeptr = ERR2;
783        return 0;        break;
784        }        }
785    
786      /* A letter is upper-cased; then the 0x40 bit is flipped. This coding  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
787      is ASCII-specific, but then the whole concept of \cx is ASCII-specific.      if (c >= CHAR_a && c <= CHAR_z) c -= 32;
     (However, an EBCDIC equivalent has now been added.) */  
   
 #if !EBCDIC    /* ASCII coding */  
     if (c >= 'a' && c <= 'z') c -= 32;  
788      c ^= 0x40;      c ^= 0x40;
789  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
790      if (c >= 'a' && c <= 'z') c += 64;      if (c >= CHAR_a && c <= CHAR_z) c += 64;
791      c ^= 0xC0;      c ^= 0xC0;
792  #endif  #endif
793      break;      break;
794    
795      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
796      other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,      other alphanumeric following \ is an error if PCRE_EXTRA was set;
797      for Perl compatibility, it is a literal. This code looks a bit odd, but      otherwise, for Perl compatibility, it is a literal. This code looks a bit
798      there used to be some cases other than the default, and there may be again      odd, but there used to be some cases other than the default, and there may
799      in future, so I haven't "optimized" it. */      be again in future, so I haven't "optimized" it. */
800    
801      default:      default:
802      if ((options & PCRE_EXTRA) != 0) switch(c)      if ((options & PCRE_EXTRA) != 0) switch(c)
# Line 603  if (c == 0) goto ERROR_RETURN; Line 849  if (c == 0) goto ERROR_RETURN;
849  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
850  negation. */  negation. */
851    
852  if (c == '{')  if (c == CHAR_LEFT_CURLY_BRACKET)
853    {    {
854    if (ptr[1] == '^')    if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
855      {      {
856      *negptr = TRUE;      *negptr = TRUE;
857      ptr++;      ptr++;
858      }      }
859    for (i = 0; i < sizeof(name) - 1; i++)    for (i = 0; i < (int)sizeof(name) - 1; i++)
860      {      {
861      c = *(++ptr);      c = *(++ptr);
862      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
863      if (c == '}') break;      if (c == CHAR_RIGHT_CURLY_BRACKET) break;
864      name[i] = c;      name[i] = c;
865      }      }
866    if (c !='}') goto ERROR_RETURN;    if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
867    name[i] = 0;    name[i] = 0;
868    }    }
869    
# Line 639  top = _pcre_utt_size; Line 885  top = _pcre_utt_size;
885  while (bot < top)  while (bot < top)
886    {    {
887    i = (bot + top) >> 1;    i = (bot + top) >> 1;
888    c = strcmp(name, _pcre_utt[i].name);    c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
889    if (c == 0)    if (c == 0)
890      {      {
891      *dptr = _pcre_utt[i].value;      *dptr = _pcre_utt[i].value;
# Line 682  is_counted_repeat(const uschar *p) Line 928  is_counted_repeat(const uschar *p)
928  {  {
929  if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
930  while ((digitab[*p] & ctype_digit) != 0) p++;  while ((digitab[*p] & ctype_digit) != 0) p++;
931  if (*p == '}') return TRUE;  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
932    
933  if (*p++ != ',') return FALSE;  if (*p++ != CHAR_COMMA) return FALSE;
934  if (*p == '}') return TRUE;  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
935    
936  if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
937  while ((digitab[*p] & ctype_digit) != 0) p++;  while ((digitab[*p] & ctype_digit) != 0) p++;
938    
939  return (*p == '}');  return (*p == CHAR_RIGHT_CURLY_BRACKET);
940  }  }
941    
942    
# Line 723  int max = -1; Line 969  int max = -1;
969  /* Read the minimum value and do a paranoid check: a negative value indicates  /* Read the minimum value and do a paranoid check: a negative value indicates
970  an integer overflow. */  an integer overflow. */
971    
972  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
973  if (min < 0 || min > 65535)  if (min < 0 || min > 65535)
974    {    {
975    *errorcodeptr = ERR5;    *errorcodeptr = ERR5;
# Line 733  if (min < 0 || min > 65535) Line 979  if (min < 0 || min > 65535)
979  /* Read the maximum value if there is one, and again do a paranoid on its size.  /* Read the maximum value if there is one, and again do a paranoid on its size.
980  Also, max must not be less than min. */  Also, max must not be less than min. */
981    
982  if (*p == '}') max = min; else  if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
983    {    {
984    if (*(++p) != '}')    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
985      {      {
986      max = 0;      max = 0;
987      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
988      if (max < 0 || max > 65535)      if (max < 0 || max > 65535)
989        {        {
990        *errorcodeptr = ERR5;        *errorcodeptr = ERR5;
# Line 763  return p; Line 1009  return p;
1009    
1010    
1011  /*************************************************  /*************************************************
1012    *       Find forward referenced subpattern       *
1013    *************************************************/
1014    
1015    /* This function scans along a pattern's text looking for capturing
1016    subpatterns, and counting them. If it finds a named pattern that matches the
1017    name it is given, it returns its number. Alternatively, if the name is NULL, it
1018    returns when it reaches a given numbered subpattern. This is used for forward
1019    references to subpatterns. We know that if (?P< is encountered, the name will
1020    be terminated by '>' because that is checked in the first pass.
1021    
1022    Arguments:
1023      ptr          current position in the pattern
1024      cd           compile background data
1025      name         name to seek, or NULL if seeking a numbered subpattern
1026      lorn         name length, or subpattern number if name is NULL
1027      xmode        TRUE if we are in /x mode
1028    
1029    Returns:       the number of the named subpattern, or -1 if not found
1030    */
1031    
1032    static int
1033    find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn,
1034      BOOL xmode)
1035    {
1036    const uschar *thisname;
1037    int count = cd->bracount;
1038    
1039    for (; *ptr != 0; ptr++)
1040      {
1041      int term;
1042    
1043      /* Skip over backslashed characters and also entire \Q...\E */
1044    
1045      if (*ptr == CHAR_BACKSLASH)
1046        {
1047        if (*(++ptr) == 0) return -1;
1048        if (*ptr == CHAR_Q) for (;;)
1049          {
1050          while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1051          if (*ptr == 0) return -1;
1052          if (*(++ptr) == CHAR_E) break;
1053          }
1054        continue;
1055        }
1056    
1057      /* Skip over character classes; this logic must be similar to the way they
1058      are handled for real. If the first character is '^', skip it. Also, if the
1059      first few characters (either before or after ^) are \Q\E or \E we skip them
1060      too. This makes for compatibility with Perl. Note the use of STR macros to
1061      encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1062    
1063      if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1064        {
1065        BOOL negate_class = FALSE;
1066        for (;;)
1067          {
1068          int c = *(++ptr);
1069          if (c == CHAR_BACKSLASH)
1070            {
1071            if (ptr[1] == CHAR_E)
1072              ptr++;
1073            else if (strncmp((const char *)ptr+1,
1074                     STR_Q STR_BACKSLASH STR_E, 3) == 0)
1075              ptr += 3;
1076            else
1077              break;
1078            }
1079          else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
1080            negate_class = TRUE;
1081          else break;
1082          }
1083    
1084        /* If the next character is ']', it is a data character that must be
1085        skipped, except in JavaScript compatibility mode. */
1086    
1087        if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1088            (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1089          ptr++;
1090    
1091        while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1092          {
1093          if (*ptr == 0) return -1;
1094          if (*ptr == CHAR_BACKSLASH)
1095            {
1096            if (*(++ptr) == 0) return -1;
1097            if (*ptr == CHAR_Q) for (;;)
1098              {
1099              while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1100              if (*ptr == 0) return -1;
1101              if (*(++ptr) == CHAR_E) break;
1102              }
1103            continue;
1104            }
1105          }
1106        continue;
1107        }
1108    
1109      /* Skip comments in /x mode */
1110    
1111      if (xmode && *ptr == CHAR_NUMBER_SIGN)
1112        {
1113        while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1114        if (*ptr == 0) return -1;
1115        continue;
1116        }
1117    
1118      /* An opening parens must now be a real metacharacter */
1119    
1120      if (*ptr != CHAR_LEFT_PARENTHESIS) continue;
1121      if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1122        {
1123        count++;
1124        if (name == NULL && count == lorn) return count;
1125        continue;
1126        }
1127    
1128      ptr += 2;
1129      if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */
1130    
1131      /* We have to disambiguate (?<! and (?<= from (?<name> */
1132    
1133      if ((*ptr != CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_EXCLAMATION_MARK ||
1134          ptr[1] == CHAR_EQUALS_SIGN) && *ptr != CHAR_APOSTROPHE)
1135        continue;
1136    
1137      count++;
1138    
1139      if (name == NULL && count == lorn) return count;
1140      term = *ptr++;
1141      if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1142      thisname = ptr;
1143      while (*ptr != term) ptr++;
1144      if (name != NULL && lorn == ptr - thisname &&
1145          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1146        return count;
1147      }
1148    
1149    return -1;
1150    }
1151    
1152    
1153    
1154    /*************************************************
1155  *      Find first significant op code            *  *      Find first significant op code            *
1156  *************************************************/  *************************************************/
1157    
# Line 811  for (;;) Line 1200  for (;;)
1200    
1201      case OP_CALLOUT:      case OP_CALLOUT:
1202      case OP_CREF:      case OP_CREF:
1203      case OP_BRANUMBER:      case OP_RREF:
1204        case OP_DEF:
1205      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1206      break;      break;
1207    
# Line 856  for (;;) Line 1246  for (;;)
1246    {    {
1247    int d;    int d;
1248    register int op = *cc;    register int op = *cc;
   if (op >= OP_BRA) op = OP_BRA;  
   
1249    switch (op)    switch (op)
1250      {      {
1251        case OP_CBRA:
1252      case OP_BRA:      case OP_BRA:
1253      case OP_ONCE:      case OP_ONCE:
1254      case OP_COND:      case OP_COND:
1255      d = find_fixedlength(cc, options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1256      if (d < 0) return d;      if (d < 0) return d;
1257      branchlength += d;      branchlength += d;
1258      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 898  for (;;) Line 1287  for (;;)
1287      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1288    
1289      case OP_REVERSE:      case OP_REVERSE:
     case OP_BRANUMBER:  
1290      case OP_CREF:      case OP_CREF:
1291        case OP_RREF:
1292        case OP_DEF:
1293      case OP_OPT:      case OP_OPT:
1294      case OP_CALLOUT:      case OP_CALLOUT:
1295      case OP_SOD:      case OP_SOD:
# Line 917  for (;;) Line 1307  for (;;)
1307    
1308      case OP_CHAR:      case OP_CHAR:
1309      case OP_CHARNC:      case OP_CHARNC:
1310        case OP_NOT:
1311      branchlength++;      branchlength++;
1312      cc += 2;      cc += 2;
1313  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 943  for (;;) Line 1334  for (;;)
1334    
1335      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1336      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1337        if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1338      cc += 4;      cc += 4;
1339      break;      break;
1340    
# Line 960  for (;;) Line 1352  for (;;)
1352      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
1353      case OP_WORDCHAR:      case OP_WORDCHAR:
1354      case OP_ANY:      case OP_ANY:
1355        case OP_ALLANY:
1356      branchlength++;      branchlength++;
1357      cc++;      cc++;
1358      break;      break;
# Line 1031  Returns:      pointer to the opcode for Line 1424  Returns:      pointer to the opcode for
1424  static const uschar *  static const uschar *
1425  find_bracket(const uschar *code, BOOL utf8, int number)  find_bracket(const uschar *code, BOOL utf8, int number)
1426  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1427  for (;;)  for (;;)
1428    {    {
1429    register int c = *code;    register int c = *code;
1430    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1431    else if (c > OP_BRA)  
1432      /* XCLASS is used for classes that cannot be represented just by a bit
1433      map. This includes negated single high-valued characters. The length in
1434      the table is zero; the actual length is stored in the compiled code. */
1435    
1436      if (c == OP_XCLASS) code += GET(code, 1);
1437    
1438      /* Handle capturing bracket */
1439    
1440      else if (c == OP_CBRA)
1441      {      {
1442      int n = c - OP_BRA;      int n = GET2(code, 1+LINK_SIZE);
     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);  
1443      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1444      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[c];
1445      }      }
1446    
1447      /* Otherwise, we can get the item's length from the table, except that for
1448      repeated character types, we have to test for \p and \P, which have an extra
1449      two bytes of parameters. */
1450    
1451    else    else
1452      {      {
1453      code += _pcre_OP_lengths[c];      switch(c)
1454          {
1455          case OP_TYPESTAR:
1456          case OP_TYPEMINSTAR:
1457          case OP_TYPEPLUS:
1458          case OP_TYPEMINPLUS:
1459          case OP_TYPEQUERY:
1460          case OP_TYPEMINQUERY:
1461          case OP_TYPEPOSSTAR:
1462          case OP_TYPEPOSPLUS:
1463          case OP_TYPEPOSQUERY:
1464          if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1465          break;
1466    
1467  #ifdef SUPPORT_UTF8        case OP_TYPEUPTO:
1468          case OP_TYPEMINUPTO:
1469          case OP_TYPEEXACT:
1470          case OP_TYPEPOSUPTO:
1471          if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1472          break;
1473          }
1474    
1475      /* In UTF-8 mode, opcodes that are followed by a character may be followed      /* Add in the fixed length from the table */
1476      by a multi-byte character. The length in the table is a minimum, so we have  
1477      to scan along to skip the extra bytes. All opcodes are less than 128, so we      code += _pcre_OP_lengths[c];
1478      can use relatively efficient code. */  
1479      /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1480      a multi-byte character. The length in the table is a minimum, so we have to
1481      arrange to skip the extra bytes. */
1482    
1483    #ifdef SUPPORT_UTF8
1484      if (utf8) switch(c)      if (utf8) switch(c)
1485        {        {
1486        case OP_CHAR:        case OP_CHAR:
# Line 1064  for (;;) Line 1488  for (;;)
1488        case OP_EXACT:        case OP_EXACT:
1489        case OP_UPTO:        case OP_UPTO:
1490        case OP_MINUPTO:        case OP_MINUPTO:
1491          case OP_POSUPTO:
1492        case OP_STAR:        case OP_STAR:
1493        case OP_MINSTAR:        case OP_MINSTAR:
1494          case OP_POSSTAR:
1495        case OP_PLUS:        case OP_PLUS:
1496        case OP_MINPLUS:        case OP_MINPLUS:
1497          case OP_POSPLUS:
1498        case OP_QUERY:        case OP_QUERY:
1499        case OP_MINQUERY:        case OP_MINQUERY:
1500        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1501        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1502        break;        break;
1503        }        }
1504    #else
1505        (void)(utf8);  /* Keep compiler happy by referencing function argument */
1506  #endif  #endif
1507      }      }
1508    }    }
# Line 1105  Returns:      pointer to the opcode for Line 1527  Returns:      pointer to the opcode for
1527  static const uschar *  static const uschar *
1528  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const uschar *code, BOOL utf8)
1529  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1530  for (;;)  for (;;)
1531    {    {
1532    register int c = *code;    register int c = *code;
1533    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1534    else if (c == OP_RECURSE) return code;    if (c == OP_RECURSE) return code;
1535    else if (c > OP_BRA)  
1536      {    /* XCLASS is used for classes that cannot be represented just by a bit
1537      code += _pcre_OP_lengths[OP_BRA];    map. This includes negated single high-valued characters. The length in
1538      }    the table is zero; the actual length is stored in the compiled code. */
1539    
1540      if (c == OP_XCLASS) code += GET(code, 1);
1541    
1542      /* Otherwise, we can get the item's length from the table, except that for
1543      repeated character types, we have to test for \p and \P, which have an extra
1544      two bytes of parameters. */
1545    
1546    else    else
1547      {      {
1548      code += _pcre_OP_lengths[c];      switch(c)
1549          {
1550          case OP_TYPESTAR:
1551          case OP_TYPEMINSTAR:
1552          case OP_TYPEPLUS:
1553          case OP_TYPEMINPLUS:
1554          case OP_TYPEQUERY:
1555          case OP_TYPEMINQUERY:
1556          case OP_TYPEPOSSTAR:
1557          case OP_TYPEPOSPLUS:
1558          case OP_TYPEPOSQUERY:
1559          if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1560          break;
1561    
1562  #ifdef SUPPORT_UTF8        case OP_TYPEPOSUPTO:
1563          case OP_TYPEUPTO:
1564          case OP_TYPEMINUPTO:
1565          case OP_TYPEEXACT:
1566          if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1567          break;
1568          }
1569    
1570        /* Add in the fixed length from the table */
1571    
1572        code += _pcre_OP_lengths[c];
1573    
1574      /* In UTF-8 mode, opcodes that are followed by a character may be followed      /* In UTF-8 mode, opcodes that are followed by a character may be followed
1575      by a multi-byte character. The length in the table is a minimum, so we have      by a multi-byte character. The length in the table is a minimum, so we have
1576      to scan along to skip the extra bytes. All opcodes are less than 128, so we      to arrange to skip the extra bytes. */
     can use relatively efficient code. */  
1577    
1578    #ifdef SUPPORT_UTF8
1579      if (utf8) switch(c)      if (utf8) switch(c)
1580        {        {
1581        case OP_CHAR:        case OP_CHAR:
# Line 1136  for (;;) Line 1583  for (;;)
1583        case OP_EXACT:        case OP_EXACT:
1584        case OP_UPTO:        case OP_UPTO:
1585        case OP_MINUPTO:        case OP_MINUPTO:
1586          case OP_POSUPTO:
1587        case OP_STAR:        case OP_STAR:
1588        case OP_MINSTAR:        case OP_MINSTAR:
1589          case OP_POSSTAR:
1590        case OP_PLUS:        case OP_PLUS:
1591        case OP_MINPLUS:        case OP_MINPLUS:
1592          case OP_POSPLUS:
1593        case OP_QUERY:        case OP_QUERY:
1594        case OP_MINQUERY:        case OP_MINQUERY:
1595        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1596        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1597        break;        break;
1598        }        }
1599    #else
1600        (void)(utf8);  /* Keep compiler happy by referencing function argument */
1601  #endif  #endif
1602      }      }
1603    }    }
# Line 1165  for (;;) Line 1610  for (;;)
1610  *************************************************/  *************************************************/
1611    
1612  /* This function scans through a branch of a compiled pattern to see whether it  /* This function scans through a branch of a compiled pattern to see whether it
1613  can match the empty string or not. It is called only from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1614  below. Note that first_significant_code() skips over assertions. If we hit an  below and from compile_branch() when checking for an unlimited repeat of a
1615  unclosed bracket, we return "empty" - this means we've struck an inner bracket  group that can match nothing. Note that first_significant_code() skips over
1616  whose current branch will already have been scanned.  backward and negative forward assertions when its final argument is TRUE. If we
1617    hit an unclosed bracket, we return "empty" - this means we've struck an inner
1618    bracket whose current branch will already have been scanned.
1619    
1620  Arguments:  Arguments:
1621    code        points to start of search    code        points to start of search
# Line 1182  static BOOL Line 1629  static BOOL
1629  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1630  {  {
1631  register int c;  register int c;
1632  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1633       code < endcode;       code < endcode;
1634       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1635    {    {
# Line 1190  for (code = first_significant_code(code Line 1637  for (code = first_significant_code(code
1637    
1638    c = *code;    c = *code;
1639    
1640    if (c >= OP_BRA)    /* Skip over forward assertions; the other assertions are skipped by
1641      first_significant_code() with a TRUE final argument. */
1642    
1643      if (c == OP_ASSERT)
1644        {
1645        do code += GET(code, 1); while (*code == OP_ALT);
1646        c = *code;
1647        continue;
1648        }
1649    
1650      /* Groups with zero repeats can of course be empty; skip them. */
1651    
1652      if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1653        {
1654        code += _pcre_OP_lengths[c];
1655        do code += GET(code, 1); while (*code == OP_ALT);
1656        c = *code;
1657        continue;
1658        }
1659    
1660      /* For other groups, scan the branches. */
1661    
1662      if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1663      {      {
1664      BOOL empty_branch;      BOOL empty_branch;
1665      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1666    
1667      /* Scan a closed bracket */      /* If a conditional group has only one branch, there is a second, implied,
1668        empty branch, so just skip over the conditional, because it could be empty.
1669        Otherwise, scan the individual branches of the group. */
1670    
1671      empty_branch = FALSE;      if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
     do  
       {  
       if (!empty_branch && could_be_empty_branch(code, endcode, utf8))  
         empty_branch = TRUE;  
1672        code += GET(code, 1);        code += GET(code, 1);
1673        else
1674          {
1675          empty_branch = FALSE;
1676          do
1677            {
1678            if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1679              empty_branch = TRUE;
1680            code += GET(code, 1);
1681            }
1682          while (*code == OP_ALT);
1683          if (!empty_branch) return FALSE;   /* All branches are non-empty */
1684        }        }
1685      while (*code == OP_ALT);  
     if (!empty_branch) return FALSE;   /* All branches are non-empty */  
     code += 1 + LINK_SIZE;  
1686      c = *code;      c = *code;
1687        continue;
1688      }      }
1689    
1690    else switch (c)    /* Handle the other opcodes */
1691    
1692      switch (c)
1693      {      {
1694      /* Check for quantifiers after a class */      /* Check for quantifiers after a class. XCLASS is used for classes that
1695        cannot be represented just by a bit map. This includes negated single
1696        high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1697        actual length is stored in the compiled code, so we must update "code"
1698        here. */
1699    
1700  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1701      case OP_XCLASS:      case OP_XCLASS:
1702      ccode = code + GET(code, 1);      ccode = code += GET(code, 1);
1703      goto CHECK_CLASS_REPEAT;      goto CHECK_CLASS_REPEAT;
1704  #endif  #endif
1705    
# Line 1260  for (code = first_significant_code(code Line 1743  for (code = first_significant_code(code
1743      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
1744      case OP_WORDCHAR:      case OP_WORDCHAR:
1745      case OP_ANY:      case OP_ANY:
1746        case OP_ALLANY:
1747      case OP_ANYBYTE:      case OP_ANYBYTE:
1748      case OP_CHAR:      case OP_CHAR:
1749      case OP_CHARNC:      case OP_CHARNC:
1750      case OP_NOT:      case OP_NOT:
1751      case OP_PLUS:      case OP_PLUS:
1752      case OP_MINPLUS:      case OP_MINPLUS:
1753        case OP_POSPLUS:
1754      case OP_EXACT:      case OP_EXACT:
1755      case OP_NOTPLUS:      case OP_NOTPLUS:
1756      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
1757        case OP_NOTPOSPLUS:
1758      case OP_NOTEXACT:      case OP_NOTEXACT:
1759      case OP_TYPEPLUS:      case OP_TYPEPLUS:
1760      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
1761        case OP_TYPEPOSPLUS:
1762      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1763      return FALSE;      return FALSE;
1764    
1765        /* These are going to continue, as they may be empty, but we have to
1766        fudge the length for the \p and \P cases. */
1767    
1768        case OP_TYPESTAR:
1769        case OP_TYPEMINSTAR:
1770        case OP_TYPEPOSSTAR:
1771        case OP_TYPEQUERY:
1772        case OP_TYPEMINQUERY:
1773        case OP_TYPEPOSQUERY:
1774        if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1775        break;
1776    
1777        /* Same for these */
1778    
1779        case OP_TYPEUPTO:
1780        case OP_TYPEMINUPTO:
1781        case OP_TYPEPOSUPTO:
1782        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1783        break;
1784    
1785      /* End of branch */      /* End of branch */
1786    
1787      case OP_KET:      case OP_KET:
# Line 1283  for (code = first_significant_code(code Line 1790  for (code = first_significant_code(code
1790      case OP_ALT:      case OP_ALT:
1791      return TRUE;      return TRUE;
1792    
1793      /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1794      followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
1795    
1796  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1797      case OP_STAR:      case OP_STAR:
1798      case OP_MINSTAR:      case OP_MINSTAR:
1799        case OP_POSSTAR:
1800      case OP_QUERY:      case OP_QUERY:
1801      case OP_MINQUERY:      case OP_MINQUERY:
1802        case OP_POSQUERY:
1803      case OP_UPTO:      case OP_UPTO:
1804      case OP_MINUPTO:      case OP_MINUPTO:
1805        case OP_POSUPTO:
1806      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1807      break;      break;
1808  #endif  #endif
# Line 1341  return TRUE; Line 1851  return TRUE;
1851  *************************************************/  *************************************************/
1852    
1853  /* This function is called when the sequence "[:" or "[." or "[=" is  /* This function is called when the sequence "[:" or "[." or "[=" is
1854  encountered in a character class. It checks whether this is followed by an  encountered in a character class. It checks whether this is followed by a
1855  optional ^ and then a sequence of letters, terminated by a matching ":]" or  sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1856  ".]" or "=]".  reach an unescaped ']' without the special preceding character, return FALSE.
1857    
1858    Originally, this function only recognized a sequence of letters between the
1859    terminators, but it seems that Perl recognizes any sequence of characters,
1860    though of course unknown POSIX names are subsequently rejected. Perl gives an
1861    "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1862    didn't consider this to be a POSIX class. Likewise for [:1234:].
1863    
1864    The problem in trying to be exactly like Perl is in the handling of escapes. We
1865    have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1866    class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1867    below handles the special case of \], but does not try to do any other escape
1868    processing. This makes it different from Perl for cases such as [:l\ower:]
1869    where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1870    "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
1871    I think.
1872    
1873  Argument:  Arguments:
1874    ptr      pointer to the initial [    ptr      pointer to the initial [
1875    endptr   where to return the end pointer    endptr   where to return the end pointer
   cd       pointer to compile data  
1876    
1877  Returns:   TRUE or FALSE  Returns:   TRUE or FALSE
1878  */  */
1879    
1880  static BOOL  static BOOL
1881  check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)  check_posix_syntax(const uschar *ptr, const uschar **endptr)
1882  {  {
1883  int terminator;          /* Don't combine these lines; the Solaris cc */  int terminator;          /* Don't combine these lines; the Solaris cc */
1884  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1885  if (*(++ptr) == '^') ptr++;  for (++ptr; *ptr != 0; ptr++)
 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;  
 if (*ptr == terminator && ptr[1] == ']')  
1886    {    {
1887    *endptr = ptr;    if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
1888    return TRUE;      {
1889        if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
1890        if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
1891          {
1892          *endptr = ptr;
1893          return TRUE;
1894          }
1895        }
1896    }    }
1897  return FALSE;  return FALSE;
1898  }  }
# Line 1388  Returns:     a value representing the na Line 1917  Returns:     a value representing the na
1917  static int  static int
1918  check_posix_name(const uschar *ptr, int len)  check_posix_name(const uschar *ptr, int len)
1919  {  {
1920    const char *pn = posix_names;
1921  register int yield = 0;  register int yield = 0;
1922  while (posix_name_lengths[yield] != 0)  while (posix_name_lengths[yield] != 0)
1923    {    {
1924    if (len == posix_name_lengths[yield] &&    if (len == posix_name_lengths[yield] &&
1925      strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;      strncmp((const char *)ptr, pn, len) == 0) return yield;
1926      pn += posix_name_lengths[yield] + 1;
1927    yield++;    yield++;
1928    }    }
1929  return -1;  return -1;
# Line 1407  return -1; Line 1938  return -1;
1938  that is referenced. This means that groups can be replicated for fixed  that is referenced. This means that groups can be replicated for fixed
1939  repetition simply by copying (because the recursion is allowed to refer to  repetition simply by copying (because the recursion is allowed to refer to
1940  earlier groups that are outside the current group). However, when a group is  earlier groups that are outside the current group). However, when a group is
1941  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
1942  it, after it has been compiled. This means that any OP_RECURSE items within it  inserted before it, after it has been compiled. This means that any OP_RECURSE
1943  that refer to the group itself or any contained groups have to have their  items within it that refer to the group itself or any contained groups have to
1944  offsets adjusted. That is the job of this function. Before it is called, the  have their offsets adjusted. That is one of the jobs of this function. Before it
1945  partially compiled regex must be temporarily terminated with OP_END.  is called, the partially compiled regex must be temporarily terminated with
1946    OP_END.
1947    
1948    This function has been extended with the possibility of forward references for
1949    recursions and subroutine calls. It must also check the list of such references
1950    for the group we are dealing with. If it finds that one of the recursions in
1951    the current group is on this list, it adjusts the offset in the list, not the
1952    value in the reference (which is a group number).
1953    
1954  Arguments:  Arguments:
1955    group      points to the start of the group    group      points to the start of the group
1956    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
1957    utf8       TRUE in UTF-8 mode    utf8       TRUE in UTF-8 mode
1958    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
1959      save_hwm   the hwm forward reference pointer at the start of the group
1960    
1961  Returns:     nothing  Returns:     nothing
1962  */  */
1963    
1964  static void  static void
1965  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1966      uschar *save_hwm)
1967  {  {
1968  uschar *ptr = group;  uschar *ptr = group;
1969    
1970  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1971    {    {
1972    int offset = GET(ptr, 1);    int offset;
1973    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);    uschar *hc;
1974    
1975      /* See if this recursion is on the forward reference list. If so, adjust the
1976      reference. */
1977    
1978      for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1979        {
1980        offset = GET(hc, 0);
1981        if (cd->start_code + offset == ptr + 1)
1982          {
1983          PUT(hc, 0, offset + adjust);
1984          break;
1985          }
1986        }
1987    
1988      /* Otherwise, adjust the recursion offset if it's after the start of this
1989      group. */
1990    
1991      if (hc >= cd->hwm)
1992        {
1993        offset = GET(ptr, 1);
1994        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1995        }
1996    
1997    ptr += 1 + LINK_SIZE;    ptr += 1 + LINK_SIZE;
1998    }    }
1999  }  }
# Line 1508  Yield:        TRUE when range returned; Line 2072  Yield:        TRUE when range returned;
2072  */  */
2073    
2074  static BOOL  static BOOL
2075  get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2076      unsigned int *odptr)
2077  {  {
2078  int c, othercase, next;  unsigned int c, othercase, next;
2079    
2080  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
2081    { if ((othercase = _pcre_ucp_othercase(c)) >= 0) break; }    { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2082    
2083  if (c > d) return FALSE;  if (c > d) return FALSE;
2084    
# Line 1522  next = othercase + 1; Line 2087  next = othercase + 1;
2087    
2088  for (++c; c <= d; c++)  for (++c; c <= d; c++)
2089    {    {
2090    if (_pcre_ucp_othercase(c) != next) break;    if (UCD_OTHERCASE(c) != next) break;
2091    next++;    next++;
2092    }    }
2093    
# Line 1534  return TRUE; Line 2099  return TRUE;
2099  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2100    
2101    
2102    
2103  /*************************************************  /*************************************************
2104  *           Compile one branch                   *  *     Check if auto-possessifying is possible    *
2105  *************************************************/  *************************************************/
2106    
2107  /* Scan the pattern, compiling it into the code vector. If the options are  /* This function is called for unlimited repeats of certain items, to see
2108  changed during the branch, the pointer is used to change the external options  whether the next thing could possibly match the repeated item. If not, it makes
2109  bits.  sense to automatically possessify the repeated item.
2110    
2111  Arguments:  Arguments:
2112    optionsptr     pointer to the option bits    op_code       the repeated op code
2113    brackets       points to number of extracting brackets used    item          data for this item, depends on the opcode
2114    codeptr        points to the pointer to the current code point    utf8          TRUE in UTF-8 mode
2115    ptrptr         points to the current pattern pointer    utf8_char     used for utf8 character bytes, NULL if not relevant
2116    errorcodeptr   points to error code variable    ptr           next character in pattern
2117    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    options       options bits
2118      cd            contains pointers to tables etc.
2119    
2120    Returns:        TRUE if possessifying is wanted
2121    */
2122    
2123    static BOOL
2124    check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2125      const uschar *ptr, int options, compile_data *cd)
2126    {
2127    int next;
2128    
2129    /* Skip whitespace and comments in extended mode */
2130    
2131    if ((options & PCRE_EXTENDED) != 0)
2132      {
2133      for (;;)
2134        {
2135        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2136        if (*ptr == CHAR_NUMBER_SIGN)
2137          {
2138          while (*(++ptr) != 0)
2139            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2140          }
2141        else break;
2142        }
2143      }
2144    
2145    /* If the next item is one that we can handle, get its value. A non-negative
2146    value is a character, a negative value is an escape value. */
2147    
2148    if (*ptr == CHAR_BACKSLASH)
2149      {
2150      int temperrorcode = 0;
2151      next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2152      if (temperrorcode != 0) return FALSE;
2153      ptr++;    /* Point after the escape sequence */
2154      }
2155    
2156    else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2157      {
2158    #ifdef SUPPORT_UTF8
2159      if (utf8) { GETCHARINC(next, ptr); } else
2160    #endif
2161      next = *ptr++;
2162      }
2163    
2164    else return FALSE;
2165    
2166    /* Skip whitespace and comments in extended mode */
2167    
2168    if ((options & PCRE_EXTENDED) != 0)
2169      {
2170      for (;;)
2171        {
2172        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2173        if (*ptr == CHAR_NUMBER_SIGN)
2174          {
2175          while (*(++ptr) != 0)
2176            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2177          }
2178        else break;
2179        }
2180      }
2181    
2182    /* If the next thing is itself optional, we have to give up. */
2183    
2184    if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2185      strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2186        return FALSE;
2187    
2188    /* Now compare the next item with the previous opcode. If the previous is a
2189    positive single character match, "item" either contains the character or, if
2190    "item" is greater than 127 in utf8 mode, the character's bytes are in
2191    utf8_char. */
2192    
2193    
2194    /* Handle cases when the next item is a character. */
2195    
2196    if (next >= 0) switch(op_code)
2197      {
2198      case OP_CHAR:
2199    #ifdef SUPPORT_UTF8
2200      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2201    #else
2202      (void)(utf8_char);  /* Keep compiler happy by referencing function argument */
2203    #endif
2204      return item != next;
2205    
2206      /* For CHARNC (caseless character) we must check the other case. If we have
2207      Unicode property support, we can use it to test the other case of
2208      high-valued characters. */
2209    
2210      case OP_CHARNC:
2211    #ifdef SUPPORT_UTF8
2212      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2213    #endif
2214      if (item == next) return FALSE;
2215    #ifdef SUPPORT_UTF8
2216      if (utf8)
2217        {
2218        unsigned int othercase;
2219        if (next < 128) othercase = cd->fcc[next]; else
2220    #ifdef SUPPORT_UCP
2221        othercase = UCD_OTHERCASE((unsigned int)next);
2222    #else
2223        othercase = NOTACHAR;
2224    #endif
2225        return (unsigned int)item != othercase;
2226        }
2227      else
2228    #endif  /* SUPPORT_UTF8 */
2229      return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
2230    
2231      /* For OP_NOT, "item" must be a single-byte character. */
2232    
2233      case OP_NOT:
2234      if (item == next) return TRUE;
2235      if ((options & PCRE_CASELESS) == 0) return FALSE;
2236    #ifdef SUPPORT_UTF8
2237      if (utf8)
2238        {
2239        unsigned int othercase;
2240        if (next < 128) othercase = cd->fcc[next]; else
2241    #ifdef SUPPORT_UCP
2242        othercase = UCD_OTHERCASE(next);
2243    #else
2244        othercase = NOTACHAR;
2245    #endif
2246        return (unsigned int)item == othercase;
2247        }
2248      else
2249    #endif  /* SUPPORT_UTF8 */
2250      return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
2251    
2252      case OP_DIGIT:
2253      return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2254    
2255      case OP_NOT_DIGIT:
2256      return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2257    
2258      case OP_WHITESPACE:
2259      return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2260    
2261      case OP_NOT_WHITESPACE:
2262      return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2263    
2264      case OP_WORDCHAR:
2265      return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2266    
2267      case OP_NOT_WORDCHAR:
2268      return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2269    
2270      case OP_HSPACE:
2271      case OP_NOT_HSPACE:
2272      switch(next)
2273        {
2274        case 0x09:
2275        case 0x20:
2276        case 0xa0:
2277        case 0x1680:
2278        case 0x180e:
2279        case 0x2000:
2280        case 0x2001:
2281        case 0x2002:
2282        case 0x2003:
2283        case 0x2004:
2284        case 0x2005:
2285        case 0x2006:
2286        case 0x2007:
2287        case 0x2008:
2288        case 0x2009:
2289        case 0x200A:
2290        case 0x202f:
2291        case 0x205f:
2292        case 0x3000:
2293        return op_code != OP_HSPACE;
2294        default:
2295        return op_code == OP_HSPACE;
2296        }
2297    
2298      case OP_VSPACE:
2299      case OP_NOT_VSPACE:
2300      switch(next)
2301        {
2302        case 0x0a:
2303        case 0x0b:
2304        case 0x0c:
2305        case 0x0d:
2306        case 0x85:
2307        case 0x2028:
2308        case 0x2029:
2309        return op_code != OP_VSPACE;
2310        default:
2311        return op_code == OP_VSPACE;
2312        }
2313    
2314      default:
2315      return FALSE;
2316      }
2317    
2318    
2319    /* Handle the case when the next item is \d, \s, etc. */
2320    
2321    switch(op_code)
2322      {
2323      case OP_CHAR:
2324      case OP_CHARNC:
2325    #ifdef SUPPORT_UTF8
2326      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2327    #endif
2328      switch(-next)
2329        {
2330        case ESC_d:
2331        return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2332    
2333        case ESC_D:
2334        return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2335    
2336        case ESC_s:
2337        return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2338    
2339        case ESC_S:
2340        return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2341    
2342        case ESC_w:
2343        return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2344    
2345        case ESC_W:
2346        return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2347    
2348        case ESC_h:
2349        case ESC_H:
2350        switch(item)
2351          {
2352          case 0x09:
2353          case 0x20:
2354          case 0xa0:
2355          case 0x1680:
2356          case 0x180e:
2357          case 0x2000:
2358          case 0x2001:
2359          case 0x2002:
2360          case 0x2003:
2361          case 0x2004:
2362          case 0x2005:
2363          case 0x2006:
2364          case 0x2007:
2365          case 0x2008:
2366          case 0x2009:
2367          case 0x200A:
2368          case 0x202f:
2369          case 0x205f:
2370          case 0x3000:
2371          return -next != ESC_h;
2372          default:
2373          return -next == ESC_h;
2374          }
2375    
2376        case ESC_v:
2377        case ESC_V:
2378        switch(item)
2379          {
2380          case 0x0a:
2381          case 0x0b:
2382          case 0x0c:
2383          case 0x0d:
2384          case 0x85:
2385          case 0x2028:
2386          case 0x2029:
2387          return -next != ESC_v;
2388          default:
2389          return -next == ESC_v;
2390          }
2391    
2392        default:
2393        return FALSE;
2394        }
2395    
2396      case OP_DIGIT:
2397      return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2398             next == -ESC_h || next == -ESC_v;
2399    
2400      case OP_NOT_DIGIT:
2401      return next == -ESC_d;
2402    
2403      case OP_WHITESPACE:
2404      return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2405    
2406      case OP_NOT_WHITESPACE:
2407      return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2408    
2409      case OP_HSPACE:
2410      return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2411    
2412      case OP_NOT_HSPACE:
2413      return next == -ESC_h;
2414    
2415      /* Can't have \S in here because VT matches \S (Perl anomaly) */
2416      case OP_VSPACE:
2417      return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2418    
2419      case OP_NOT_VSPACE:
2420      return next == -ESC_v;
2421    
2422      case OP_WORDCHAR:
2423      return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2424    
2425      case OP_NOT_WORDCHAR:
2426      return next == -ESC_w || next == -ESC_d;
2427    
2428      default:
2429      return FALSE;
2430      }
2431    
2432    /* Control does not reach here */
2433    }
2434    
2435    
2436    
2437    /*************************************************
2438    *           Compile one branch                   *
2439    *************************************************/
2440    
2441    /* Scan the pattern, compiling it into a vector. If the options are
2442    changed during the branch, the pointer is used to change the external options
2443    bits. This function is used during the pre-compile phase when we are trying
2444    to find out the amount of memory needed, as well as during the real compile
2445    phase. The value of lengthptr distinguishes the two phases.
2446    
2447    Arguments:
2448      optionsptr     pointer to the option bits
2449      codeptr        points to the pointer to the current code point
2450      ptrptr         points to the current pattern pointer
2451      errorcodeptr   points to error code variable
2452      firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2453    reqbyteptr     set to the last literal character required, else < 0    reqbyteptr     set to the last literal character required, else < 0
2454    bcptr          points to current branch chain    bcptr          points to current branch chain
2455    cd             contains pointers to tables etc.    cd             contains pointers to tables etc.
2456      lengthptr      NULL during the real compile phase
2457                     points to length accumulator during pre-compile phase
2458    
2459  Returns:         TRUE on success  Returns:         TRUE on success
2460                   FALSE, with *errorcodeptr set non-zero on error                   FALSE, with *errorcodeptr set non-zero on error
2461  */  */
2462    
2463  static BOOL  static BOOL
2464  compile_branch(int *optionsptr, int *brackets, uschar **codeptr,  compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2465    const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,    int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2466    int *reqbyteptr, branch_chain *bcptr, compile_data *cd)    compile_data *cd, int *lengthptr)
2467  {  {
2468  int repeat_type, op_type;  int repeat_type, op_type;
2469  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
# Line 1569  int greedy_default, greedy_non_default; Line 2472  int greedy_default, greedy_non_default;
2472  int firstbyte, reqbyte;  int firstbyte, reqbyte;
2473  int zeroreqbyte, zerofirstbyte;  int zeroreqbyte, zerofirstbyte;
2474  int req_caseopt, reqvary, tempreqvary;  int req_caseopt, reqvary, tempreqvary;
 int condcount = 0;  
2475  int options = *optionsptr;  int options = *optionsptr;
2476  int after_manual_callout = 0;  int after_manual_callout = 0;
2477    int length_prevgroup = 0;
2478  register int c;  register int c;
2479  register uschar *code = *codeptr;  register uschar *code = *codeptr;
2480    uschar *last_code = code;
2481    uschar *orig_code = code;
2482  uschar *tempcode;  uschar *tempcode;
2483  BOOL inescq = FALSE;  BOOL inescq = FALSE;
2484  BOOL groupsetfirstbyte = FALSE;  BOOL groupsetfirstbyte = FALSE;
# Line 1581  const uschar *ptr = *ptrptr; Line 2486  const uschar *ptr = *ptrptr;
2486  const uschar *tempptr;  const uschar *tempptr;
2487  uschar *previous = NULL;  uschar *previous = NULL;
2488  uschar *previous_callout = NULL;  uschar *previous_callout = NULL;
2489    uschar *save_hwm = NULL;
2490  uschar classbits[32];  uschar classbits[32];
2491    
2492  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2493  BOOL class_utf8;  BOOL class_utf8;
2494  BOOL utf8 = (options & PCRE_UTF8) != 0;  BOOL utf8 = (options & PCRE_UTF8) != 0;
2495  uschar *class_utf8data;  uschar *class_utf8data;
2496    uschar *class_utf8data_base;
2497  uschar utf8_char[6];  uschar utf8_char[6];
2498  #else  #else
2499  BOOL utf8 = FALSE;  BOOL utf8 = FALSE;
2500    uschar *utf8_char = NULL;
2501    #endif
2502    
2503    #ifdef DEBUG
2504    if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2505  #endif  #endif
2506    
2507  /* Set up the default and non-default settings for greediness */  /* Set up the default and non-default settings for greediness */
# Line 1621  req_caseopt = ((options & PCRE_CASELESS) Line 2533  req_caseopt = ((options & PCRE_CASELESS)
2533  for (;; ptr++)  for (;; ptr++)
2534    {    {
2535    BOOL negate_class;    BOOL negate_class;
2536      BOOL should_flip_negation;
2537    BOOL possessive_quantifier;    BOOL possessive_quantifier;
2538    BOOL is_quantifier;    BOOL is_quantifier;
2539      BOOL is_recurse;
2540      BOOL reset_bracount;
2541    int class_charcount;    int class_charcount;
2542    int class_lastchar;    int class_lastchar;
2543    int newoptions;    int newoptions;
2544    int recno;    int recno;
2545      int refsign;
2546    int skipbytes;    int skipbytes;
2547    int subreqbyte;    int subreqbyte;
2548    int subfirstbyte;    int subfirstbyte;
2549      int terminator;
2550    int mclength;    int mclength;
2551    uschar mcbuffer[8];    uschar mcbuffer[8];
2552    
2553    /* Next byte in the pattern */    /* Get next byte in the pattern */
2554    
2555    c = *ptr;    c = *ptr;
2556    
2557      /* If we are in the pre-compile phase, accumulate the length used for the
2558      previous cycle of this loop. */
2559    
2560      if (lengthptr != NULL)
2561        {
2562    #ifdef DEBUG
2563        if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2564    #endif
2565        if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2566          {
2567          *errorcodeptr = ERR52;
2568          goto FAILED;
2569          }
2570    
2571        /* There is at least one situation where code goes backwards: this is the
2572        case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2573        the class is simply eliminated. However, it is created first, so we have to
2574        allow memory for it. Therefore, don't ever reduce the length at this point.
2575        */
2576    
2577        if (code < last_code) code = last_code;
2578    
2579        /* Paranoid check for integer overflow */
2580    
2581        if (OFLOW_MAX - *lengthptr < code - last_code)
2582          {
2583          *errorcodeptr = ERR20;
2584          goto FAILED;
2585          }
2586    
2587        *lengthptr += code - last_code;
2588        DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2589    
2590        /* If "previous" is set and it is not at the start of the work space, move
2591        it back to there, in order to avoid filling up the work space. Otherwise,
2592        if "previous" is NULL, reset the current code pointer to the start. */
2593    
2594        if (previous != NULL)
2595          {
2596          if (previous > orig_code)
2597            {
2598            memmove(orig_code, previous, code - previous);
2599            code -= previous - orig_code;
2600            previous = orig_code;
2601            }
2602          }
2603        else code = orig_code;
2604    
2605        /* Remember where this code item starts so we can pick up the length
2606        next time round. */
2607    
2608        last_code = code;
2609        }
2610    
2611      /* In the real compile phase, just check the workspace used by the forward
2612      reference list. */
2613    
2614      else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2615        {
2616        *errorcodeptr = ERR52;
2617        goto FAILED;
2618        }
2619    
2620    /* If in \Q...\E, check for the end; if not, we have a literal */    /* If in \Q...\E, check for the end; if not, we have a literal */
2621    
2622    if (inescq && c != 0)    if (inescq && c != 0)
2623      {      {
2624      if (c == '\\' && ptr[1] == 'E')      if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
2625        {        {
2626        inescq = FALSE;        inescq = FALSE;
2627        ptr++;        ptr++;
# Line 1651  for (;; ptr++) Line 2631  for (;; ptr++)
2631        {        {
2632        if (previous_callout != NULL)        if (previous_callout != NULL)
2633          {          {
2634          complete_callout(previous_callout, ptr, cd);          if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2635              complete_callout(previous_callout, ptr, cd);
2636          previous_callout = NULL;          previous_callout = NULL;
2637          }          }
2638        if ((options & PCRE_AUTO_CALLOUT) != 0)        if ((options & PCRE_AUTO_CALLOUT) != 0)
# Line 1666  for (;; ptr++) Line 2647  for (;; ptr++)
2647    /* Fill in length of a previous callout, except when the next thing is    /* Fill in length of a previous callout, except when the next thing is
2648    a quantifier. */    a quantifier. */
2649    
2650    is_quantifier = c == '*' || c == '+' || c == '?' ||    is_quantifier =
2651      (c == '{' && is_counted_repeat(ptr+1));      c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2652        (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2653    
2654    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2655         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
2656      {      {
2657      complete_callout(previous_callout, ptr, cd);      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2658          complete_callout(previous_callout, ptr, cd);
2659      previous_callout = NULL;      previous_callout = NULL;
2660      }      }
2661    
# Line 1681  for (;; ptr++) Line 2664  for (;; ptr++)
2664    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
2665      {      {
2666      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2667      if (c == '#')      if (c == CHAR_NUMBER_SIGN)
2668        {        {
2669        /* The space before the ; is to avoid a warning on a silly compiler        while (*(++ptr) != 0)
2670        on the Macintosh. */          {
2671        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2672        if (c != 0) continue;   /* Else fall through to handle end of string */          }
2673          if (*ptr != 0) continue;
2674    
2675          /* Else fall through to handle end of string */
2676          c = 0;
2677        }        }
2678      }      }
2679    
# Line 1700  for (;; ptr++) Line 2687  for (;; ptr++)
2687    
2688    switch(c)    switch(c)
2689      {      {
2690      /* The branch terminates at end of string, |, or ). */      /* ===================================================================*/
2691        case 0:                        /* The branch terminates at string end */
2692      case 0:      case CHAR_VERTICAL_LINE:       /* or | or ) */
2693      case '|':      case CHAR_RIGHT_PARENTHESIS:
     case ')':  
2694      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2695      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2696      *codeptr = code;      *codeptr = code;
2697      *ptrptr = ptr;      *ptrptr = ptr;
2698        if (lengthptr != NULL)
2699          {
2700          if (OFLOW_MAX - *lengthptr < code - last_code)
2701            {
2702            *errorcodeptr = ERR20;
2703            goto FAILED;
2704            }
2705          *lengthptr += code - last_code;   /* To include callout length */
2706          DPRINTF((">> end branch\n"));
2707          }
2708      return TRUE;      return TRUE;
2709    
2710    
2711        /* ===================================================================*/
2712      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2713      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2714    
2715      case '^':      case CHAR_CIRCUMFLEX_ACCENT:
2716      if ((options & PCRE_MULTILINE) != 0)      if ((options & PCRE_MULTILINE) != 0)
2717        {        {
2718        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
# Line 1723  for (;; ptr++) Line 2721  for (;; ptr++)
2721      *code++ = OP_CIRC;      *code++ = OP_CIRC;
2722      break;      break;
2723    
2724      case '$':      case CHAR_DOLLAR_SIGN:
2725      previous = NULL;      previous = NULL;
2726      *code++ = OP_DOLL;      *code++ = OP_DOLL;
2727      break;      break;
# Line 1731  for (;; ptr++) Line 2729  for (;; ptr++)
2729      /* There can never be a first char if '.' is first, whatever happens about      /* There can never be a first char if '.' is first, whatever happens about
2730      repeats. The value of reqbyte doesn't change either. */      repeats. The value of reqbyte doesn't change either. */
2731    
2732      case '.':      case CHAR_DOT:
2733      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2734      zerofirstbyte = firstbyte;      zerofirstbyte = firstbyte;
2735      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
2736      previous = code;      previous = code;
2737      *code++ = OP_ANY;      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2738      break;      break;
2739    
2740    
2741        /* ===================================================================*/
2742      /* Character classes. If the included characters are all < 256, we build a      /* Character classes. If the included characters are all < 256, we build a
2743      32-byte bitmap of the permitted characters, except in the special case      32-byte bitmap of the permitted characters, except in the special case
2744      where there is only one such character. For negated classes, we build the      where there is only one such character. For negated classes, we build the
# Line 1749  for (;; ptr++) Line 2749  for (;; ptr++)
2749      opcode is compiled. It may optionally have a bit map for characters < 256,      opcode is compiled. It may optionally have a bit map for characters < 256,
2750      but those above are explicitly listed afterwards. A flag byte tells      but those above are explicitly listed afterwards. A flag byte tells
2751      whether the bitmap is present, and whether this is a negated class or not.      whether the bitmap is present, and whether this is a negated class or not.
     */  
2752    
2753      case '[':      In JavaScript compatibility mode, an isolated ']' causes an error. In
2754        default (Perl) mode, it is treated as a data character. */
2755    
2756        case CHAR_RIGHT_SQUARE_BRACKET:
2757        if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2758          {
2759          *errorcodeptr = ERR64;
2760          goto FAILED;
2761          }
2762        goto NORMAL_CHAR;
2763    
2764        case CHAR_LEFT_SQUARE_BRACKET:
2765      previous = code;      previous = code;
2766    
2767      /* PCRE supports POSIX class stuff inside a class. Perl gives an error if      /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2768      they are encountered at the top level, so we'll do that too. */      they are encountered at the top level, so we'll do that too. */
2769    
2770      if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&      if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2771          check_posix_syntax(ptr, &tempptr, cd))           ptr[1] == CHAR_EQUALS_SIGN) &&
2772            check_posix_syntax(ptr, &tempptr))
2773        {        {
2774        *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;        *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
2775        goto FAILED;        goto FAILED;
2776        }        }
2777    
2778      /* If the first character is '^', set the negation flag and skip it. */      /* If the first character is '^', set the negation flag and skip it. Also,
2779        if the first few characters (either before or after ^) are \Q\E or \E we
2780        skip them too. This makes for compatibility with Perl. */
2781    
2782      if ((c = *(++ptr)) == '^')      negate_class = FALSE;
2783        for (;;)
2784        {        {
       negate_class = TRUE;  
2785        c = *(++ptr);        c = *(++ptr);
2786          if (c == CHAR_BACKSLASH)
2787            {
2788            if (ptr[1] == CHAR_E)
2789              ptr++;
2790            else if (strncmp((const char *)ptr+1,
2791                              STR_Q STR_BACKSLASH STR_E, 3) == 0)
2792              ptr += 3;
2793            else
2794              break;
2795            }
2796          else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
2797            negate_class = TRUE;
2798          else break;
2799        }        }
2800      else  
2801        /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2802        an initial ']' is taken as a data character -- the code below handles
2803        that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2804        [^] must match any character, so generate OP_ALLANY. */
2805    
2806        if (c == CHAR_RIGHT_SQUARE_BRACKET &&
2807            (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2808        {        {
2809        negate_class = FALSE;        *code++ = negate_class? OP_ALLANY : OP_FAIL;
2810          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2811          zerofirstbyte = firstbyte;
2812          break;
2813        }        }
2814    
2815        /* If a class contains a negative special such as \S, we need to flip the
2816        negation flag at the end, so that support for characters > 255 works
2817        correctly (they are all included in the class). */
2818    
2819        should_flip_negation = FALSE;
2820    
2821      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
2822      of just a single character (as long as it's < 256). For higher valued UTF-8      of just a single character (as long as it's < 256). However, for higher
2823      characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
2824    
2825      class_charcount = 0;      class_charcount = 0;
2826      class_lastchar = -1;      class_lastchar = -1;
2827    
2828        /* Initialize the 32-char bit map to all zeros. We build the map in a
2829        temporary bit of memory, in case the class contains only 1 character (less
2830        than 256), because in that case the compiled code doesn't use the bit map.
2831        */
2832    
2833        memset(classbits, 0, 32 * sizeof(uschar));
2834    
2835  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2836      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2837      class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2838        class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
2839  #endif  #endif
2840    
     /* Initialize the 32-char bit map to all zeros. We have to build the  
     map in a temporary bit of store, in case the class contains only 1  
     character (< 256), because in that case the compiled code doesn't use the  
     bit map. */  
   
     memset(classbits, 0, 32 * sizeof(uschar));  
   
2841      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
2842      means that an initial ] is taken as a data character. The first pass      means that an initial ] is taken as a data character. At the start of the
2843      through the regex checked the overall syntax, so we don't need to be very      loop, c contains the first byte of the character. */
     strict here. At the start of the loop, c contains the first byte of the  
     character. */  
2844    
2845      do      if (c != 0) do
2846        {        {
2847          const uschar *oldptr;
2848    
2849  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2850        if (utf8 && c > 127)        if (utf8 && c > 127)
2851          {                           /* Braces are required because the */          {                           /* Braces are required because the */
2852          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
2853          }          }
2854    
2855          /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2856          data and reset the pointer. This is so that very large classes that
2857          contain a zillion UTF-8 characters no longer overwrite the work space
2858          (which is on the stack). */
2859    
2860          if (lengthptr != NULL)
2861            {
2862            *lengthptr += class_utf8data - class_utf8data_base;
2863            class_utf8data = class_utf8data_base;
2864            }
2865    
2866  #endif  #endif
2867    
2868        /* Inside \Q...\E everything is literal except \E */        /* Inside \Q...\E everything is literal except \E */
2869    
2870        if (inescq)        if (inescq)
2871          {          {
2872          if (c == '\\' && ptr[1] == 'E')          if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
2873            {            {
2874            inescq = FALSE;            inescq = FALSE;                   /* Reset literal state */
2875            ptr++;            ptr++;                            /* Skip the 'E' */
2876            continue;            continue;                         /* Carry on with next */
2877            }            }
2878          else goto LONE_SINGLE_CHARACTER;          goto CHECK_RANGE;                   /* Could be range if \E follows */
2879          }          }
2880    
2881        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
# Line 1829  for (;; ptr++) Line 2884  for (;; ptr++)
2884        [.ch.] and [=ch=] ("collating elements") and fault them, as Perl        [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2885        5.6 and 5.8 do. */        5.6 and 5.8 do. */
2886    
2887        if (c == '[' &&        if (c == CHAR_LEFT_SQUARE_BRACKET &&
2888            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&            (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2889            check_posix_syntax(ptr, &tempptr, cd))             ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
2890          {          {
2891          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
2892          int posix_class, taboffset, tabopt;          int posix_class, taboffset, tabopt;
2893          register const uschar *cbits = cd->cbits;          register const uschar *cbits = cd->cbits;
2894          uschar pbits[32];          uschar pbits[32];
2895    
2896          if (ptr[1] != ':')          if (ptr[1] != CHAR_COLON)
2897            {            {
2898            *errorcodeptr = ERR31;            *errorcodeptr = ERR31;
2899            goto FAILED;            goto FAILED;
2900            }            }
2901    
2902          ptr += 2;          ptr += 2;
2903          if (*ptr == '^')          if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
2904            {            {
2905            local_negate = TRUE;            local_negate = TRUE;
2906              should_flip_negation = TRUE;  /* Note negative special */
2907            ptr++;            ptr++;
2908            }            }
2909    
# Line 1911  for (;; ptr++) Line 2967  for (;; ptr++)
2967          }          }
2968    
2969        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
2970        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. The sequence \b is a special
2971        validity in the pre-compiling pass. The sequence \b is a special case.        case. Inside a class (and only there) it is treated as backspace.
2972        Inside a class (and only there) it is treated as backspace. Elsewhere        Elsewhere it marks a word boundary. Other escapes have preset maps ready
2973        it marks a word boundary. Other escapes have preset maps ready to        to 'or' into the one we are building. We assume they have more than one
       or into the one we are building. We assume they have more than one  
2974        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
2975    
2976        if (c == '\\')        if (c == CHAR_BACKSLASH)
2977          {          {
2978          c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2979            if (*errorcodeptr != 0) goto FAILED;
2980    
2981          if (-c == ESC_b) c = '\b';       /* \b is backslash in a class */          if (-c == ESC_b) c = CHAR_BS;       /* \b is backspace in a class */
2982          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = CHAR_X;   /* \X is literal X in a class */
2983            else if (-c == ESC_R) c = CHAR_R;   /* \R is literal R in a class */
2984          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
2985            {            {
2986            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
2987              {              {
2988              ptr += 2; /* avoid empty string */              ptr += 2; /* avoid empty string */
2989              }              }
2990            else inescq = TRUE;            else inescq = TRUE;
2991            continue;            continue;
2992            }            }
2993            else if (-c == ESC_E) continue;  /* Ignore orphan \E */
2994    
2995          if (c < 0)          if (c < 0)
2996            {            {
2997            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
2998            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
2999            switch (-c)  
3000              /* Save time by not doing this in the pre-compile phase. */
3001    
3002              if (lengthptr == NULL) switch (-c)
3003              {              {
3004              case ESC_d:              case ESC_d:
3005              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3006              continue;              continue;
3007    
3008              case ESC_D:              case ESC_D:
3009                should_flip_negation = TRUE;
3010              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3011              continue;              continue;
3012    
# Line 1953  for (;; ptr++) Line 3015  for (;; ptr++)
3015              continue;              continue;
3016    
3017              case ESC_W:              case ESC_W:
3018                should_flip_negation = TRUE;
3019              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3020              continue;              continue;
3021    
# Line 1962  for (;; ptr++) Line 3025  for (;; ptr++)
3025              continue;              continue;
3026    
3027              case ESC_S:              case ESC_S:
3028                should_flip_negation = TRUE;
3029              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3030              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
3031              continue;              continue;
3032    
3033  #ifdef SUPPORT_UCP              default:    /* Not recognized; fall through */
3034              case ESC_p:              break;      /* Need "default" setting to stop compiler warning. */
3035              case ESC_P:              }
3036    
3037              /* In the pre-compile phase, just do the recognition. */
3038    
3039              else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
3040                       c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
3041    
3042              /* We need to deal with \H, \h, \V, and \v in both phases because
3043              they use extra memory. */
3044    
3045              if (-c == ESC_h)
3046                {
3047                SETBIT(classbits, 0x09); /* VT */
3048                SETBIT(classbits, 0x20); /* SPACE */
3049                SETBIT(classbits, 0xa0); /* NSBP */
3050    #ifdef SUPPORT_UTF8
3051                if (utf8)
3052                {                {
               BOOL negated;  
               int pdata;  
               int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);  
               if (ptype < 0) goto FAILED;  
3053                class_utf8 = TRUE;                class_utf8 = TRUE;
3054                *class_utf8data++ = ((-c == ESC_p) != negated)?                *class_utf8data++ = XCL_SINGLE;
3055                  XCL_PROP : XCL_NOTPROP;                class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3056                *class_utf8data++ = ptype;                *class_utf8data++ = XCL_SINGLE;
3057                *class_utf8data++ = pdata;                class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3058                class_charcount -= 2;   /* Not a < 256 character */                *class_utf8data++ = XCL_RANGE;
3059                  class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3060                  class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3061                  *class_utf8data++ = XCL_SINGLE;
3062                  class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3063                  *class_utf8data++ = XCL_SINGLE;
3064                  class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3065                  *class_utf8data++ = XCL_SINGLE;
3066                  class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3067                }                }
             continue;  
3068  #endif  #endif
3069                continue;
3070                }
3071    
3072              /* Unrecognized escapes are faulted if PCRE is running in its            if (-c == ESC_H)
3073              strict mode. By default, for compatibility with Perl, they are              {
3074              treated as literals. */              for (c = 0; c < 32; c++)
3075                  {
3076                  int x = 0xff;
3077                  switch (c)
3078                    {
3079                    case 0x09/8: x ^= 1 << (0x09%8); break;
3080                    case 0x20/8: x ^= 1 << (0x20%8); break;
3081                    case 0xa0/8: x ^= 1 << (0xa0%8); break;
3082                    default: break;
3083                    }
3084                  classbits[c] |= x;
3085                  }
3086    
3087              default:  #ifdef SUPPORT_UTF8
3088              if ((options & PCRE_EXTRA) != 0)              if (utf8)
3089                {                {
3090                *errorcodeptr = ERR7;                class_utf8 = TRUE;
3091                goto FAILED;                *class_utf8data++ = XCL_RANGE;
3092                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3093                  class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3094                  *class_utf8data++ = XCL_RANGE;
3095                  class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3096                  class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3097                  *class_utf8data++ = XCL_RANGE;
3098                  class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3099                  class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3100                  *class_utf8data++ = XCL_RANGE;
3101                  class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3102                  class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3103                  *class_utf8data++ = XCL_RANGE;
3104                  class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3105                  class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3106                  *class_utf8data++ = XCL_RANGE;
3107                  class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3108                  class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3109                  *class_utf8data++ = XCL_RANGE;
3110                  class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3111                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3112                }                }
3113              c = *ptr;              /* The final character */  #endif
3114              class_charcount -= 2;  /* Undo the default count from above */              continue;
3115              }              }
           }  
   
         /* Fall through if we have a single character (c >= 0). This may be  
         > 256 in UTF-8 mode. */  
   
         }   /* End of backslash handling */  
   
       /* A single character may be followed by '-' to form a range. However,  
       Perl does not permit ']' to be the end of the range. A '-' character  
       here is treated as a literal. */  
   
       if (ptr[1] == '-' && ptr[2] != ']')  
         {  
         int d;  
         ptr += 2;  
3116    
3117              if (-c == ESC_v)
3118                {
3119                SETBIT(classbits, 0x0a); /* LF */
3120                SETBIT(classbits, 0x0b); /* VT */
3121                SETBIT(classbits, 0x0c); /* FF */
3122                SETBIT(classbits, 0x0d); /* CR */
3123                SETBIT(classbits, 0x85); /* NEL */
3124  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3125          if (utf8)              if (utf8)
3126            {                           /* Braces are required because the */                {
3127            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */                class_utf8 = TRUE;
3128            }                *class_utf8data++ = XCL_RANGE;
3129          else                class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3130                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3131                  }
3132    #endif
3133                continue;
3134                }
3135    
3136              if (-c == ESC_V)
3137                {
3138                for (c = 0; c < 32; c++)
3139                  {
3140                  int x = 0xff;
3141                  switch (c)
3142                    {
3143                    case 0x0a/8: x ^= 1 << (0x0a%8);
3144                                 x ^= 1 << (0x0b%8);
3145                                 x ^= 1 << (0x0c%8);
3146                                 x ^= 1 << (0x0d%8);
3147                                 break;
3148                    case 0x85/8: x ^= 1 << (0x85%8); break;
3149                    default: break;
3150                    }
3151                  classbits[c] |= x;
3152                  }
3153    
3154    #ifdef SUPPORT_UTF8
3155                if (utf8)
3156                  {
3157                  class_utf8 = TRUE;
3158                  *class_utf8data++ = XCL_RANGE;
3159                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3160                  class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3161                  *class_utf8data++ = XCL_RANGE;
3162                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3163                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3164                  }
3165    #endif
3166                continue;
3167                }
3168    
3169              /* We need to deal with \P and \p in both phases. */
3170    
3171    #ifdef SUPPORT_UCP
3172              if (-c == ESC_p || -c == ESC_P)
3173                {
3174                BOOL negated;
3175                int pdata;
3176                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3177                if (ptype < 0) goto FAILED;
3178                class_utf8 = TRUE;
3179                *class_utf8data++ = ((-c == ESC_p) != negated)?
3180                  XCL_PROP : XCL_NOTPROP;
3181                *class_utf8data++ = ptype;
3182                *class_utf8data++ = pdata;
3183                class_charcount -= 2;   /* Not a < 256 character */
3184                continue;
3185                }
3186    #endif
3187              /* Unrecognized escapes are faulted if PCRE is running in its
3188              strict mode. By default, for compatibility with Perl, they are
3189              treated as literals. */
3190    
3191              if ((options & PCRE_EXTRA) != 0)
3192                {
3193                *errorcodeptr = ERR7;
3194                goto FAILED;
3195                }
3196    
3197              class_charcount -= 2;  /* Undo the default count from above */
3198              c = *ptr;              /* Get the final character and fall through */
3199              }
3200    
3201            /* Fall through if we have a single character (c >= 0). This may be
3202            greater than 256 in UTF-8 mode. */
3203    
3204            }   /* End of backslash handling */
3205    
3206          /* A single character may be followed by '-' to form a range. However,
3207          Perl does not permit ']' to be the end of the range. A '-' character
3208          at the end is treated as a literal. Perl ignores orphaned \E sequences
3209          entirely. The code for handling \Q and \E is messy. */
3210    
3211          CHECK_RANGE:
3212          while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3213            {
3214            inescq = FALSE;
3215            ptr += 2;
3216            }
3217    
3218          oldptr = ptr;
3219    
3220          /* Remember \r or \n */
3221    
3222          if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3223    
3224          /* Check for range */
3225    
3226          if (!inescq && ptr[1] == CHAR_MINUS)
3227            {
3228            int d;
3229            ptr += 2;
3230            while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3231    
3232            /* If we hit \Q (not followed by \E) at this point, go into escaped
3233            mode. */
3234    
3235            while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3236              {
3237              ptr += 2;
3238              if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3239                { ptr += 2; continue; }
3240              inescq = TRUE;
3241              break;
3242              }
3243    
3244            if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3245              {
3246              ptr = oldptr;
3247              goto LONE_SINGLE_CHARACTER;
3248              }
3249    
3250    #ifdef SUPPORT_UTF8
3251            if (utf8)
3252              {                           /* Braces are required because the */
3253              GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
3254              }
3255            else
3256  #endif  #endif
3257          d = *ptr;  /* Not UTF-8 mode */          d = *ptr;  /* Not UTF-8 mode */
3258    
# Line 2026  for (;; ptr++) Line 3260  for (;; ptr++)
3260          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3261          in such circumstances. */          in such circumstances. */
3262    
3263          if (d == '\\')          if (!inescq && d == CHAR_BACKSLASH)
3264            {            {
3265            const uschar *oldptr = ptr;            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3266            d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);            if (*errorcodeptr != 0) goto FAILED;
3267    
3268            /* \b is backslash; \X is literal X; any other special means the '-'            /* \b is backspace; \X is literal X; \R is literal R; any other
3269            was literal */            special means the '-' was literal */
3270    
3271            if (d < 0)            if (d < 0)
3272              {              {
3273              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = CHAR_BS;
3274              else if (d == -ESC_X) d = 'X'; else              else if (d == -ESC_X) d = CHAR_X;
3275                else if (d == -ESC_R) d = CHAR_R; else
3276                {                {
3277                ptr = oldptr - 2;                ptr = oldptr;
3278                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3279                }                }
3280              }              }
3281            }            }
3282    
3283          /* The check that the two values are in the correct order happens in          /* Check that the two values are in the correct order. Optimize
3284          the pre-pass. Optimize one-character ranges */          one-character ranges */
3285    
3286            if (d < c)
3287              {
3288              *errorcodeptr = ERR8;
3289              goto FAILED;
3290              }
3291    
3292          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3293    
3294            /* Remember \r or \n */
3295    
3296            if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3297    
3298          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3299          matching, we have to use an XCLASS with extra data items. Caseless          matching, we have to use an XCLASS with extra data items. Caseless
3300          matching for characters > 127 is available only if UCP support is          matching for characters > 127 is available only if UCP support is
# Line 2067  for (;; ptr++) Line 3312  for (;; ptr++)
3312  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3313            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
3314              {              {
3315              int occ, ocd;              unsigned int occ, ocd;
3316              int cc = c;              unsigned int cc = c;
3317              int origd = d;              unsigned int origd = d;
3318              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
3319                {                {
3320                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= (unsigned int)c &&
3321                      ocd <= (unsigned int)d)
3322                    continue;                          /* Skip embedded ranges */
3323    
3324                if (occ < c  && ocd >= c - 1)        /* Extend the basic range */                if (occ < (unsigned int)c  &&
3325                      ocd >= (unsigned int)c - 1)      /* Extend the basic range */
3326                  {                                  /* if there is overlap,   */                  {                                  /* if there is overlap,   */
3327                  c = occ;                           /* noting that if occ < c */                  c = occ;                           /* noting that if occ < c */
3328                  continue;                          /* we can't have ocd > d  */                  continue;                          /* we can't have ocd > d  */
3329                  }                                  /* because a subrange is  */                  }                                  /* because a subrange is  */
3330                if (ocd > d && occ <= d + 1)         /* always shorter than    */                if (ocd > (unsigned int)d &&
3331                      occ <= (unsigned int)d + 1)      /* always shorter than    */
3332                  {                                  /* the basic range.       */                  {                                  /* the basic range.       */
3333                  d = ocd;                  d = ocd;
3334                  continue;                  continue;
# Line 2127  for (;; ptr++) Line 3376  for (;; ptr++)
3376          ranges that lie entirely within 0-127 when there is UCP support; else          ranges that lie entirely within 0-127 when there is UCP support; else
3377          for partial ranges without UCP support. */          for partial ranges without UCP support. */
3378    
3379          for (; c <= d; c++)          class_charcount += d - c + 1;
3380            class_lastchar = d;
3381    
3382            /* We can save a bit of time by skipping this in the pre-compile. */
3383    
3384            if (lengthptr == NULL) for (; c <= d; c++)
3385            {            {
3386            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
3387            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
# Line 2135  for (;; ptr++) Line 3389  for (;; ptr++)
3389              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
3390              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
3391              }              }
           class_charcount++;                /* in case a one-char range */  
           class_lastchar = c;  
3392            }            }
3393    
3394          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
# Line 2160  for (;; ptr++) Line 3412  for (;; ptr++)
3412  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3413          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
3414            {            {
3415            int othercase;            unsigned int othercase;
3416            if ((othercase = _pcre_ucp_othercase(c)) >= 0)            if ((othercase = UCD_OTHERCASE(c)) != c)
3417              {              {
3418              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
3419              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 2186  for (;; ptr++) Line 3438  for (;; ptr++)
3438          }          }
3439        }        }
3440    
3441      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached. This "while" is the end of the "do" above. */
3442      loop. This "while" is the end of the "do" above. */  
3443        while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3444    
3445        if (c == 0)                          /* Missing terminating ']' */
3446          {
3447          *errorcodeptr = ERR6;
3448          goto FAILED;
3449          }
3450    
3451    
3452    /* This code has been disabled because it would mean that \s counts as
3453    an explicit \r or \n reference, and that's not really what is wanted. Now
3454    we set the flag only if there is a literal "\r" or "\n" in the class. */
3455    
3456    #if 0
3457        /* Remember whether \r or \n are in this class */
3458    
3459        if (negate_class)
3460          {
3461          if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3462          }
3463        else
3464          {
3465          if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3466          }
3467    #endif
3468    
     while ((c = *(++ptr)) != ']' || inescq);  
3469    
3470      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
3471      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. As long as there were no characters >= 128 and there was no
3472      can optimize the negative case only if there were no characters >= 128      use of \p or \P, in other words, no use of any XCLASS features, we can
3473      because OP_NOT and the related opcodes like OP_NOTSTAR operate on      optimize.
3474      single-bytes only. This is an historical hangover. Maybe one day we can  
3475      tidy these opcodes to handle multi-byte characters.      In UTF-8 mode, we can optimize the negative case only if there were no
3476        characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3477        operate on single-bytes only. This is an historical hangover. Maybe one day
3478        we can tidy these opcodes to handle multi-byte characters.
3479    
3480      The optimization throws away the bit map. We turn the item into a      The optimization throws away the bit map. We turn the item into a
3481      1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note      1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
# Line 2206  for (;; ptr++) Line 3485  for (;; ptr++)
3485      reqbyte, save the previous value for reinstating. */      reqbyte, save the previous value for reinstating. */
3486    
3487  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3488      if (class_charcount == 1 &&      if (class_charcount == 1 && !class_utf8 &&
3489            (!utf8 ||        (!utf8 || !negate_class || class_lastchar < 128))
           (!class_utf8 && (!negate_class || class_lastchar < 128))))  
   
3490  #else  #else
3491      if (class_charcount == 1)      if (class_charcount == 1)
3492  #endif  #endif
# Line 2252  for (;; ptr++) Line 3529  for (;; ptr++)
3529      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
3530    
3531      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
3532      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode, unless there was a negated special
3533      we can omit the bitmap. */      such as \S in the class, because in that case all characters > 255 are in
3534        the class, so any that were explicitly given as well can be ignored. If
3535        (when there are explicit characters > 255 that must be listed) there are no
3536        characters < 256, we can omit the bitmap in the actual compiled code. */
3537    
3538  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3539      if (class_utf8)      if (class_utf8 && !should_flip_negation)
3540        {        {
3541        *class_utf8data++ = XCL_END;    /* Marks the end of extra data */        *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
3542        *code++ = OP_XCLASS;        *code++ = OP_XCLASS;
3543        code += LINK_SIZE;        code += LINK_SIZE;
3544        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT : 0;
3545    
3546        /* If the map is required, install it, and move on to the end of        /* If the map is required, move up the extra data to make room for it;
3547        the extra data */        otherwise just move the code pointer to the end of the extra data. */
3548    
3549        if (class_charcount > 0)        if (class_charcount > 0)
3550          {          {
3551          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
3552            memmove(code + 32, code, class_utf8data - code);
3553          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
3554          code = class_utf8data;          code = class_utf8data + 32;
         }  
   
       /* If the map is not required, slide down the extra data. */  
   
       else  
         {  
         int len = class_utf8data - (code + 33);  
         memmove(code + 1, code + 33, len);  
         code += len + 1;  
3555          }          }
3556          else code = class_utf8data;
3557    
3558        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
3559    
# Line 2289  for (;; ptr++) Line 3562  for (;; ptr++)
3562        }        }
3563  #endif  #endif
3564    
3565      /* If there are no characters > 255, negate the 32-byte map if necessary,      /* If there are no characters > 255, set the opcode to OP_CLASS or
3566      and copy it into the code vector. If this is the first thing in the branch,      OP_NCLASS, depending on whether the whole class was negated and whether
3567      there can be no first char setting, whatever the repeat count. Any reqbyte      there were negative specials such as \S in the class. Then copy the 32-byte
3568      setting must remain unchanged after any kind of repeat. */      map into the code vector, negating it if necessary. */
3569    
3570        *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3571      if (negate_class)      if (negate_class)
3572        {        {
3573        *code++ = OP_NCLASS;        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3574        for (c = 0; c < 32; c++) code[c] = ~classbits[c];          for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3575        }        }
3576      else      else
3577        {        {
       *code++ = OP_CLASS;  
3578        memcpy(code, classbits, 32);        memcpy(code, classbits, 32);
3579        }        }
3580      code += 32;      code += 32;
3581      break;      break;
3582    
3583    
3584        /* ===================================================================*/
3585      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3586      has been tested above. */      has been tested above. */
3587    
3588      case '{':      case CHAR_LEFT_CURLY_BRACKET:
3589      if (!is_quantifier) goto NORMAL_CHAR;      if (!is_quantifier) goto NORMAL_CHAR;
3590      ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);      ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3591      if (*errorcodeptr != 0) goto FAILED;      if (*errorcodeptr != 0) goto FAILED;
3592      goto REPEAT;      goto REPEAT;
3593    
3594      case '*':      case CHAR_ASTERISK:
3595      repeat_min = 0;      repeat_min = 0;
3596      repeat_max = -1;      repeat_max = -1;
3597      goto REPEAT;      goto REPEAT;
3598    
3599      case '+':      case CHAR_PLUS:
3600      repeat_min = 1;      repeat_min = 1;
3601      repeat_max = -1;      repeat_max = -1;
3602      goto REPEAT;      goto REPEAT;
3603    
3604      case '?':      case CHAR_QUESTION_MARK:
3605      repeat_min = 0;      repeat_min = 0;
3606      repeat_max = 1;      repeat_max = 1;
3607    
# Line 2361  for (;; ptr++) Line 3636  for (;; ptr++)
3636      but if PCRE_UNGREEDY is set, it works the other way round. We change the      but if PCRE_UNGREEDY is set, it works the other way round. We change the
3637      repeat type to the non-default. */      repeat type to the non-default. */
3638    
3639      if (ptr[1] == '+')      if (ptr[1] == CHAR_PLUS)
3640        {        {
3641        repeat_type = 0;                  /* Force greedy */        repeat_type = 0;                  /* Force greedy */
3642        possessive_quantifier = TRUE;        possessive_quantifier = TRUE;
3643        ptr++;        ptr++;
3644        }        }
3645      else if (ptr[1] == '?')      else if (ptr[1] == CHAR_QUESTION_MARK)
3646        {        {
3647        repeat_type = greedy_non_default;        repeat_type = greedy_non_default;
3648        ptr++;        ptr++;
3649        }        }
3650      else repeat_type = greedy_default;      else repeat_type = greedy_default;
3651    
     /* If previous was a recursion, we need to wrap it inside brackets so that  
     it can be replicated if necessary. */  
   
     if (*previous == OP_RECURSE)  
       {  
       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);  
       code += 1 + LINK_SIZE;  
       *previous = OP_BRA;  
       PUT(previous, 1, code - previous);  
       *code = OP_KET;  
       PUT(code, 1, code - previous);  
       code += 1 + LINK_SIZE;  
       }  
   
3652      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
3653      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
3654      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
# Line 2421  for (;; ptr++) Line 3682  for (;; ptr++)
3682          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3683          }          }
3684    
3685          /* If the repetition is unlimited, it pays to see if the next thing on
3686          the line is something that cannot possibly match this character. If so,
3687          automatically possessifying this item gains some performance in the case
3688          where the match fails. */
3689    
3690          if (!possessive_quantifier &&
3691              repeat_max < 0 &&
3692              check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3693                options, cd))
3694            {
3695            repeat_type = 0;    /* Force greedy */
3696            possessive_quantifier = TRUE;
3697            }
3698    
3699        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3700        }        }
3701    
3702      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
3703      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
3704      character repeats by setting opt_type to add a suitable offset into      character repeats by setting opt_type to add a suitable offset into
3705      repeat_type. OP_NOT is currently used only for single-byte chars. */      repeat_type. We can also test for auto-possessification. OP_NOT is
3706        currently used only for single-byte chars. */
3707    
3708      else if (*previous == OP_NOT)      else if (*previous == OP_NOT)
3709        {        {
3710        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3711        c = previous[1];        c = previous[1];
3712          if (!possessive_quantifier &&
3713              repeat_max < 0 &&
3714              check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3715            {
3716            repeat_type = 0;    /* Force greedy */
3717            possessive_quantifier = TRUE;
3718            }
3719        goto OUTPUT_SINGLE_REPEAT;        goto OUTPUT_SINGLE_REPEAT;
3720        }        }
3721    
# Line 2450  for (;; ptr++) Line 3733  for (;; ptr++)
3733        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3734        c = *previous;        c = *previous;
3735    
3736          if (!possessive_quantifier &&
3737              repeat_max < 0 &&
3738              check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3739            {
3740            repeat_type = 0;    /* Force greedy */
3741            possessive_quantifier = TRUE;
3742            }
3743    
3744        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
3745        if (*previous == OP_PROP || *previous == OP_NOTPROP)        if (*previous == OP_PROP || *previous == OP_NOTPROP)
3746          {          {
# Line 2469  for (;; ptr++) Line 3760  for (;; ptr++)
3760        /* All real repeats make it impossible to handle partial matching (maybe        /* All real repeats make it impossible to handle partial matching (maybe
3761        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
3762    
3763        if (repeat_max != 1) cd->nopartial = TRUE;        if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3764    
3765        /* Combine the op_type with the repeat_type */        /* Combine the op_type with the repeat_type */
3766    
# Line 2490  for (;; ptr++) Line 3781  for (;; ptr++)
3781          }          }
3782    
3783        /* A repeat minimum of 1 is optimized into some special cases. If the        /* A repeat minimum of 1 is optimized into some special cases. If the
3784        maximum is unlimited, we use OP_PLUS. Otherwise, the original item it        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3785        left in place and, if the maximum is greater than 1, we use OP_UPTO with        left in place and, if the maximum is greater than 1, we use OP_UPTO with
3786        one less than the maximum. */        one less than the maximum. */
3787    
# Line 2543  for (;; ptr++) Line 3834  for (;; ptr++)
3834            }            }
3835    
3836          /* Else insert an UPTO if the max is greater than the min, again          /* Else insert an UPTO if the max is greater than the min, again
3837          preceded by the character, for the previously inserted code. */          preceded by the character, for the previously inserted code. If the
3838            UPTO is just for 1 instance, we can use QUERY instead. */
3839    
3840          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
3841            {            {
# Line 2562  for (;; ptr++) Line 3854  for (;; ptr++)
3854              *code++ = prop_value;              *code++ = prop_value;
3855              }              }
3856            repeat_max -= repeat_min;            repeat_max -= repeat_min;
3857            *code++ = OP_UPTO + repeat_type;  
3858            PUT2INC(code, 0, repeat_max);            if (repeat_max == 1)
3859                {
3860                *code++ = OP_QUERY + repeat_type;
3861                }
3862              else
3863                {
3864                *code++ = OP_UPTO + repeat_type;
3865                PUT2INC(code, 0, repeat_max);
3866                }
3867            }            }
3868          }          }
3869    
# Line 2610  for (;; ptr++) Line 3910  for (;; ptr++)
3910        /* All real repeats make it impossible to handle partial matching (maybe        /* All real repeats make it impossible to handle partial matching (maybe
3911        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
3912    
3913        if (repeat_max != 1) cd->nopartial = TRUE;        if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3914    
3915        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
3916          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
# Line 2630  for (;; ptr++) Line 3930  for (;; ptr++)
3930      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
3931      cases. */      cases. */
3932    
3933      else if (*previous >= OP_BRA || *previous == OP_ONCE ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3934               *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
3935        {        {
3936        register int i;        register int i;
3937        int ketoffset = 0;        int ketoffset = 0;
3938        int len = code - previous;        int len = code - previous;
3939        uschar *bralink = NULL;        uschar *bralink = NULL;
3940    
3941          /* Repeating a DEFINE group is pointless */
3942    
3943          if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3944            {
3945            *errorcodeptr = ERR55;
3946            goto FAILED;
3947            }
3948    
3949        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
3950        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
3951        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 2660  for (;; ptr++) Line 3968  for (;; ptr++)
3968    
3969        if (repeat_min == 0)        if (repeat_min == 0)
3970          {          {
3971          /* If the maximum is also zero, we just omit the group from the output          /* If the maximum is also zero, we used to just omit the group from the
3972          altogether. */          output altogether, like this:
   
         if (repeat_max == 0)  
           {  
           code = previous;  
           goto END_REPEAT;  
           }  
3973    
3974          /* If the maximum is 1 or unlimited, we just have to stick in the          ** if (repeat_max == 0)
3975          BRAZERO and do no more at this point. However, we do need to adjust          **   {
3976          any OP_RECURSE calls inside the group that refer to the group itself or          **   code = previous;
3977          any internal group, because the offset is from the start of the whole          **   goto END_REPEAT;
3978          regex. Temporarily terminate the pattern while doing this. */          **   }
3979    
3980            However, that fails when a group is referenced as a subroutine from
3981            elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
3982            so that it is skipped on execution. As we don't have a list of which
3983            groups are referenced, we cannot do this selectively.
3984    
3985            If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
3986            and do no more at this point. However, we do need to adjust any
3987            OP_RECURSE calls inside the group that refer to the group itself or any
3988            internal or forward referenced group, because the offset is from the
3989            start of the whole regex. Temporarily terminate the pattern while doing
3990            this. */
3991    
3992          if (repeat_max <= 1)          if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
3993            {            {
3994            *code = OP_END;            *code = OP_END;
3995            adjust_recurse(previous, 1, utf8, cd);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3996            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3997            code++;            code++;
3998              if (repeat_max == 0)
3999                {
4000                *previous++ = OP_SKIPZERO;
4001                goto END_REPEAT;
4002                }
4003            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
4004            }            }
4005    
# Line 2696  for (;; ptr++) Line 4015  for (;; ptr++)
4015            {            {
4016            int offset;            int offset;
4017            *code = OP_END;            *code = OP_END;
4018            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
4019            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
4020            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
4021            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2716  for (;; ptr++) Line 4035  for (;; ptr++)
4035        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
4036        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
4037        copies that we need. If we set a first char from the group, and didn't        copies that we need. If we set a first char from the group, and didn't
4038        set a required char, copy the latter from the former. */        set a required char, copy the latter from the former. If there are any
4039          forward reference subroutine calls in the group, there will be entries on
4040          the workspace list; replicate these with an appropriate increment. */
4041    
4042        else        else
4043          {          {
4044          if (repeat_min > 1)          if (repeat_min > 1)
4045            {            {
4046            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;            /* In the pre-compile phase, we don't actually do the replication. We
4047            for (i = 1; i < repeat_min; i++)            just adjust the length as if we had. Do some paranoid checks for
4048              potential integer overflow. */
4049    
4050              if (lengthptr != NULL)
4051                {
4052                int delta = (repeat_min - 1)*length_prevgroup;
4053                if ((double)(repeat_min - 1)*(double)length_prevgroup >
4054                                                                (double)INT_MAX ||
4055                    OFLOW_MAX - *lengthptr < delta)
4056                  {
4057                  *errorcodeptr = ERR20;
4058                  goto FAILED;
4059                  }
4060                *lengthptr += delta;
4061                }
4062    
4063              /* This is compiling for real */
4064    
4065              else
4066              {              {
4067              memcpy(code, previous, len);              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
4068              code += len;              for (i = 1; i < repeat_min; i++)
4069                  {
4070                  uschar *hc;
4071                  uschar *this_hwm = cd->hwm;
4072                  memcpy(code, previous, len);
4073                  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4074                    {
4075                    PUT(cd->hwm, 0, GET(hc, 0) + len);
4076                    cd->hwm += LINK_SIZE;
4077                    }
4078                  save_hwm = this_hwm;
4079                  code += len;
4080                  }
4081              }              }
4082            }            }
4083    
4084          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
4085          }          }
4086    
# Line 2736  for (;; ptr++) Line 4088  for (;; ptr++)
4088        the maximum is limited, it replicates the group in a nested fashion,        the maximum is limited, it replicates the group in a nested fashion,
4089        remembering the bracket starts on a stack. In the case of a zero minimum,        remembering the bracket starts on a stack. In the case of a zero minimum,
4090        the first one was set up above. In all cases the repeat_max now specifies        the first one was set up above. In all cases the repeat_max now specifies
4091        the number of additional copies needed. */        the number of additional copies needed. Again, we must remember to
4092          replicate entries on the forward reference list. */
4093    
4094        if (repeat_max >= 0)        if (repeat_max >= 0)
4095          {          {
4096          for (i = repeat_max - 1; i >= 0; i--)          /* In the pre-compile phase, we don't actually do the replication. We
4097            just adjust the length as if we had. For each repetition we must add 1
4098            to the length for BRAZERO and for all but the last repetition we must
4099            add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
4100            paranoid checks to avoid integer overflow. */
4101    
4102            if (lengthptr != NULL && repeat_max > 0)
4103              {
4104              int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4105                          2 - 2*LINK_SIZE;   /* Last one doesn't nest */
4106              if ((double)repeat_max *
4107                    (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4108                      > (double)INT_MAX ||
4109                  OFLOW_MAX - *lengthptr < delta)
4110                {
4111                *errorcodeptr = ERR20;
4112                goto FAILED;
4113                }
4114              *lengthptr += delta;
4115              }
4116    
4117            /* This is compiling for real */
4118    
4119            else for (i = repeat_max - 1; i >= 0; i--)
4120            {            {
4121              uschar *hc;
4122              uschar *this_hwm = cd->hwm;
4123    
4124            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
4125    
4126            /* All but the final copy start a new nesting, maintaining the            /* All but the final copy start a new nesting, maintaining the
# Line 2757  for (;; ptr++) Line 4136  for (;; ptr++)
4136              }              }
4137    
4138            memcpy(code, previous, len);            memcpy(code, previous, len);
4139              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4140                {
4141                PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4142                cd->hwm += LINK_SIZE;
4143                }
4144              save_hwm = this_hwm;
4145            code += len;            code += len;
4146            }            }
4147    
# Line 2779  for (;; ptr++) Line 4164  for (;; ptr++)
4164        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
4165        can't just offset backwards from the current code point, because we        can't just offset backwards from the current code point, because we
4166        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
4167        correct offset was computed above. */        correct offset was computed above.
4168    
4169        else code[-ketoffset] = OP_KETRMAX + repeat_type;        Then, when we are doing the actual compile phase, check to see whether
4170          this group is a non-atomic one that could match an empty string. If so,
4171          convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4172          that runtime checking can be done. [This check is also applied to
4173          atomic groups at runtime, but in a different way.] */
4174    
4175          else
4176            {
4177            uschar *ketcode = code - ketoffset;
4178            uschar *bracode = ketcode - GET(ketcode, 1);
4179            *ketcode = OP_KETRMAX + repeat_type;
4180            if (lengthptr == NULL && *bracode != OP_ONCE)
4181              {
4182              uschar *scode = bracode;
4183              do
4184                {
4185                if (could_be_empty_branch(scode, ketcode, utf8))
4186                  {
4187                  *bracode += OP_SBRA - OP_BRA;
4188                  break;
4189                  }
4190                scode += GET(scode, 1);
4191                }
4192              while (*scode == OP_ALT);
4193              }
4194            }
4195        }        }
4196    
4197        /* If previous is OP_FAIL, it was generated by an empty class [] in
4198        JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4199        by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4200        error above. We can just ignore the repeat in JS case. */
4201    
4202        else if (*previous == OP_FAIL) goto END_REPEAT;
4203    
4204      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
4205    
4206      else      else
# Line 2792  for (;; ptr++) Line 4209  for (;; ptr++)
4209        goto FAILED;        goto FAILED;
4210        }        }
4211    
4212      /* If the character following a repeat is '+', we wrap the entire repeated      /* If the character following a repeat is '+', or if certain optimization
4213      item inside OP_ONCE brackets. This is just syntactic sugar, taken from      tests above succeeded, possessive_quantifier is TRUE. For some of the
4214      Sun's Java package. The repeated item starts at tempcode, not at previous,      simpler opcodes, there is a special alternative opcode for this. For
4215      which might be the first part of a string whose (former) last char we      anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4216      repeated. However, we don't support '+' after a greediness '?'. */      The '+' notation is just syntactic sugar, taken from Sun's Java package,
4217        but the special opcodes can optimize it a bit. The repeated item starts at
4218        tempcode, not at previous, which might be the first part of a string whose
4219        (former) last char we repeated.
4220    
4221        Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4222        an 'upto' may follow. We skip over an 'exact' item, and then test the
4223        length of what remains before proceeding. */
4224    
4225      if (possessive_quantifier)      if (possessive_quantifier)
4226        {        {
4227        int len = code - tempcode;        int len;
4228        memmove(tempcode + 1+LINK_SIZE, tempcode, len);        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4229        code += 1 + LINK_SIZE;            *tempcode == OP_NOTEXACT)
4230        len += 1 + LINK_SIZE;          tempcode += _pcre_OP_lengths[*tempcode] +
4231        tempcode[0] = OP_ONCE;            ((*tempcode == OP_TYPEEXACT &&
4232        *code++ = OP_KET;               (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4233        PUTINC(code, 0, len);        len = code - tempcode;
4234        PUT(tempcode, 1, len);        if (len > 0) switch (*tempcode)
4235            {
4236            case OP_STAR:  *tempcode = OP_POSSTAR; break;
4237            case OP_PLUS:  *tempcode = OP_POSPLUS; break;
4238            case OP_QUERY: *tempcode = OP_POSQUERY; break;
4239            case OP_UPTO:  *tempcode = OP_POSUPTO; break;
4240    
4241            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
4242            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
4243            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4244            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
4245    
4246            case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
4247            case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
4248            case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4249            case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
4250    
4251            default:
4252            memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4253            code += 1 + LINK_SIZE;
4254            len += 1 + LINK_SIZE;
4255            tempcode[0] = OP_ONCE;
4256            *code++ = OP_KET;
4257            PUTINC(code, 0, len);
4258            PUT(tempcode, 1, len);
4259            break;
4260            }
4261        }        }
4262    
4263      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 2820  for (;; ptr++) Line 4270  for (;; ptr++)
4270      break;      break;
4271    
4272    
4273      /* Start of nested bracket sub-expression, or comment or lookahead or      /* ===================================================================*/
4274      lookbehind or option setting or condition. First deal with special things      /* Start of nested parenthesized sub-expression, or comment or lookahead or
4275      that can come after a bracket; all are introduced by ?, and the appearance      lookbehind or option setting or condition or all the other extended
4276      of any of them means that this is not a referencing group. They were      parenthesis forms.  */
     checked for validity in the first pass over the string, so we don't have to  
     check for syntax errors here.  */  
4277    
4278      case '(':      case CHAR_LEFT_PARENTHESIS:
4279      newoptions = options;      newoptions = options;
4280      skipbytes = 0;      skipbytes = 0;
4281        bravalue = OP_CBRA;
4282        save_hwm = cd->hwm;
4283        reset_bracount = FALSE;
4284    
4285        /* First deal with various "verbs" that can be introduced by '*'. */
4286    
4287        if (*(++ptr) == CHAR_ASTERISK && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4288          {
4289          int i, namelen;
4290          const char *vn = verbnames;
4291          const uschar *name = ++ptr;
4292          previous = NULL;
4293          while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
4294          if (*ptr == CHAR_COLON)
4295            {
4296            *errorcodeptr = ERR59;   /* Not supported */
4297            goto FAILED;
4298            }
4299          if (*ptr != CHAR_RIGHT_PARENTHESIS)
4300            {
4301            *errorcodeptr = ERR60;
4302            goto FAILED;
4303            }
4304          namelen = ptr - name;
4305          for (i = 0; i < verbcount; i++)
4306            {
4307            if (namelen == verbs[i].len &&
4308                strncmp((char *)name, vn, namelen) == 0)
4309              {
4310              *code = verbs[i].op;
4311              if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4312              break;
4313              }
4314            vn += verbs[i].len + 1;
4315            }
4316          if (i < verbcount) continue;
4317          *errorcodeptr = ERR60;
4318          goto FAILED;
4319          }
4320    
4321        /* Deal with the extended parentheses; all are introduced by '?', and the
4322        appearance of any of them means that this is not a capturing group. */
4323    
4324      if (*(++ptr) == '?')      else if (*ptr == CHAR_QUESTION_MARK)
4325        {        {
4326        int set, unset;        int i, set, unset, namelen;
4327        int *optset;        int *optset;
4328          const uschar *name;
4329          uschar *slot;
4330    
4331        switch (*(++ptr))        switch (*(++ptr))
4332          {          {
4333          case '#':                 /* Comment; skip to ket */          case CHAR_NUMBER_SIGN:                 /* Comment; skip to ket */
4334          ptr++;          ptr++;
4335          while (*ptr != ')') ptr++;          while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4336            if (*ptr == 0)
4337              {
4338              *errorcodeptr = ERR18;
4339              goto FAILED;
4340              }
4341          continue;          continue;
4342    
4343          case ':':                 /* Non-extracting bracket */  
4344            /* ------------------------------------------------------------ */
4345            case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
4346            reset_bracount = TRUE;
4347            /* Fall through */
4348    
4349            /* ------------------------------------------------------------ */
4350            case CHAR_COLON:          /* Non-capturing bracket */
4351          bravalue = OP_BRA;          bravalue = OP_BRA;
4352          ptr++;          ptr++;
4353          break;          break;
4354    
4355          case '(':  
4356            /* ------------------------------------------------------------ */
4357            case CHAR_LEFT_PARENTHESIS:
4358          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
4359    
4360          /* Condition to test for recursion */          /* A condition can be an assertion, a number (referring to a numbered
4361            group), a name (referring to a named group), or 'R', referring to
4362            recursion. R<digits> and R&name are also permitted for recursion tests.
4363    
4364            There are several syntaxes for testing a named group: (?(name)) is used
4365            by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4366    
4367            There are two unfortunate ambiguities, caused by history. (a) 'R' can
4368            be the recursive thing or the name 'R' (and similarly for 'R' followed
4369            by digits), and (b) a number could be a name that consists of digits.
4370            In both cases, we look for a name first; if not found, we try the other
4371            cases. */
4372    
4373            /* For conditions that are assertions, check the syntax, and then exit
4374            the switch. This will take control down to where bracketed groups,
4375            including assertions, are processed. */
4376    
4377            if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
4378                ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
4379              break;
4380    
4381            /* Most other conditions use OP_CREF (a couple change to OP_RREF
4382            below), and all need to skip 3 bytes at the start of the group. */
4383    
4384            code[1+LINK_SIZE] = OP_CREF;
4385            skipbytes = 3;
4386            refsign = -1;
4387    
4388            /* Check for a test for recursion in a named group. */
4389    
4390          if (ptr[1] == 'R')          if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
4391            {            {
4392            code[1+LINK_SIZE] = OP_CREF;            terminator = -1;
4393            PUT2(code, 2+LINK_SIZE, CREF_RECURSE);            ptr += 2;
4394            skipbytes = 3;            code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
           ptr += 3;  
4395            }            }
4396    
4397          /* Condition to test for a numbered subpattern match. We know that          /* Check for a test for a named group's having been set, using the Perl
4398          if a digit follows ( then there will just be digits until ) because          syntax (?(<name>) or (?('name') */
4399          the syntax was checked in the first pass. */  
4400            else if (ptr[1] == CHAR_LESS_THAN_SIGN)
4401          else if ((digitab[ptr[1]] && ctype_digit) != 0)            {
4402            {            terminator = CHAR_GREATER_THAN_SIGN;
           int condref;                 /* Don't amalgamate; some compilers */  
           condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */  
           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';  
           if (condref == 0)  
             {  
             *errorcodeptr = ERR35;  
             goto FAILED;  
             }  
4403            ptr++;            ptr++;
           code[1+LINK_SIZE] = OP_CREF;  
           PUT2(code, 2+LINK_SIZE, condref);  
           skipbytes = 3;  
4404            }            }
4405          /* For conditions that are assertions, we just fall through, having          else if (ptr[1] == CHAR_APOSTROPHE)
4406          set bravalue above. */            {
4407          break;            terminator = CHAR_APOSTROPHE;
4408              ptr++;
4409          case '=':                 /* Positive lookahead */            }
4410