/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 91 by nigel, Sat Feb 24 21:41:34 2007 UTC | revision 341 by ph10, Sat Apr 19 16:41:04 2008 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2006 University of Cambridge             Copyright (c) 1997-2008 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45  #define NLBLOCK cd            /* The block containing newline information */  #ifdef HAVE_CONFIG_H
46    #include "config.h"
47    #endif
48    
49    #define NLBLOCK cd             /* Block containing newline information */
50    #define PSSTART start_pattern  /* Field containing processed string start */
51    #define PSEND   end_pattern    /* Field containing processed string end */
52    
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
# Line 54  used by pcretest. DEBUG is not defined w Line 61  used by pcretest. DEBUG is not defined w
61  #endif  #endif
62    
63    
64    /* Macro for setting individual bits in class bitmaps. */
65    
66    #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
67    
68    /* Maximum length value to check against when making sure that the integer that
69    holds the compiled pattern length does not overflow. We make it a bit less than
70    INT_MAX to allow for adding in group terminating bytes, so that we don't have
71    to check them every time. */
72    
73    #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76  /*************************************************  /*************************************************
77  *      Code parameters and static tables         *  *      Code parameters and static tables         *
78  *************************************************/  *************************************************/
79    
80  /* Maximum number of items on the nested bracket stacks at compile time. This  /* This value specifies the size of stack workspace that is used during the
81  applies to the nesting of all kinds of parentheses. It does not limit  first pre-compile phase that determines how much memory is required. The regex
82  un-nested, non-capturing parentheses. This number can be made bigger if  is partly compiled into this space, but the compiled parts are discarded as
83  necessary - it is used to dimension one int and one unsigned char vector at  soon as they can be, so that hopefully there will never be an overrun. The code
84  compile time. */  does, however, check for an overrun. The largest amount I've seen used is 218,
85    so this number is very generous.
86    
87    The same workspace is used during the second, actual compile phase for
88    remembering forward references to groups so that they can be filled in at the
89    end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90    is 4 there is plenty of room. */
91    
92  #define BRASTACK_SIZE 200  #define COMPILE_WORK_SIZE (4096)
93    
94    
95  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 73  are simple data values; negative values Line 97  are simple data values; negative values
97  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
98  is invalid. */  is invalid. */
99    
100  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
101  static const short int escapes[] = {  static const short int escapes[] = {
102       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
103       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
105       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */  -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
106  -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
107  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
108     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
109       0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */  -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
110  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
111       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
112  };  };
113    
114  #else         /* This is the "abnormal" table for EBCDIC systems */  #else           /* This is the "abnormal" table for EBCDIC systems */
115  static const short int escapes[] = {  static const short int escapes[] = {
116  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
117  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 97  static const short int escapes[] = { Line 121  static const short int escapes[] = {
121  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
122  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
123  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
124  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
125  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
126  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
127  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
128  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
129  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
130  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
131  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
132  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
133  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
134  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
135  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
136  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
137  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
138  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
# Line 116  static const short int escapes[] = { Line 140  static const short int escapes[] = {
140  #endif  #endif
141    
142    
143  /* Tables of names of POSIX character classes and their lengths. The list is  /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144  terminated by a zero length entry. The first three must be alpha, lower, upper,  searched linearly. Put all the names into a single string, in order to reduce
145  as this is assumed for handling case independence. */  the number of relocations when a shared library is dynamically linked. */
146    
147  static const char *const posix_names[] = {  typedef struct verbitem {
148    "alpha", "lower", "upper",    int   len;
149    "alnum", "ascii", "blank", "cntrl", "digit", "graph",    int   op;
150    "print", "punct", "space", "word",  "xdigit" };  } verbitem;
151    
152    static const char verbnames[] =
153      "ACCEPT\0"
154      "COMMIT\0"
155      "F\0"
156      "FAIL\0"
157      "PRUNE\0"
158      "SKIP\0"
159      "THEN";
160    
161    static const verbitem verbs[] = {
162      { 6, OP_ACCEPT },
163      { 6, OP_COMMIT },
164      { 1, OP_FAIL },
165      { 4, OP_FAIL },
166      { 5, OP_PRUNE },
167      { 4, OP_SKIP  },
168      { 4, OP_THEN  }
169    };
170    
171    static const int verbcount = sizeof(verbs)/sizeof(verbitem);
172    
173    
174    /* Tables of names of POSIX character classes and their lengths. The names are
175    now all in a single string, to reduce the number of relocations when a shared
176    library is dynamically loaded. The list of lengths is terminated by a zero
177    length entry. The first three must be alpha, lower, upper, as this is assumed
178    for handling case independence. */
179    
180    static const char posix_names[] =
181      "alpha\0"  "lower\0"  "upper\0"  "alnum\0"  "ascii\0"  "blank\0"
182      "cntrl\0"  "digit\0"  "graph\0"  "print\0"  "punct\0"  "space\0"
183      "word\0"   "xdigit";
184    
185  static const uschar posix_name_lengths[] = {  static const uschar posix_name_lengths[] = {
186    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
# Line 156  static const int posix_class_maps[] = { Line 213  static const int posix_class_maps[] = {
213  };  };
214    
215    
216  /* The texts of compile-time error messages. These are "char *" because they  #define STRING(a)  # a
217  are passed to the outside world. */  #define XSTRING(s) STRING(s)
218    
219  static const char *error_texts[] = {  /* The texts of compile-time error messages. These are "char *" because they
220    "no error",  are passed to the outside world. Do not ever re-use any error number, because
221    "\\ at end of pattern",  they are documented. Always add a new error instead. Messages marked DEAD below
222    "\\c at end of pattern",  are no longer used. This used to be a table of strings, but in order to reduce
223    "unrecognized character follows \\",  the number of relocations needed when a shared library is loaded dynamically,
224    "numbers out of order in {} quantifier",  it is now one long string. We cannot use a table of offsets, because the
225    lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
226    simply count through to the one we want - this isn't a performance issue
227    because these strings are used only when there is a compilation error. */
228    
229    static const char error_texts[] =
230      "no error\0"
231      "\\ at end of pattern\0"
232      "\\c at end of pattern\0"
233      "unrecognized character follows \\\0"
234      "numbers out of order in {} quantifier\0"
235    /* 5 */    /* 5 */
236    "number too big in {} quantifier",    "number too big in {} quantifier\0"
237    "missing terminating ] for character class",    "missing terminating ] for character class\0"
238    "invalid escape sequence in character class",    "invalid escape sequence in character class\0"
239    "range out of order in character class",    "range out of order in character class\0"
240    "nothing to repeat",    "nothing to repeat\0"
241    /* 10 */    /* 10 */
242    "operand of unlimited repeat could match the empty string",    "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
243    "internal error: unexpected repeat",    "internal error: unexpected repeat\0"
244    "unrecognized character after (?",    "unrecognized character after (? or (?-\0"
245    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class\0"
246    "missing )",    "missing )\0"
247    /* 15 */    /* 15 */
248    "reference to non-existent subpattern",    "reference to non-existent subpattern\0"
249    "erroffset passed as NULL",    "erroffset passed as NULL\0"
250    "unknown option bit(s) set",    "unknown option bit(s) set\0"
251    "missing ) after comment",    "missing ) after comment\0"
252    "parentheses nested too deeply",    "parentheses nested too deeply\0"  /** DEAD **/
253    /* 20 */    /* 20 */
254    "regular expression too large",    "regular expression is too large\0"
255    "failed to get memory",    "failed to get memory\0"
256    "unmatched parentheses",    "unmatched parentheses\0"
257    "internal error: code overflow",    "internal error: code overflow\0"
258    "unrecognized character after (?<",    "unrecognized character after (?<\0"
259    /* 25 */    /* 25 */
260    "lookbehind assertion is not fixed length",    "lookbehind assertion is not fixed length\0"
261    "malformed number or name after (?(",    "malformed number or name after (?(\0"
262    "conditional group contains more than two branches",    "conditional group contains more than two branches\0"
263    "assertion expected after (?(",    "assertion expected after (?(\0"
264    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )\0"
265    /* 30 */    /* 30 */
266    "unknown POSIX class name",    "unknown POSIX class name\0"
267    "POSIX collating elements are not supported",    "POSIX collating elements are not supported\0"
268    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support\0"
269    "spare error",    "spare error\0"  /** DEAD **/
270    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large\0"
271    /* 35 */    /* 35 */
272    "invalid condition (?(0)",    "invalid condition (?(0)\0"
273    "\\C not allowed in lookbehind assertion",    "\\C not allowed in lookbehind assertion\0"
274    "PCRE does not support \\L, \\l, \\N, \\U, or \\u",    "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
275    "number after (?C is > 255",    "number after (?C is > 255\0"
276    "closing ) for (?C expected",    "closing ) for (?C expected\0"
277    /* 40 */    /* 40 */
278    "recursive call could loop indefinitely",    "recursive call could loop indefinitely\0"
279    "unrecognized character after (?P",    "unrecognized character after (?P\0"
280    "syntax error after (?P",    "syntax error in subpattern name (missing terminator)\0"
281    "two named subpatterns have the same name",    "two named subpatterns have the same name\0"
282    "invalid UTF-8 string",    "invalid UTF-8 string\0"
283    /* 45 */    /* 45 */
284    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled\0"
285    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence\0"
286    "unknown property name after \\P or \\p",    "unknown property name after \\P or \\p\0"
287    "subpattern name is too long (maximum 32 characters)",    "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
288    "too many named subpatterns (maximum 10,000)",    "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
289    /* 50 */    /* 50 */
290    "repeated subpattern is too long",    "repeated subpattern is too long\0"    /** DEAD **/
291    "octal value is greater than \\377 (not in UTF-8 mode)"    "octal value is greater than \\377 (not in UTF-8 mode)\0"
292  };    "internal error: overran compiling workspace\0"
293      "internal error: previously-checked referenced subpattern not found\0"
294      "DEFINE group contains more than one branch\0"
295      /* 55 */
296      "repeating a DEFINE group is not allowed\0"
297      "inconsistent NEWLINE options\0"
298      "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
299      "a numbered reference must not be zero\0"
300      "(*VERB) with an argument is not supported\0"
301      /* 60 */
302      "(*VERB) not recognized\0"
303      "number is too big\0"
304      "subpattern name expected\0"
305      "digit expected after (?+\0"
306      "] is an invalid data character in JavaScript compatibility mode";
307    
308    
309  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 241  For convenience, we use the same bit def Line 322  For convenience, we use the same bit def
322    
323  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
324    
325  #if !EBCDIC    /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
326  static const unsigned char digitab[] =  static const unsigned char digitab[] =
327    {    {
328    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 277  static const unsigned char digitab[] = Line 358  static const unsigned char digitab[] =
358    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
359    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
360    
361  #else          /* This is the "abnormal" case, for EBCDIC systems */  #else           /* This is the "abnormal" case, for EBCDIC systems */
362  static const unsigned char digitab[] =  static const unsigned char digitab[] =
363    {    {
364    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 291  static const unsigned char digitab[] = Line 372  static const unsigned char digitab[] =
372    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
373    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
374    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
375    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88-     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
376    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
377    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
378    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
# Line 325  static const unsigned char ebcdic_charta Line 406  static const unsigned char ebcdic_charta
406    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
407    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
408    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
409    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88-  */    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
410    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
411    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
412    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
# Line 352  static const unsigned char ebcdic_charta Line 433  static const unsigned char ebcdic_charta
433  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
434    
435  static BOOL  static BOOL
436    compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
437      int *, int *, branch_chain *, compile_data *);      int *, int *, branch_chain *, compile_data *, int *);
438    
439    
440    
441    /*************************************************
442    *            Find an error text                  *
443    *************************************************/
444    
445    /* The error texts are now all in one long string, to save on relocations. As
446    some of the text is of unknown length, we can't use a table of offsets.
447    Instead, just count through the strings. This is not a performance issue
448    because it happens only when there has been a compilation error.
449    
450    Argument:   the error number
451    Returns:    pointer to the error string
452    */
453    
454    static const char *
455    find_error_text(int n)
456    {
457    const char *s = error_texts;
458    for (; n > 0; n--) while (*s++ != 0);
459    return s;
460    }
461    
462    
463  /*************************************************  /*************************************************
# Line 363  static BOOL Line 466  static BOOL
466    
467  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
468  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
469  encodes one of the more complicated things such as \d. When UTF-8 is enabled,  encodes one of the more complicated things such as \d. A backreference to group
470  a positive value greater than 255 may be returned. On entry, ptr is pointing at  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
471  the \. On exit, it is on the final character of the escape sequence.  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
472    ptr is pointing at the \. On exit, it is on the final character of the escape
473    sequence.
474    
475  Arguments:  Arguments:
476    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
# Line 376  Arguments: Line 481  Arguments:
481    
482  Returns:         zero or positive => a data character  Returns:         zero or positive => a data character
483                   negative => a special escape sequence                   negative => a special escape sequence
484                   on error, errorptr is set                   on error, errorcodeptr is set
485  */  */
486    
487  static int  static int
# Line 394  ptr--;                            /* Set Line 499  ptr--;                            /* Set
499    
500  if (c == 0) *errorcodeptr = ERR1;  if (c == 0) *errorcodeptr = ERR1;
501    
502  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in  /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
503  a table. A non-zero result is something that can be returned immediately.  in a table. A non-zero result is something that can be returned immediately.
504  Otherwise further processing may be required. */  Otherwise further processing may be required. */
505    
506  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
507  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < '0' || c > 'z') {}                           /* Not alphanumeric */
508  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - '0']) != 0) c = i;
509    
510  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
511  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
512  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
513  #endif  #endif
514    
# Line 412  else if ((i = escapes[c - 0x48]) != 0) Line 517  else if ((i = escapes[c - 0x48]) != 0)
517  else  else
518    {    {
519    const uschar *oldptr;    const uschar *oldptr;
520      BOOL braced, negated;
521    
522    switch (c)    switch (c)
523      {      {
524      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
# Line 425  else Line 532  else
532      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
533      break;      break;
534    
535        /* \g must be followed by one of a number of specific things:
536    
537        (1) A number, either plain or braced. If positive, it is an absolute
538        backreference. If negative, it is a relative backreference. This is a Perl
539        5.10 feature.
540    
541        (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
542        is part of Perl's movement towards a unified syntax for back references. As
543        this is synonymous with \k{name}, we fudge it up by pretending it really
544        was \k.
545    
546        (3) For Oniguruma compatibility we also support \g followed by a name or a
547        number either in angle brackets or in single quotes. However, these are
548        (possibly recursive) subroutine calls, _not_ backreferences. Just return
549        the -ESC_g code (cf \k). */
550    
551        case 'g':
552        if (ptr[1] == '<' || ptr[1] == '\'')
553          {
554          c = -ESC_g;
555          break;
556          }
557    
558        /* Handle the Perl-compatible cases */
559    
560        if (ptr[1] == '{')
561          {
562          const uschar *p;
563          for (p = ptr+2; *p != 0 && *p != '}'; p++)
564            if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
565          if (*p != 0 && *p != '}')
566            {
567            c = -ESC_k;
568            break;
569            }
570          braced = TRUE;
571          ptr++;
572          }
573        else braced = FALSE;
574    
575        if (ptr[1] == '-')
576          {
577          negated = TRUE;
578          ptr++;
579          }
580        else negated = FALSE;
581    
582        c = 0;
583        while ((digitab[ptr[1]] & ctype_digit) != 0)
584          c = c * 10 + *(++ptr) - '0';
585    
586        if (c < 0)   /* Integer overflow */
587          {
588          *errorcodeptr = ERR61;
589          break;
590          }
591    
592        if (braced && *(++ptr) != '}')
593          {
594          *errorcodeptr = ERR57;
595          break;
596          }
597    
598        if (c == 0)
599          {
600          *errorcodeptr = ERR58;
601          break;
602          }
603    
604        if (negated)
605          {
606          if (c > bracount)
607            {
608            *errorcodeptr = ERR15;
609            break;
610            }
611          c = bracount - (c - 1);
612          }
613    
614        c = -(ESC_REF + c);
615        break;
616    
617      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
618      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
619      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 446  else Line 635  else
635        c -= '0';        c -= '0';
636        while ((digitab[ptr[1]] & ctype_digit) != 0)        while ((digitab[ptr[1]] & ctype_digit) != 0)
637          c = c * 10 + *(++ptr) - '0';          c = c * 10 + *(++ptr) - '0';
638          if (c < 0)    /* Integer overflow */
639            {
640            *errorcodeptr = ERR61;
641            break;
642            }
643        if (c < 10 || c <= bracount)        if (c < 10 || c <= bracount)
644          {          {
645          c = -(ESC_REF + c);          c = -(ESC_REF + c);
# Line 495  else Line 689  else
689          if (c == 0 && cc == '0') continue;     /* Leading zeroes */          if (c == 0 && cc == '0') continue;     /* Leading zeroes */
690          count++;          count++;
691    
692  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
693          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= 'a') cc -= 32;               /* Convert to upper case */
694          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
695  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
696          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
697          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
698  #endif  #endif
# Line 522  else Line 716  else
716        {        {
717        int cc;                               /* Some compilers don't like ++ */        int cc;                               /* Some compilers don't like ++ */
718        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                        /* in initializers */
719  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
720        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= 'a') cc -= 32;              /* Convert to upper case */
721        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
722  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
723        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= 'z') cc += 64;              /* Convert to upper case */
724        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
725  #endif  #endif
726        }        }
727      break;      break;
728    
729      /* Other special escapes not starting with a digit are straightforward */      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
730        This coding is ASCII-specific, but then the whole concept of \cx is
731        ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
732    
733      case 'c':      case 'c':
734      c = *(++ptr);      c = *(++ptr);
735      if (c == 0)      if (c == 0)
736        {        {
737        *errorcodeptr = ERR2;        *errorcodeptr = ERR2;
738        return 0;        break;
739        }        }
740    
741      /* A letter is upper-cased; then the 0x40 bit is flipped. This coding  #ifndef EBCDIC  /* ASCII coding */
     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.  
     (However, an EBCDIC equivalent has now been added.) */  
   
 #if !EBCDIC    /* ASCII coding */  
742      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= 'a' && c <= 'z') c -= 32;
743      c ^= 0x40;      c ^= 0x40;
744  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
745      if (c >= 'a' && c <= 'z') c += 64;      if (c >= 'a' && c <= 'z') c += 64;
746      c ^= 0xC0;      c ^= 0xC0;
747  #endif  #endif
748      break;      break;
749    
750      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
751      other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,      other alphanumeric following \ is an error if PCRE_EXTRA was set;
752      for Perl compatibility, it is a literal. This code looks a bit odd, but      otherwise, for Perl compatibility, it is a literal. This code looks a bit
753      there used to be some cases other than the default, and there may be again      odd, but there used to be some cases other than the default, and there may
754      in future, so I haven't "optimized" it. */      be again in future, so I haven't "optimized" it. */
755    
756      default:      default:
757      if ((options & PCRE_EXTRA) != 0) switch(c)      if ((options & PCRE_EXTRA) != 0) switch(c)
# Line 619  if (c == '{') Line 811  if (c == '{')
811      *negptr = TRUE;      *negptr = TRUE;
812      ptr++;      ptr++;
813      }      }
814    for (i = 0; i < sizeof(name) - 1; i++)    for (i = 0; i < (int)sizeof(name) - 1; i++)
815      {      {
816      c = *(++ptr);      c = *(++ptr);
817      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
# Line 648  top = _pcre_utt_size; Line 840  top = _pcre_utt_size;
840  while (bot < top)  while (bot < top)
841    {    {
842    i = (bot + top) >> 1;    i = (bot + top) >> 1;
843    c = strcmp(name, _pcre_utt[i].name);    c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
844    if (c == 0)    if (c == 0)
845      {      {
846      *dptr = _pcre_utt[i].value;      *dptr = _pcre_utt[i].value;
# Line 772  return p; Line 964  return p;
964    
965    
966  /*************************************************  /*************************************************
967  *     Find forward referenced named subpattern   *  *       Find forward referenced subpattern       *
968  *************************************************/  *************************************************/
969    
970  /* This function scans along a pattern looking for capturing subpatterns, and  /* This function scans along a pattern's text looking for capturing
971  counting them. If it finds a named pattern that matches the name it is given,  subpatterns, and counting them. If it finds a named pattern that matches the
972  it returns its number. This is used for forward references to named  name it is given, it returns its number. Alternatively, if the name is NULL, it
973  subpatterns. We know that if (?P< is encountered, the name will be terminated  returns when it reaches a given numbered subpattern. This is used for forward
974  by '>' because that is checked in the first pass.  references to subpatterns. We know that if (?P< is encountered, the name will
975    be terminated by '>' because that is checked in the first pass.
976    
977  Arguments:  Arguments:
978    pointer      current position in the pattern    ptr          current position in the pattern
979    count        current count of capturing parens    cd           compile background data
980    name         name to seek    name         name to seek, or NULL if seeking a numbered subpattern
981    namelen      name length    lorn         name length, or subpattern number if name is NULL
982      xmode        TRUE if we are in /x mode
983    
984  Returns:       the number of the named subpattern, or -1 if not found  Returns:       the number of the named subpattern, or -1 if not found
985  */  */
986    
987  static int  static int
988  find_named_parens(const uschar *ptr, int count, const uschar *name, int namelen)  find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn,
989      BOOL xmode)
990  {  {
991  const uschar *thisname;  const uschar *thisname;
992    int count = cd->bracount;
993    
994  for (; *ptr != 0; ptr++)  for (; *ptr != 0; ptr++)
995    {    {
996    if (*ptr == '\\' && ptr[1] != 0) { ptr++; continue; }    int term;
997    
998      /* Skip over backslashed characters and also entire \Q...\E */
999    
1000      if (*ptr == '\\')
1001        {
1002        if (*(++ptr) == 0) return -1;
1003        if (*ptr == 'Q') for (;;)
1004          {
1005          while (*(++ptr) != 0 && *ptr != '\\');
1006          if (*ptr == 0) return -1;
1007          if (*(++ptr) == 'E') break;
1008          }
1009        continue;
1010        }
1011    
1012      /* Skip over character classes; this logic must be similar to the way they
1013      are handled for real. If the first character is '^', skip it. Also, if the
1014      first few characters (either before or after ^) are \Q\E or \E we skip them
1015      too. This makes for compatibility with Perl. */
1016    
1017      if (*ptr == '[')
1018        {
1019        BOOL negate_class = FALSE;
1020        for (;;)
1021          {
1022          int c = *(++ptr);
1023          if (c == '\\')
1024            {
1025            if (ptr[1] == 'E') ptr++;
1026              else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
1027                else break;
1028            }
1029          else if (!negate_class && c == '^')
1030            negate_class = TRUE;
1031          else break;
1032          }
1033    
1034        /* If the next character is ']', it is a data character that must be
1035        skipped, except in JavaScript compatibility mode. */
1036    
1037        if (ptr[1] == ']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1038          ptr++;
1039    
1040        while (*(++ptr) != ']')
1041          {
1042          if (*ptr == 0) return -1;
1043          if (*ptr == '\\')
1044            {
1045            if (*(++ptr) == 0) return -1;
1046            if (*ptr == 'Q') for (;;)
1047              {
1048              while (*(++ptr) != 0 && *ptr != '\\');
1049              if (*ptr == 0) return -1;
1050              if (*(++ptr) == 'E') break;
1051              }
1052            continue;
1053            }
1054          }
1055        continue;
1056        }
1057    
1058      /* Skip comments in /x mode */
1059    
1060      if (xmode && *ptr == '#')
1061        {
1062        while (*(++ptr) != 0 && *ptr != '\n');
1063        if (*ptr == 0) return -1;
1064        continue;
1065        }
1066    
1067      /* An opening parens must now be a real metacharacter */
1068    
1069    if (*ptr != '(') continue;    if (*ptr != '(') continue;
1070    if (ptr[1] != '?') { count++; continue; }    if (ptr[1] != '?' && ptr[1] != '*')
1071    if (ptr[2] == '(') { ptr += 2; continue; }      {
1072    if (ptr[2] != 'P' || ptr[3] != '<') continue;      count++;
1073        if (name == NULL && count == lorn) return count;
1074        continue;
1075        }
1076    
1077      ptr += 2;
1078      if (*ptr == 'P') ptr++;                      /* Allow optional P */
1079    
1080      /* We have to disambiguate (?<! and (?<= from (?<name> */
1081    
1082      if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1083           *ptr != '\'')
1084        continue;
1085    
1086    count++;    count++;
1087    ptr += 4;  
1088      if (name == NULL && count == lorn) return count;
1089      term = *ptr++;
1090      if (term == '<') term = '>';
1091    thisname = ptr;    thisname = ptr;
1092    while (*ptr != '>') ptr++;    while (*ptr != term) ptr++;
1093    if (namelen == ptr - thisname && strncmp(name, thisname, namelen) == 0)    if (name != NULL && lorn == ptr - thisname &&
1094          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1095      return count;      return count;
1096    }    }
1097    
1098  return -1;  return -1;
1099  }  }
1100    
# Line 862  for (;;) Line 1149  for (;;)
1149    
1150      case OP_CALLOUT:      case OP_CALLOUT:
1151      case OP_CREF:      case OP_CREF:
1152      case OP_BRANUMBER:      case OP_RREF:
1153        case OP_DEF:
1154      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1155      break;      break;
1156    
# Line 907  for (;;) Line 1195  for (;;)
1195    {    {
1196    int d;    int d;
1197    register int op = *cc;    register int op = *cc;
   if (op >= OP_BRA) op = OP_BRA;  
   
1198    switch (op)    switch (op)
1199      {      {
1200        case OP_CBRA:
1201      case OP_BRA:      case OP_BRA:
1202      case OP_ONCE:      case OP_ONCE:
1203      case OP_COND:      case OP_COND:
1204      d = find_fixedlength(cc, options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1205      if (d < 0) return d;      if (d < 0) return d;
1206      branchlength += d;      branchlength += d;
1207      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 949  for (;;) Line 1236  for (;;)
1236      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1237    
1238      case OP_REVERSE:      case OP_REVERSE:
     case OP_BRANUMBER:  
1239      case OP_CREF:      case OP_CREF:
1240        case OP_RREF:
1241        case OP_DEF:
1242      case OP_OPT:      case OP_OPT:
1243      case OP_CALLOUT:      case OP_CALLOUT:
1244      case OP_SOD:      case OP_SOD:
# Line 995  for (;;) Line 1283  for (;;)
1283    
1284      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1285      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1286        if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1287      cc += 4;      cc += 4;
1288      break;      break;
1289    
# Line 1094  for (;;) Line 1383  for (;;)
1383    
1384    if (c == OP_XCLASS) code += GET(code, 1);    if (c == OP_XCLASS) code += GET(code, 1);
1385    
1386    /* Handle bracketed group */    /* Handle capturing bracket */
1387    
1388    else if (c > OP_BRA)    else if (c == OP_CBRA)
1389      {      {
1390      int n = c - OP_BRA;      int n = GET2(code, 1+LINK_SIZE);
     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);  
1391      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1392      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[c];
1393      }      }
1394    
1395    /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes    /* Otherwise, we can get the item's length from the table, except that for
1396    that are followed by a character may be followed by a multi-byte character.    repeated character types, we have to test for \p and \P, which have an extra
1397    The length in the table is a minimum, so we have to scan along to skip the    two bytes of parameters. */
   extra bytes. All opcodes are less than 128, so we can use relatively  
   efficient code. */  
1398    
1399    else    else
1400      {      {
1401        switch(c)
1402          {
1403          case OP_TYPESTAR:
1404          case OP_TYPEMINSTAR:
1405          case OP_TYPEPLUS:
1406          case OP_TYPEMINPLUS:
1407          case OP_TYPEQUERY:
1408          case OP_TYPEMINQUERY:
1409          case OP_TYPEPOSSTAR:
1410          case OP_TYPEPOSPLUS:
1411          case OP_TYPEPOSQUERY:
1412          if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1413          break;
1414    
1415          case OP_TYPEUPTO:
1416          case OP_TYPEMINUPTO:
1417          case OP_TYPEEXACT:
1418          case OP_TYPEPOSUPTO:
1419          if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1420          break;
1421          }
1422    
1423        /* Add in the fixed length from the table */
1424    
1425      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
1426    
1427      /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1428      a multi-byte character. The length in the table is a minimum, so we have to
1429      arrange to skip the extra bytes. */
1430    
1431    #ifdef SUPPORT_UTF8
1432      if (utf8) switch(c)      if (utf8) switch(c)
1433        {        {
1434        case OP_CHAR:        case OP_CHAR:
# Line 1120  for (;;) Line 1436  for (;;)
1436        case OP_EXACT:        case OP_EXACT:
1437        case OP_UPTO:        case OP_UPTO:
1438        case OP_MINUPTO:        case OP_MINUPTO:
1439          case OP_POSUPTO:
1440        case OP_STAR:        case OP_STAR:
1441        case OP_MINSTAR:        case OP_MINSTAR:
1442          case OP_POSSTAR:
1443        case OP_PLUS:        case OP_PLUS:
1444        case OP_MINPLUS:        case OP_MINPLUS:
1445          case OP_POSPLUS:
1446        case OP_QUERY:        case OP_QUERY:
1447        case OP_MINQUERY:        case OP_MINQUERY:
1448        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1449          if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1450        break;        break;
1451        }        }
1452    #endif
1453      }      }
1454    }    }
1455  }  }
# Line 1164  for (;;) Line 1485  for (;;)
1485    
1486    if (c == OP_XCLASS) code += GET(code, 1);    if (c == OP_XCLASS) code += GET(code, 1);
1487    
1488    /* All bracketed groups have the same length. */    /* Otherwise, we can get the item's length from the table, except that for
1489      repeated character types, we have to test for \p and \P, which have an extra
1490      two bytes of parameters. */
1491    
1492    else if (c > OP_BRA)    else
1493      {      {
1494      code += _pcre_OP_lengths[OP_BRA];      switch(c)
1495      }        {
1496          case OP_TYPESTAR:
1497          case OP_TYPEMINSTAR:
1498          case OP_TYPEPLUS:
1499          case OP_TYPEMINPLUS:
1500          case OP_TYPEQUERY:
1501          case OP_TYPEMINQUERY:
1502          case OP_TYPEPOSSTAR:
1503          case OP_TYPEPOSPLUS:
1504          case OP_TYPEPOSQUERY:
1505          if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1506          break;
1507    
1508    /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes        case OP_TYPEPOSUPTO:
1509    that are followed by a character may be followed by a multi-byte character.        case OP_TYPEUPTO:
1510    The length in the table is a minimum, so we have to scan along to skip the        case OP_TYPEMINUPTO:
1511    extra bytes. All opcodes are less than 128, so we can use relatively        case OP_TYPEEXACT:
1512    efficient code. */        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1513          break;
1514          }
1515    
1516        /* Add in the fixed length from the table */
1517    
   else  
     {  
1518      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
1519    
1520        /* In UTF-8 mode, opcodes that are followed by a character may be followed
1521        by a multi-byte character. The length in the table is a minimum, so we have
1522        to arrange to skip the extra bytes. */
1523    
1524    #ifdef SUPPORT_UTF8
1525      if (utf8) switch(c)      if (utf8) switch(c)
1526        {        {
1527        case OP_CHAR:        case OP_CHAR:
# Line 1187  for (;;) Line 1529  for (;;)
1529        case OP_EXACT:        case OP_EXACT:
1530        case OP_UPTO:        case OP_UPTO:
1531        case OP_MINUPTO:        case OP_MINUPTO:
1532          case OP_POSUPTO:
1533        case OP_STAR:        case OP_STAR:
1534        case OP_MINSTAR:        case OP_MINSTAR:
1535          case OP_POSSTAR:
1536        case OP_PLUS:        case OP_PLUS:
1537        case OP_MINPLUS:        case OP_MINPLUS:
1538          case OP_POSPLUS:
1539        case OP_QUERY:        case OP_QUERY:
1540        case OP_MINQUERY:        case OP_MINQUERY:
1541        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1542          if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1543        break;        break;
1544        }        }
1545    #endif
1546      }      }
1547    }    }
1548  }  }
# Line 1207  for (;;) Line 1554  for (;;)
1554  *************************************************/  *************************************************/
1555    
1556  /* This function scans through a branch of a compiled pattern to see whether it  /* This function scans through a branch of a compiled pattern to see whether it
1557  can match the empty string or not. It is called only from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1558  below. Note that first_significant_code() skips over assertions. If we hit an  below and from compile_branch() when checking for an unlimited repeat of a
1559  unclosed bracket, we return "empty" - this means we've struck an inner bracket  group that can match nothing. Note that first_significant_code() skips over
1560  whose current branch will already have been scanned.  backward and negative forward assertions when its final argument is TRUE. If we
1561    hit an unclosed bracket, we return "empty" - this means we've struck an inner
1562    bracket whose current branch will already have been scanned.
1563    
1564  Arguments:  Arguments:
1565    code        points to start of search    code        points to start of search
# Line 1224  static BOOL Line 1573  static BOOL
1573  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1574  {  {
1575  register int c;  register int c;
1576  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1577       code < endcode;       code < endcode;
1578       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1579    {    {
# Line 1232  for (code = first_significant_code(code Line 1581  for (code = first_significant_code(code
1581    
1582    c = *code;    c = *code;
1583    
1584    if (c >= OP_BRA)    /* Skip over forward assertions; the other assertions are skipped by
1585      first_significant_code() with a TRUE final argument. */
1586    
1587      if (c == OP_ASSERT)
1588        {
1589        do code += GET(code, 1); while (*code == OP_ALT);
1590        c = *code;
1591        continue;
1592        }
1593    
1594      /* Groups with zero repeats can of course be empty; skip them. */
1595    
1596      if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1597        {
1598        code += _pcre_OP_lengths[c];
1599        do code += GET(code, 1); while (*code == OP_ALT);
1600        c = *code;
1601        continue;
1602        }
1603    
1604      /* For other groups, scan the branches. */
1605    
1606      if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1607      {      {
1608      BOOL empty_branch;      BOOL empty_branch;
1609      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1248  for (code = first_significant_code(code Line 1619  for (code = first_significant_code(code
1619        }        }
1620      while (*code == OP_ALT);      while (*code == OP_ALT);
1621      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
     code += 1 + LINK_SIZE;  
1622      c = *code;      c = *code;
1623        continue;
1624      }      }
1625    
1626    else switch (c)    /* Handle the other opcodes */
1627    
1628      switch (c)
1629      {      {
1630      /* Check for quantifiers after a class */      /* Check for quantifiers after a class. XCLASS is used for classes that
1631        cannot be represented just by a bit map. This includes negated single
1632        high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1633        actual length is stored in the compiled code, so we must update "code"
1634        here. */
1635    
1636  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1637      case OP_XCLASS:      case OP_XCLASS:
1638      ccode = code + GET(code, 1);      ccode = code += GET(code, 1);
1639      goto CHECK_CLASS_REPEAT;      goto CHECK_CLASS_REPEAT;
1640  #endif  #endif
1641    
# Line 1308  for (code = first_significant_code(code Line 1685  for (code = first_significant_code(code
1685      case OP_NOT:      case OP_NOT:
1686      case OP_PLUS:      case OP_PLUS:
1687      case OP_MINPLUS:      case OP_MINPLUS:
1688        case OP_POSPLUS:
1689      case OP_EXACT:      case OP_EXACT:
1690      case OP_NOTPLUS:      case OP_NOTPLUS:
1691      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
1692        case OP_NOTPOSPLUS:
1693      case OP_NOTEXACT:      case OP_NOTEXACT:
1694      case OP_TYPEPLUS:      case OP_TYPEPLUS:
1695      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
1696        case OP_TYPEPOSPLUS:
1697      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1698      return FALSE;      return FALSE;
1699    
1700        /* These are going to continue, as they may be empty, but we have to
1701        fudge the length for the \p and \P cases. */
1702    
1703        case OP_TYPESTAR:
1704        case OP_TYPEMINSTAR:
1705        case OP_TYPEPOSSTAR:
1706        case OP_TYPEQUERY:
1707        case OP_TYPEMINQUERY:
1708        case OP_TYPEPOSQUERY:
1709        if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1710        break;
1711    
1712        /* Same for these */
1713    
1714        case OP_TYPEUPTO:
1715        case OP_TYPEMINUPTO:
1716        case OP_TYPEPOSUPTO:
1717        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1718        break;
1719    
1720      /* End of branch */      /* End of branch */
1721    
1722      case OP_KET:      case OP_KET:
# Line 1325  for (code = first_significant_code(code Line 1725  for (code = first_significant_code(code
1725      case OP_ALT:      case OP_ALT:
1726      return TRUE;      return TRUE;
1727    
1728      /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1729      followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
1730    
1731  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1732      case OP_STAR:      case OP_STAR:
1733      case OP_MINSTAR:      case OP_MINSTAR:
1734        case OP_POSSTAR:
1735      case OP_QUERY:      case OP_QUERY:
1736      case OP_MINQUERY:      case OP_MINQUERY:
1737        case OP_POSQUERY:
1738      case OP_UPTO:      case OP_UPTO:
1739      case OP_MINUPTO:      case OP_MINUPTO:
1740        case OP_POSUPTO:
1741      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1742      break;      break;
1743  #endif  #endif
# Line 1383  return TRUE; Line 1786  return TRUE;
1786  *************************************************/  *************************************************/
1787    
1788  /* This function is called when the sequence "[:" or "[." or "[=" is  /* This function is called when the sequence "[:" or "[." or "[=" is
1789  encountered in a character class. It checks whether this is followed by an  encountered in a character class. It checks whether this is followed by a
1790  optional ^ and then a sequence of letters, terminated by a matching ":]" or  sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1791  ".]" or "=]".  reach an unescaped ']' without the special preceding character, return FALSE.
1792    
1793    Originally, this function only recognized a sequence of letters between the
1794    terminators, but it seems that Perl recognizes any sequence of characters,
1795    though of course unknown POSIX names are subsequently rejected. Perl gives an
1796    "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1797    didn't consider this to be a POSIX class. Likewise for [:1234:].
1798    
1799    The problem in trying to be exactly like Perl is in the handling of escapes. We
1800    have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1801    class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1802    below handles the special case of \], but does not try to do any other escape
1803    processing. This makes it different from Perl for cases such as [:l\ower:]
1804    where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1805    "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
1806    I think.
1807    
1808  Argument:  Arguments:
1809    ptr      pointer to the initial [    ptr      pointer to the initial [
1810    endptr   where to return the end pointer    endptr   where to return the end pointer
   cd       pointer to compile data  
1811    
1812  Returns:   TRUE or FALSE  Returns:   TRUE or FALSE
1813  */  */
1814    
1815  static BOOL  static BOOL
1816  check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)  check_posix_syntax(const uschar *ptr, const uschar **endptr)
1817  {  {
1818  int terminator;          /* Don't combine these lines; the Solaris cc */  int terminator;          /* Don't combine these lines; the Solaris cc */
1819  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1820  if (*(++ptr) == '^') ptr++;  for (++ptr; *ptr != 0; ptr++)
 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;  
 if (*ptr == terminator && ptr[1] == ']')  
1821    {    {
1822    *endptr = ptr;    if (*ptr == '\\' && ptr[1] == ']') ptr++; else
1823    return TRUE;      {
1824        if (*ptr == ']') return FALSE;
1825        if (*ptr == terminator && ptr[1] == ']')
1826          {
1827          *endptr = ptr;
1828          return TRUE;
1829          }
1830        }
1831    }    }
1832  return FALSE;  return FALSE;
1833  }  }
# Line 1430  Returns:     a value representing the na Line 1852  Returns:     a value representing the na
1852  static int  static int
1853  check_posix_name(const uschar *ptr, int len)  check_posix_name(const uschar *ptr, int len)
1854  {  {
1855    const char *pn = posix_names;
1856  register int yield = 0;  register int yield = 0;
1857  while (posix_name_lengths[yield] != 0)  while (posix_name_lengths[yield] != 0)
1858    {    {
1859    if (len == posix_name_lengths[yield] &&    if (len == posix_name_lengths[yield] &&
1860      strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;      strncmp((const char *)ptr, pn, len) == 0) return yield;
1861      pn += posix_name_lengths[yield] + 1;
1862    yield++;    yield++;
1863    }    }
1864  return -1;  return -1;
# Line 1449  return -1; Line 1873  return -1;
1873  that is referenced. This means that groups can be replicated for fixed  that is referenced. This means that groups can be replicated for fixed
1874  repetition simply by copying (because the recursion is allowed to refer to  repetition simply by copying (because the recursion is allowed to refer to
1875  earlier groups that are outside the current group). However, when a group is  earlier groups that are outside the current group). However, when a group is
1876  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
1877  it, after it has been compiled. This means that any OP_RECURSE items within it  inserted before it, after it has been compiled. This means that any OP_RECURSE
1878  that refer to the group itself or any contained groups have to have their  items within it that refer to the group itself or any contained groups have to
1879  offsets adjusted. That is the job of this function. Before it is called, the  have their offsets adjusted. That is one of the jobs of this function. Before it
1880  partially compiled regex must be temporarily terminated with OP_END.  is called, the partially compiled regex must be temporarily terminated with
1881    OP_END.
1882    
1883    This function has been extended with the possibility of forward references for
1884    recursions and subroutine calls. It must also check the list of such references
1885    for the group we are dealing with. If it finds that one of the recursions in
1886    the current group is on this list, it adjusts the offset in the list, not the
1887    value in the reference (which is a group number).
1888    
1889  Arguments:  Arguments:
1890    group      points to the start of the group    group      points to the start of the group
1891    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
1892    utf8       TRUE in UTF-8 mode    utf8       TRUE in UTF-8 mode
1893    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
1894      save_hwm   the hwm forward reference pointer at the start of the group
1895    
1896  Returns:     nothing  Returns:     nothing
1897  */  */
1898    
1899  static void  static void
1900  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1901      uschar *save_hwm)
1902  {  {
1903  uschar *ptr = group;  uschar *ptr = group;
1904    
1905  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1906    {    {
1907    int offset = GET(ptr, 1);    int offset;
1908    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);    uschar *hc;
1909    
1910      /* See if this recursion is on the forward reference list. If so, adjust the
1911      reference. */
1912    
1913      for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1914        {
1915        offset = GET(hc, 0);
1916        if (cd->start_code + offset == ptr + 1)
1917          {
1918          PUT(hc, 0, offset + adjust);
1919          break;
1920          }
1921        }
1922    
1923      /* Otherwise, adjust the recursion offset if it's after the start of this
1924      group. */
1925    
1926      if (hc >= cd->hwm)
1927        {
1928        offset = GET(ptr, 1);
1929        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1930        }
1931    
1932    ptr += 1 + LINK_SIZE;    ptr += 1 + LINK_SIZE;
1933    }    }
1934  }  }
# Line 1550  Yield:        TRUE when range returned; Line 2007  Yield:        TRUE when range returned;
2007  */  */
2008    
2009  static BOOL  static BOOL
2010  get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2011      unsigned int *odptr)
2012  {  {
2013  int c, othercase, next;  unsigned int c, othercase, next;
2014    
2015  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
2016    { if ((othercase = _pcre_ucp_othercase(c)) >= 0) break; }    { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
2017    
2018  if (c > d) return FALSE;  if (c > d) return FALSE;
2019    
# Line 1576  return TRUE; Line 2034  return TRUE;
2034  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2035    
2036    
2037    
2038  /*************************************************  /*************************************************
2039  *           Compile one branch                   *  *     Check if auto-possessifying is possible    *
2040  *************************************************/  *************************************************/
2041    
2042  /* Scan the pattern, compiling it into the code vector. If the options are  /* This function is called for unlimited repeats of certain items, to see
2043  changed during the branch, the pointer is used to change the external options  whether the next thing could possibly match the repeated item. If not, it makes
2044  bits.  sense to automatically possessify the repeated item.
2045    
2046  Arguments:  Arguments:
2047    optionsptr     pointer to the option bits    op_code       the repeated op code
2048    brackets       points to number of extracting brackets used    this          data for this item, depends on the opcode
2049    codeptr        points to the pointer to the current code point    utf8          TRUE in UTF-8 mode
2050    ptrptr         points to the current pattern pointer    utf8_char     used for utf8 character bytes, NULL if not relevant
2051    errorcodeptr   points to error code variable    ptr           next character in pattern
2052    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    options       options bits
2053    reqbyteptr     set to the last literal character required, else < 0    cd            contains pointers to tables etc.
   bcptr          points to current branch chain  
   cd             contains pointers to tables etc.  
2054    
2055  Returns:         TRUE on success  Returns:        TRUE if possessifying is wanted
                  FALSE, with *errorcodeptr set non-zero on error  
2056  */  */
2057    
2058  static BOOL  static BOOL
2059  compile_branch(int *optionsptr, int *brackets, uschar **codeptr,  check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2060    const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,    const uschar *ptr, int options, compile_data *cd)
   int *reqbyteptr, branch_chain *bcptr, compile_data *cd)  
2061  {  {
2062  int repeat_type, op_type;  int next;
2063  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  
2064  int bravalue = 0;  /* Skip whitespace and comments in extended mode */
2065  int greedy_default, greedy_non_default;  
2066  int firstbyte, reqbyte;  if ((options & PCRE_EXTENDED) != 0)
2067  int zeroreqbyte, zerofirstbyte;    {
2068  int req_caseopt, reqvary, tempreqvary;    for (;;)
2069  int options = *optionsptr;      {
2070  int after_manual_callout = 0;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2071  register int c;      if (*ptr == '#')
2072  register uschar *code = *codeptr;        {
2073  uschar *tempcode;        while (*(++ptr) != 0)
2074  BOOL inescq = FALSE;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2075  BOOL groupsetfirstbyte = FALSE;        }
2076  const uschar *ptr = *ptrptr;      else break;
2077  const uschar *tempptr;      }
2078  uschar *previous = NULL;    }
2079  uschar *previous_callout = NULL;  
2080  uschar classbits[32];  /* If the next item is one that we can handle, get its value. A non-negative
2081    value is a character, a negative value is an escape value. */
2082    
2083    if (*ptr == '\\')
2084      {
2085      int temperrorcode = 0;
2086      next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2087      if (temperrorcode != 0) return FALSE;
2088      ptr++;    /* Point after the escape sequence */
2089      }
2090    
2091    else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2092      {
2093  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2094  BOOL class_utf8;    if (utf8) { GETCHARINC(next, ptr); } else
 BOOL utf8 = (options & PCRE_UTF8) != 0;  
 uschar *class_utf8data;  
 uschar utf8_char[6];  
 #else  
 BOOL utf8 = FALSE;  
2095  #endif  #endif
2096      next = *ptr++;
2097      }
2098    
2099  /* Set up the default and non-default settings for greediness */  else return FALSE;
2100    
2101  greedy_default = ((options & PCRE_UNGREEDY) != 0);  /* Skip whitespace and comments in extended mode */
 greedy_non_default = greedy_default ^ 1;  
2102    
2103  /* Initialize no first byte, no required byte. REQ_UNSET means "no char  if ((options & PCRE_EXTENDED) != 0)
2104  matching encountered yet". It gets changed to REQ_NONE if we hit something that    {
2105  matches a non-fixed char first char; reqbyte just remains unset if we never    for (;;)
2106  find one.      {
2107        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2108        if (*ptr == '#')
2109          {
2110          while (*(++ptr) != 0)
2111            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2112          }
2113        else break;
2114        }
2115      }
2116    
2117  When we hit a repeat whose minimum is zero, we may have to adjust these values  /* If the next thing is itself optional, we have to give up. */
 to take the zero repeat into account. This is implemented by setting them to  
 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual  
 item types that can be repeated set these backoff variables appropriately. */  
2118    
2119  firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;  if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2120      return FALSE;
2121    
2122  /* The variable req_caseopt contains either the REQ_CASELESS value or zero,  /* Now compare the next item with the previous opcode. If the previous is a
2123  according to the current setting of the caseless flag. REQ_CASELESS is a bit  positive single character match, "item" either contains the character or, if
2124  value > 255. It is added into the firstbyte or reqbyte variables to record the  "item" is greater than 127 in utf8 mode, the character's bytes are in
2125  case status of the value. This is used only for ASCII characters. */  utf8_char. */
2126    
 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;  
2127    
2128  /* Switch on next character until the end of the branch */  /* Handle cases when the next item is a character. */
2129    
2130  for (;; ptr++)  if (next >= 0) switch(op_code)
2131    {    {
2132    BOOL negate_class;    case OP_CHAR:
2133    BOOL possessive_quantifier;  #ifdef SUPPORT_UTF8
2134    BOOL is_quantifier;    if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2135    int class_charcount;  #endif
2136    int class_lastchar;    return item != next;
2137    
2138      /* For CHARNC (caseless character) we must check the other case. If we have
2139      Unicode property support, we can use it to test the other case of
2140      high-valued characters. */
2141    
2142      case OP_CHARNC:
2143    #ifdef SUPPORT_UTF8
2144      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2145    #endif
2146      if (item == next) return FALSE;
2147    #ifdef SUPPORT_UTF8
2148      if (utf8)
2149        {
2150        unsigned int othercase;
2151        if (next < 128) othercase = cd->fcc[next]; else
2152    #ifdef SUPPORT_UCP
2153        othercase = _pcre_ucp_othercase((unsigned int)next);
2154    #else
2155        othercase = NOTACHAR;
2156    #endif
2157        return (unsigned int)item != othercase;
2158        }
2159      else
2160    #endif  /* SUPPORT_UTF8 */
2161      return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
2162    
2163      /* For OP_NOT, "item" must be a single-byte character. */
2164    
2165      case OP_NOT:
2166      if (item == next) return TRUE;
2167      if ((options & PCRE_CASELESS) == 0) return FALSE;
2168    #ifdef SUPPORT_UTF8
2169      if (utf8)
2170        {
2171        unsigned int othercase;
2172        if (next < 128) othercase = cd->fcc[next]; else
2173    #ifdef SUPPORT_UCP
2174        othercase = _pcre_ucp_othercase(next);
2175    #else
2176        othercase = NOTACHAR;
2177    #endif
2178        return (unsigned int)item == othercase;
2179        }
2180      else
2181    #endif  /* SUPPORT_UTF8 */
2182      return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
2183    
2184      case OP_DIGIT:
2185      return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2186    
2187      case OP_NOT_DIGIT:
2188      return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2189    
2190      case OP_WHITESPACE:
2191      return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2192    
2193      case OP_NOT_WHITESPACE:
2194      return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2195    
2196      case OP_WORDCHAR:
2197      return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2198    
2199      case OP_NOT_WORDCHAR:
2200      return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2201    
2202      case OP_HSPACE:
2203      case OP_NOT_HSPACE:
2204      switch(next)
2205        {
2206        case 0x09:
2207        case 0x20:
2208        case 0xa0:
2209        case 0x1680:
2210        case 0x180e:
2211        case 0x2000:
2212        case 0x2001:
2213        case 0x2002:
2214        case 0x2003:
2215        case 0x2004:
2216        case 0x2005:
2217        case 0x2006:
2218        case 0x2007:
2219        case 0x2008:
2220        case 0x2009:
2221        case 0x200A:
2222        case 0x202f:
2223        case 0x205f:
2224        case 0x3000:
2225        return op_code != OP_HSPACE;
2226        default:
2227        return op_code == OP_HSPACE;
2228        }
2229    
2230      case OP_VSPACE:
2231      case OP_NOT_VSPACE:
2232      switch(next)
2233        {
2234        case 0x0a:
2235        case 0x0b:
2236        case 0x0c:
2237        case 0x0d:
2238        case 0x85:
2239        case 0x2028:
2240        case 0x2029:
2241        return op_code != OP_VSPACE;
2242        default:
2243        return op_code == OP_VSPACE;
2244        }
2245    
2246      default:
2247      return FALSE;
2248      }
2249    
2250    
2251    /* Handle the case when the next item is \d, \s, etc. */
2252    
2253    switch(op_code)
2254      {
2255      case OP_CHAR:
2256      case OP_CHARNC:
2257    #ifdef SUPPORT_UTF8
2258      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2259    #endif
2260      switch(-next)
2261        {
2262        case ESC_d:
2263        return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2264    
2265        case ESC_D:
2266        return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2267    
2268        case ESC_s:
2269        return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2270    
2271        case ESC_S:
2272        return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2273    
2274        case ESC_w:
2275        return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2276    
2277        case ESC_W:
2278        return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2279    
2280        case ESC_h:
2281        case ESC_H:
2282        switch(item)
2283          {
2284          case 0x09:
2285          case 0x20:
2286          case 0xa0:
2287          case 0x1680:
2288          case 0x180e:
2289          case 0x2000:
2290          case 0x2001:
2291          case 0x2002:
2292          case 0x2003:
2293          case 0x2004:
2294          case 0x2005:
2295          case 0x2006:
2296          case 0x2007:
2297          case 0x2008:
2298          case 0x2009:
2299          case 0x200A:
2300          case 0x202f:
2301          case 0x205f:
2302          case 0x3000:
2303          return -next != ESC_h;
2304          default:
2305          return -next == ESC_h;
2306          }
2307    
2308        case ESC_v:
2309        case ESC_V:
2310        switch(item)
2311          {
2312          case 0x0a:
2313          case 0x0b:
2314          case 0x0c:
2315          case 0x0d:
2316          case 0x85:
2317          case 0x2028:
2318          case 0x2029:
2319          return -next != ESC_v;
2320          default:
2321          return -next == ESC_v;
2322          }
2323    
2324        default:
2325        return FALSE;
2326        }
2327    
2328      case OP_DIGIT:
2329      return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2330             next == -ESC_h || next == -ESC_v;
2331    
2332      case OP_NOT_DIGIT:
2333      return next == -ESC_d;
2334    
2335      case OP_WHITESPACE:
2336      return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2337    
2338      case OP_NOT_WHITESPACE:
2339      return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2340    
2341      case OP_HSPACE:
2342      return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2343    
2344      case OP_NOT_HSPACE:
2345      return next == -ESC_h;
2346    
2347      /* Can't have \S in here because VT matches \S (Perl anomaly) */
2348      case OP_VSPACE:
2349      return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2350    
2351      case OP_NOT_VSPACE:
2352      return next == -ESC_v;
2353    
2354      case OP_WORDCHAR:
2355      return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2356    
2357      case OP_NOT_WORDCHAR:
2358      return next == -ESC_w || next == -ESC_d;
2359    
2360      default:
2361      return FALSE;
2362      }
2363    
2364    /* Control does not reach here */
2365    }
2366    
2367    
2368    
2369    /*************************************************
2370    *           Compile one branch                   *
2371    *************************************************/
2372    
2373    /* Scan the pattern, compiling it into the code vector. If the options are
2374    changed during the branch, the pointer is used to change the external options
2375    bits. This function is used during the pre-compile phase when we are trying
2376    to find out the amount of memory needed, as well as during the real compile
2377    phase. The value of lengthptr distinguishes the two phases.
2378    
2379    Arguments:
2380      optionsptr     pointer to the option bits
2381      codeptr        points to the pointer to the current code point
2382      ptrptr         points to the current pattern pointer
2383      errorcodeptr   points to error code variable
2384      firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2385      reqbyteptr     set to the last literal character required, else < 0
2386      bcptr          points to current branch chain
2387      cd             contains pointers to tables etc.
2388      lengthptr      NULL during the real compile phase
2389                     points to length accumulator during pre-compile phase
2390    
2391    Returns:         TRUE on success
2392                     FALSE, with *errorcodeptr set non-zero on error
2393    */
2394    
2395    static BOOL
2396    compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2397      int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2398      compile_data *cd, int *lengthptr)
2399    {
2400    int repeat_type, op_type;
2401    int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2402    int bravalue = 0;
2403    int greedy_default, greedy_non_default;
2404    int firstbyte, reqbyte;
2405    int zeroreqbyte, zerofirstbyte;
2406    int req_caseopt, reqvary, tempreqvary;
2407    int options = *optionsptr;
2408    int after_manual_callout = 0;
2409    int length_prevgroup = 0;
2410    register int c;
2411    register uschar *code = *codeptr;
2412    uschar *last_code = code;
2413    uschar *orig_code = code;
2414    uschar *tempcode;
2415    BOOL inescq = FALSE;
2416    BOOL groupsetfirstbyte = FALSE;
2417    const uschar *ptr = *ptrptr;
2418    const uschar *tempptr;
2419    uschar *previous = NULL;
2420    uschar *previous_callout = NULL;
2421    uschar *save_hwm = NULL;
2422    uschar classbits[32];
2423    
2424    #ifdef SUPPORT_UTF8
2425    BOOL class_utf8;
2426    BOOL utf8 = (options & PCRE_UTF8) != 0;
2427    uschar *class_utf8data;
2428    uschar *class_utf8data_base;
2429    uschar utf8_char[6];
2430    #else
2431    BOOL utf8 = FALSE;
2432    uschar *utf8_char = NULL;
2433    #endif
2434    
2435    #ifdef DEBUG
2436    if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2437    #endif
2438    
2439    /* Set up the default and non-default settings for greediness */
2440    
2441    greedy_default = ((options & PCRE_UNGREEDY) != 0);
2442    greedy_non_default = greedy_default ^ 1;
2443    
2444    /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2445    matching encountered yet". It gets changed to REQ_NONE if we hit something that
2446    matches a non-fixed char first char; reqbyte just remains unset if we never
2447    find one.
2448    
2449    When we hit a repeat whose minimum is zero, we may have to adjust these values
2450    to take the zero repeat into account. This is implemented by setting them to
2451    zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2452    item types that can be repeated set these backoff variables appropriately. */
2453    
2454    firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2455    
2456    /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2457    according to the current setting of the caseless flag. REQ_CASELESS is a bit
2458    value > 255. It is added into the firstbyte or reqbyte variables to record the
2459    case status of the value. This is used only for ASCII characters. */
2460    
2461    req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2462    
2463    /* Switch on next character until the end of the branch */
2464    
2465    for (;; ptr++)
2466      {
2467      BOOL negate_class;
2468      BOOL should_flip_negation;
2469      BOOL possessive_quantifier;
2470      BOOL is_quantifier;
2471      BOOL is_recurse;
2472      BOOL reset_bracount;
2473      int class_charcount;
2474      int class_lastchar;
2475    int newoptions;    int newoptions;
2476    int recno;    int recno;
2477      int refsign;
2478    int skipbytes;    int skipbytes;
2479    int subreqbyte;    int subreqbyte;
2480    int subfirstbyte;    int subfirstbyte;
2481      int terminator;
2482    int mclength;    int mclength;
2483    uschar mcbuffer[8];    uschar mcbuffer[8];
2484    
2485    /* Next byte in the pattern */    /* Get next byte in the pattern */
2486    
2487    c = *ptr;    c = *ptr;
2488    
2489      /* If we are in the pre-compile phase, accumulate the length used for the
2490      previous cycle of this loop. */
2491    
2492      if (lengthptr != NULL)
2493        {
2494    #ifdef DEBUG
2495        if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2496    #endif
2497        if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2498          {
2499          *errorcodeptr = ERR52;
2500          goto FAILED;
2501          }
2502    
2503        /* There is at least one situation where code goes backwards: this is the
2504        case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2505        the class is simply eliminated. However, it is created first, so we have to
2506        allow memory for it. Therefore, don't ever reduce the length at this point.
2507        */
2508    
2509        if (code < last_code) code = last_code;
2510    
2511        /* Paranoid check for integer overflow */
2512    
2513        if (OFLOW_MAX - *lengthptr < code - last_code)
2514          {
2515          *errorcodeptr = ERR20;
2516          goto FAILED;
2517          }
2518    
2519        *lengthptr += code - last_code;
2520        DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2521    
2522        /* If "previous" is set and it is not at the start of the work space, move
2523        it back to there, in order to avoid filling up the work space. Otherwise,
2524        if "previous" is NULL, reset the current code pointer to the start. */
2525    
2526        if (previous != NULL)
2527          {
2528          if (previous > orig_code)
2529            {
2530            memmove(orig_code, previous, code - previous);
2531            code -= previous - orig_code;
2532            previous = orig_code;
2533            }
2534          }
2535        else code = orig_code;
2536    
2537        /* Remember where this code item starts so we can pick up the length
2538        next time round. */
2539    
2540        last_code = code;
2541        }
2542    
2543      /* In the real compile phase, just check the workspace used by the forward
2544      reference list. */
2545    
2546      else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2547        {
2548        *errorcodeptr = ERR52;
2549        goto FAILED;
2550        }
2551    
2552    /* If in \Q...\E, check for the end; if not, we have a literal */    /* If in \Q...\E, check for the end; if not, we have a literal */
2553    
# Line 1692  for (;; ptr++) Line 2563  for (;; ptr++)
2563        {        {
2564        if (previous_callout != NULL)        if (previous_callout != NULL)
2565          {          {
2566          complete_callout(previous_callout, ptr, cd);          if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2567              complete_callout(previous_callout, ptr, cd);
2568          previous_callout = NULL;          previous_callout = NULL;
2569          }          }
2570        if ((options & PCRE_AUTO_CALLOUT) != 0)        if ((options & PCRE_AUTO_CALLOUT) != 0)
# Line 1713  for (;; ptr++) Line 2585  for (;; ptr++)
2585    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2586         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
2587      {      {
2588      complete_callout(previous_callout, ptr, cd);      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2589          complete_callout(previous_callout, ptr, cd);
2590      previous_callout = NULL;      previous_callout = NULL;
2591      }      }
2592    
# Line 1724  for (;; ptr++) Line 2597  for (;; ptr++)
2597      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2598      if (c == '#')      if (c == '#')
2599        {        {
2600        while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break;        while (*(++ptr) != 0)
       if (*ptr != 0)  
2601          {          {
2602          ptr += cd->nllen - 1;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
         continue;  
2603          }          }
2604          if (*ptr != 0) continue;
2605    
2606        /* Else fall through to handle end of string */        /* Else fall through to handle end of string */
2607        c = 0;        c = 0;
2608        }        }
# Line 1745  for (;; ptr++) Line 2618  for (;; ptr++)
2618    
2619    switch(c)    switch(c)
2620      {      {
2621      /* The branch terminates at end of string, |, or ). */      /* ===================================================================*/
2622        case 0:                        /* The branch terminates at string end */
2623      case 0:      case '|':                      /* or | or ) */
     case '|':  
2624      case ')':      case ')':
2625      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2626      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2627      *codeptr = code;      *codeptr = code;
2628      *ptrptr = ptr;      *ptrptr = ptr;
2629        if (lengthptr != NULL)
2630          {
2631          if (OFLOW_MAX - *lengthptr < code - last_code)
2632            {
2633            *errorcodeptr = ERR20;
2634            goto FAILED;
2635            }
2636          *lengthptr += code - last_code;   /* To include callout length */
2637          DPRINTF((">> end branch\n"));
2638          }
2639      return TRUE;      return TRUE;
2640    
2641    
2642        /* ===================================================================*/
2643      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2644      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2645    
# Line 1784  for (;; ptr++) Line 2668  for (;; ptr++)
2668      *code++ = OP_ANY;      *code++ = OP_ANY;
2669      break;      break;
2670    
2671    
2672        /* ===================================================================*/
2673      /* Character classes. If the included characters are all < 256, we build a      /* Character classes. If the included characters are all < 256, we build a
2674      32-byte bitmap of the permitted characters, except in the special case      32-byte bitmap of the permitted characters, except in the special case
2675      where there is only one such character. For negated classes, we build the      where there is only one such character. For negated classes, we build the
# Line 1794  for (;; ptr++) Line 2680  for (;; ptr++)
2680      opcode is compiled. It may optionally have a bit map for characters < 256,      opcode is compiled. It may optionally have a bit map for characters < 256,
2681      but those above are explicitly listed afterwards. A flag byte tells      but those above are explicitly listed afterwards. A flag byte tells
2682      whether the bitmap is present, and whether this is a negated class or not.      whether the bitmap is present, and whether this is a negated class or not.
2683      */  
2684        In JavaScript compatibility mode, an isolated ']' causes an error. In
2685        default (Perl) mode, it is treated as a data character. */
2686    
2687        case ']':
2688        if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2689          {
2690          *errorcodeptr = ERR64;
2691          goto FAILED;
2692          }
2693        goto NORMAL_CHAR;
2694    
2695      case '[':      case '[':
2696      previous = code;      previous = code;
# Line 1803  for (;; ptr++) Line 2699  for (;; ptr++)
2699      they are encountered at the top level, so we'll do that too. */      they are encountered at the top level, so we'll do that too. */
2700    
2701      if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&      if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2702          check_posix_syntax(ptr, &tempptr, cd))          check_posix_syntax(ptr, &tempptr))
2703        {        {
2704        *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;        *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2705        goto FAILED;        goto FAILED;
2706        }        }
2707    
2708      /* If the first character is '^', set the negation flag and skip it. */      /* If the first character is '^', set the negation flag and skip it. Also,
2709        if the first few characters (either before or after ^) are \Q\E or \E we
2710        skip them too. This makes for compatibility with Perl. */
2711    
2712      if ((c = *(++ptr)) == '^')      negate_class = FALSE;
2713        for (;;)
2714        {        {
       negate_class = TRUE;  
2715        c = *(++ptr);        c = *(++ptr);
2716        }        if (c == '\\')
2717      else          {
2718            if (ptr[1] == 'E') ptr++;
2719              else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2720                else break;
2721            }
2722          else if (!negate_class && c == '^')
2723            negate_class = TRUE;
2724          else break;
2725          }
2726    
2727        /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2728        an initial ']' is taken as a data character -- the code below handles
2729        that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2730        [^] must match any character, so generate OP_ALLANY. */
2731    
2732        if (c ==']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2733        {        {
2734        negate_class = FALSE;        *code++ = negate_class? OP_ALLANY : OP_FAIL;
2735        }        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2736          zerofirstbyte = firstbyte;
2737          break;
2738          }
2739    
2740        /* If a class contains a negative special such as \S, we need to flip the
2741        negation flag at the end, so that support for characters > 255 works
2742        correctly (they are all included in the class). */
2743    
2744        should_flip_negation = FALSE;
2745    
2746      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
2747      of just a single character (as long as it's < 256). For higher valued UTF-8      of just a single character (as long as it's < 256). However, for higher
2748      characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
2749    
2750      class_charcount = 0;      class_charcount = 0;
2751      class_lastchar = -1;      class_lastchar = -1;
2752    
2753        /* Initialize the 32-char bit map to all zeros. We build the map in a
2754        temporary bit of memory, in case the class contains only 1 character (less
2755        than 256), because in that case the compiled code doesn't use the bit map.
2756        */
2757    
2758        memset(classbits, 0, 32 * sizeof(uschar));
2759    
2760  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2761      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2762      class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2763        class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
2764  #endif  #endif
2765    
     /* Initialize the 32-char bit map to all zeros. We have to build the  
     map in a temporary bit of store, in case the class contains only 1  
     character (< 256), because in that case the compiled code doesn't use the  
     bit map. */  
   
     memset(classbits, 0, 32 * sizeof(uschar));  
   
2766      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
2767      means that an initial ] is taken as a data character. The first pass      means that an initial ] is taken as a data character. At the start of the
2768      through the regex checked the overall syntax, so we don't need to be very      loop, c contains the first byte of the character. */
     strict here. At the start of the loop, c contains the first byte of the  
     character. */  
2769    
2770      do      if (c != 0) do
2771        {        {
2772          const uschar *oldptr;
2773    
2774  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2775        if (utf8 && c > 127)        if (utf8 && c > 127)
2776          {                           /* Braces are required because the */          {                           /* Braces are required because the */
2777          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
2778          }          }
2779    
2780          /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2781          data and reset the pointer. This is so that very large classes that
2782          contain a zillion UTF-8 characters no longer overwrite the work space
2783          (which is on the stack). */
2784    
2785          if (lengthptr != NULL)
2786            {
2787            *lengthptr += class_utf8data - class_utf8data_base;
2788            class_utf8data = class_utf8data_base;
2789            }
2790    
2791  #endif  #endif
2792    
2793        /* Inside \Q...\E everything is literal except \E */        /* Inside \Q...\E everything is literal except \E */
2794    
2795        if (inescq)        if (inescq)
2796          {          {
2797          if (c == '\\' && ptr[1] == 'E')          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2798            {            {
2799            inescq = FALSE;            inescq = FALSE;                   /* Reset literal state */
2800            ptr++;            ptr++;                            /* Skip the 'E' */
2801            continue;            continue;                         /* Carry on with next */
2802            }            }
2803          else goto LONE_SINGLE_CHARACTER;          goto CHECK_RANGE;                   /* Could be range if \E follows */
2804          }          }
2805    
2806        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
# Line 1876  for (;; ptr++) Line 2811  for (;; ptr++)
2811    
2812        if (c == '[' &&        if (c == '[' &&
2813            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2814            check_posix_syntax(ptr, &tempptr, cd))            check_posix_syntax(ptr, &tempptr))
2815          {          {
2816          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
2817          int posix_class, taboffset, tabopt;          int posix_class, taboffset, tabopt;
# Line 1893  for (;; ptr++) Line 2828  for (;; ptr++)
2828          if (*ptr == '^')          if (*ptr == '^')
2829            {            {
2830            local_negate = TRUE;            local_negate = TRUE;
2831              should_flip_negation = TRUE;  /* Note negative special */
2832            ptr++;            ptr++;
2833            }            }
2834    
# Line 1956  for (;; ptr++) Line 2892  for (;; ptr++)
2892          }          }
2893    
2894        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
2895        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. The sequence \b is a special
2896        validity in the pre-compiling pass. The sequence \b is a special case.        case. Inside a class (and only there) it is treated as backspace.
2897        Inside a class (and only there) it is treated as backspace. Elsewhere        Elsewhere it marks a word boundary. Other escapes have preset maps ready
2898        it marks a word boundary. Other escapes have preset maps ready to        to 'or' into the one we are building. We assume they have more than one
       or into the one we are building. We assume they have more than one  
2899        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
2900    
2901        if (c == '\\')        if (c == '\\')
2902          {          {
2903          c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2904            if (*errorcodeptr != 0) goto FAILED;
2905    
2906          if (-c == ESC_b) c = '\b';       /* \b is backslash in a class */          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2907          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2908            else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2909          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
2910            {            {
2911            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == '\\' && ptr[2] == 'E')
# Line 1978  for (;; ptr++) Line 2915  for (;; ptr++)
2915            else inescq = TRUE;            else inescq = TRUE;
2916            continue;            continue;
2917            }            }
2918            else if (-c == ESC_E) continue;  /* Ignore orphan \E */
2919    
2920          if (c < 0)          if (c < 0)
2921            {            {
2922            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
2923            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
2924            switch (-c)  
2925              /* Save time by not doing this in the pre-compile phase. */
2926    
2927              if (lengthptr == NULL) switch (-c)
2928              {              {
2929              case ESC_d:              case ESC_d:
2930              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2931              continue;              continue;
2932    
2933              case ESC_D:              case ESC_D:
2934                should_flip_negation = TRUE;
2935              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2936              continue;              continue;
2937    
# Line 1998  for (;; ptr++) Line 2940  for (;; ptr++)
2940              continue;              continue;
2941    
2942              case ESC_W:              case ESC_W:
2943                should_flip_negation = TRUE;
2944              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2945              continue;              continue;
2946    
# Line 2007  for (;; ptr++) Line 2950  for (;; ptr++)
2950              continue;              continue;
2951    
2952              case ESC_S:              case ESC_S:
2953                should_flip_negation = TRUE;
2954              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2955              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2956              continue;              continue;
2957    
2958  #ifdef SUPPORT_UCP              default:    /* Not recognized; fall through */
2959              case ESC_p:              break;      /* Need "default" setting to stop compiler warning. */
2960              case ESC_P:              }
2961    
2962              /* In the pre-compile phase, just do the recognition. */
2963    
2964              else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2965                       c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2966    
2967              /* We need to deal with \H, \h, \V, and \v in both phases because
2968              they use extra memory. */
2969    
2970              if (-c == ESC_h)
2971                {
2972                SETBIT(classbits, 0x09); /* VT */
2973                SETBIT(classbits, 0x20); /* SPACE */
2974                SETBIT(classbits, 0xa0); /* NSBP */
2975    #ifdef SUPPORT_UTF8
2976                if (utf8)
2977                {                {
               BOOL negated;  
               int pdata;  
               int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);  
               if (ptype < 0) goto FAILED;  
2978                class_utf8 = TRUE;                class_utf8 = TRUE;
2979                *class_utf8data++ = ((-c == ESC_p) != negated)?                *class_utf8data++ = XCL_SINGLE;
2980                  XCL_PROP : XCL_NOTPROP;                class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2981                *class_utf8data++ = ptype;                *class_utf8data++ = XCL_SINGLE;
2982                *class_utf8data++ = pdata;                class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2983                class_charcount -= 2;   /* Not a < 256 character */                *class_utf8data++ = XCL_RANGE;
2984                  class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2985                  class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2986                  *class_utf8data++ = XCL_SINGLE;
2987                  class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2988                  *class_utf8data++ = XCL_SINGLE;
2989                  class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2990                  *class_utf8data++ = XCL_SINGLE;
2991                  class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2992                }                }
             continue;  
2993  #endif  #endif
2994                continue;
2995                }
2996    
2997              /* Unrecognized escapes are faulted if PCRE is running in its            if (-c == ESC_H)
2998              strict mode. By default, for compatibility with Perl, they are              {
2999              treated as literals. */              for (c = 0; c < 32; c++)
3000                  {
3001                  int x = 0xff;
3002                  switch (c)
3003                    {
3004                    case 0x09/8: x ^= 1 << (0x09%8); break;
3005                    case 0x20/8: x ^= 1 << (0x20%8); break;
3006                    case 0xa0/8: x ^= 1 << (0xa0%8); break;
3007                    default: break;
3008                    }
3009                  classbits[c] |= x;
3010                  }
3011    
3012              default:  #ifdef SUPPORT_UTF8
3013              if ((options & PCRE_EXTRA) != 0)              if (utf8)
3014                {                {
3015                *errorcodeptr = ERR7;                class_utf8 = TRUE;
3016                goto FAILED;                *class_utf8data++ = XCL_RANGE;
3017                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3018                  class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3019                  *class_utf8data++ = XCL_RANGE;
3020                  class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3021                  class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3022                  *class_utf8data++ = XCL_RANGE;
3023                  class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3024                  class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3025                  *class_utf8data++ = XCL_RANGE;
3026                  class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3027                  class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3028                  *class_utf8data++ = XCL_RANGE;
3029                  class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3030                  class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3031                  *class_utf8data++ = XCL_RANGE;
3032                  class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3033                  class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3034                  *class_utf8data++ = XCL_RANGE;
3035                  class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3036                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3037                }                }
3038              c = *ptr;              /* The final character */  #endif
3039              class_charcount -= 2;  /* Undo the default count from above */              continue;
3040              }              }
3041    
3042              if (-c == ESC_v)
3043                {
3044                SETBIT(classbits, 0x0a); /* LF */
3045                SETBIT(classbits, 0x0b); /* VT */
3046                SETBIT(classbits, 0x0c); /* FF */
3047                SETBIT(classbits, 0x0d); /* CR */
3048                SETBIT(classbits, 0x85); /* NEL */
3049    #ifdef SUPPORT_UTF8
3050                if (utf8)
3051                  {
3052                  class_utf8 = TRUE;
3053                  *class_utf8data++ = XCL_RANGE;
3054                  class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3055                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3056                  }
3057    #endif
3058                continue;
3059                }
3060    
3061              if (-c == ESC_V)
3062                {
3063                for (c = 0; c < 32; c++)
3064                  {
3065                  int x = 0xff;
3066                  switch (c)
3067                    {
3068                    case 0x0a/8: x ^= 1 << (0x0a%8);
3069                                 x ^= 1 << (0x0b%8);
3070                                 x ^= 1 << (0x0c%8);
3071                                 x ^= 1 << (0x0d%8);
3072                                 break;
3073                    case 0x85/8: x ^= 1 << (0x85%8); break;
3074                    default: break;
3075                    }
3076                  classbits[c] |= x;
3077                  }
3078    
3079    #ifdef SUPPORT_UTF8
3080                if (utf8)
3081                  {
3082                  class_utf8 = TRUE;
3083                  *class_utf8data++ = XCL_RANGE;
3084                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3085                  class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3086                  *class_utf8data++ = XCL_RANGE;
3087                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3088                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3089                  }
3090    #endif
3091                continue;
3092                }
3093    
3094              /* We need to deal with \P and \p in both phases. */
3095    
3096    #ifdef SUPPORT_UCP
3097              if (-c == ESC_p || -c == ESC_P)
3098                {
3099                BOOL negated;
3100                int pdata;
3101                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3102                if (ptype < 0) goto FAILED;
3103                class_utf8 = TRUE;
3104                *class_utf8data++ = ((-c == ESC_p) != negated)?
3105                  XCL_PROP : XCL_NOTPROP;
3106                *class_utf8data++ = ptype;
3107                *class_utf8data++ = pdata;
3108                class_charcount -= 2;   /* Not a < 256 character */
3109                continue;
3110                }
3111    #endif
3112              /* Unrecognized escapes are faulted if PCRE is running in its
3113              strict mode. By default, for compatibility with Perl, they are
3114              treated as literals. */
3115    
3116              if ((options & PCRE_EXTRA) != 0)
3117                {
3118                *errorcodeptr = ERR7;
3119                goto FAILED;
3120                }
3121    
3122              class_charcount -= 2;  /* Undo the default count from above */
3123              c = *ptr;              /* Get the final character and fall through */
3124            }            }
3125    
3126          /* Fall through if we have a single character (c >= 0). This may be          /* Fall through if we have a single character (c >= 0). This may be
3127          > 256 in UTF-8 mode. */          greater than 256 in UTF-8 mode. */
3128    
3129          }   /* End of backslash handling */          }   /* End of backslash handling */
3130    
3131        /* A single character may be followed by '-' to form a range. However,        /* A single character may be followed by '-' to form a range. However,
3132        Perl does not permit ']' to be the end of the range. A '-' character        Perl does not permit ']' to be the end of the range. A '-' character
3133        here is treated as a literal. */        at the end is treated as a literal. Perl ignores orphaned \E sequences
3134          entirely. The code for handling \Q and \E is messy. */
3135    
3136          CHECK_RANGE:
3137          while (ptr[1] == '\\' && ptr[2] == 'E')
3138            {
3139            inescq = FALSE;
3140            ptr += 2;
3141            }
3142    
3143          oldptr = ptr;
3144    
3145          /* Remember \r or \n */
3146    
3147          if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3148    
3149          /* Check for range */
3150    
3151        if (ptr[1] == '-' && ptr[2] != ']')        if (!inescq && ptr[1] == '-')
3152          {          {
3153          int d;          int d;
3154          ptr += 2;          ptr += 2;
3155            while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3156    
3157            /* If we hit \Q (not followed by \E) at this point, go into escaped
3158            mode. */
3159    
3160            while (*ptr == '\\' && ptr[1] == 'Q')
3161              {
3162              ptr += 2;
3163              if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3164              inescq = TRUE;
3165              break;
3166              }
3167    
3168            if (*ptr == 0 || (!inescq && *ptr == ']'))
3169              {
3170              ptr = oldptr;
3171              goto LONE_SINGLE_CHARACTER;
3172              }
3173    
3174  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3175          if (utf8)          if (utf8)
# Line 2071  for (;; ptr++) Line 3184  for (;; ptr++)
3184          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3185          in such circumstances. */          in such circumstances. */
3186    
3187          if (d == '\\')          if (!inescq && d == '\\')
3188            {            {
3189            const uschar *oldptr = ptr;            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3190            d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);            if (*errorcodeptr != 0) goto FAILED;
3191    
3192            /* \b is backslash; \X is literal X; any other special means the '-'            /* \b is backspace; \X is literal X; \R is literal R; any other
3193            was literal */            special means the '-' was literal */
3194    
3195            if (d < 0)            if (d < 0)
3196              {              {
3197              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = '\b';
3198              else if (d == -ESC_X) d = 'X'; else              else if (d == -ESC_X) d = 'X';
3199                else if (d == -ESC_R) d = 'R'; else
3200                {                {
3201                ptr = oldptr - 2;                ptr = oldptr;
3202                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3203                }                }
3204              }              }
3205            }            }
3206    
3207          /* The check that the two values are in the correct order happens in          /* Check that the two values are in the correct order. Optimize
3208          the pre-pass. Optimize one-character ranges */          one-character ranges */
3209    
3210            if (d < c)
3211              {
3212              *errorcodeptr = ERR8;
3213              goto FAILED;
3214              }
3215    
3216          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3217    
3218            /* Remember \r or \n */
3219    
3220            if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3221    
3222          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3223          matching, we have to use an XCLASS with extra data items. Caseless          matching, we have to use an XCLASS with extra data items. Caseless
3224          matching for characters > 127 is available only if UCP support is          matching for characters > 127 is available only if UCP support is
# Line 2112  for (;; ptr++) Line 3236  for (;; ptr++)
3236  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3237            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
3238              {              {
3239              int occ, ocd;              unsigned int occ, ocd;
3240              int cc = c;              unsigned int cc = c;
3241              int origd = d;              unsigned int origd = d;
3242              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
3243                {                {
3244                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= (unsigned int)c &&
3245                      ocd <= (unsigned int)d)
3246                    continue;                          /* Skip embedded ranges */
3247    
3248                if (occ < c  && ocd >= c - 1)        /* Extend the basic range */                if (occ < (unsigned int)c  &&
3249                      ocd >= (unsigned int)c - 1)      /* Extend the basic range */
3250                  {                                  /* if there is overlap,   */                  {                                  /* if there is overlap,   */
3251                  c = occ;                           /* noting that if occ < c */                  c = occ;                           /* noting that if occ < c */
3252                  continue;                          /* we can't have ocd > d  */                  continue;                          /* we can't have ocd > d  */
3253                  }                                  /* because a subrange is  */                  }                                  /* because a subrange is  */
3254                if (ocd > d && occ <= d + 1)         /* always shorter than    */                if (ocd > (unsigned int)d &&
3255                      occ <= (unsigned int)d + 1)      /* always shorter than    */
3256                  {                                  /* the basic range.       */                  {                                  /* the basic range.       */
3257                  d = ocd;                  d = ocd;
3258                  continue;                  continue;
# Line 2172  for (;; ptr++) Line 3300  for (;; ptr++)
3300          ranges that lie entirely within 0-127 when there is UCP support; else          ranges that lie entirely within 0-127 when there is UCP support; else
3301          for partial ranges without UCP support. */          for partial ranges without UCP support. */
3302    
3303          for (; c <= d; c++)          class_charcount += d - c + 1;
3304            class_lastchar = d;
3305    
3306            /* We can save a bit of time by skipping this in the pre-compile. */
3307    
3308            if (lengthptr == NULL) for (; c <= d; c++)
3309            {            {
3310            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
3311            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
# Line 2180  for (;; ptr++) Line 3313  for (;; ptr++)
3313              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
3314              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
3315              }              }
           class_charcount++;                /* in case a one-char range */  
           class_lastchar = c;  
3316            }            }
3317    
3318          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
# Line 2205  for (;; ptr++) Line 3336  for (;; ptr++)
3336  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3337          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
3338            {            {
3339            int othercase;            unsigned int othercase;
3340            if ((othercase = _pcre_ucp_othercase(c)) >= 0)            if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3341              {              {
3342              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
3343              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 2231  for (;; ptr++) Line 3362  for (;; ptr++)
3362          }          }
3363        }        }
3364    
3365      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached. This "while" is the end of the "do" above. */
3366      loop. This "while" is the end of the "do" above. */  
3367        while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3368    
3369        if (c == 0)                          /* Missing terminating ']' */
3370          {
3371          *errorcodeptr = ERR6;
3372          goto FAILED;
3373          }
3374    
3375    
3376    /* This code has been disabled because it would mean that \s counts as
3377    an explicit \r or \n reference, and that's not really what is wanted. Now
3378    we set the flag only if there is a literal "\r" or "\n" in the class. */
3379    
3380    #if 0
3381        /* Remember whether \r or \n are in this class */
3382    
3383        if (negate_class)
3384          {
3385          if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3386          }
3387        else
3388          {
3389          if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3390          }
3391    #endif
3392    
     while ((c = *(++ptr)) != ']' || inescq);  
3393    
3394      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
3395      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. As long as there were no characters >= 128 and there was no
3396      can optimize the negative case only if there were no characters >= 128      use of \p or \P, in other words, no use of any XCLASS features, we can
3397      because OP_NOT and the related opcodes like OP_NOTSTAR operate on      optimize.
3398      single-bytes only. This is an historical hangover. Maybe one day we can  
3399      tidy these opcodes to handle multi-byte characters.      In UTF-8 mode, we can optimize the negative case only if there were no
3400        characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3401        operate on single-bytes only. This is an historical hangover. Maybe one day
3402        we can tidy these opcodes to handle multi-byte characters.
3403    
3404      The optimization throws away the bit map. We turn the item into a      The optimization throws away the bit map. We turn the item into a
3405      1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note      1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
# Line 2251  for (;; ptr++) Line 3409  for (;; ptr++)
3409      reqbyte, save the previous value for reinstating. */      reqbyte, save the previous value for reinstating. */
3410    
3411  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3412      if (class_charcount == 1 &&      if (class_charcount == 1 && !class_utf8 &&
3413            (!utf8 ||        (!utf8 || !negate_class || class_lastchar < 128))
           (!class_utf8 && (!negate_class || class_lastchar < 128))))  
   
3414  #else  #else
3415      if (class_charcount == 1)      if (class_charcount == 1)
3416  #endif  #endif
# Line 2297  for (;; ptr++) Line 3453  for (;; ptr++)
3453      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
3454    
3455      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
3456      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode, unless there was a negated special
3457      we can omit the bitmap. */      such as \S in the class, because in that case all characters > 255 are in
3458        the class, so any that were explicitly given as well can be ignored. If
3459        (when there are explicit characters > 255 that must be listed) there are no
3460        characters < 256, we can omit the bitmap in the actual compiled code. */
3461    
3462  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3463      if (class_utf8)      if (class_utf8 && !should_flip_negation)
3464        {        {
3465        *class_utf8data++ = XCL_END;    /* Marks the end of extra data */        *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
3466        *code++ = OP_XCLASS;        *code++ = OP_XCLASS;
3467        code += LINK_SIZE;        code += LINK_SIZE;
3468        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT : 0;
3469    
3470        /* If the map is required, install it, and move on to the end of        /* If the map is required, move up the extra data to make room for it;
3471        the extra data */        otherwise just move the code pointer to the end of the extra data. */
3472    
3473        if (class_charcount > 0)        if (class_charcount > 0)
3474          {          {
3475          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
3476            memmove(code + 32, code, class_utf8data - code);
3477          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
3478          code = class_utf8data;          code = class_utf8data + 32;
         }  
   
       /* If the map is not required, slide down the extra data. */  
   
       else  
         {  
         int len = class_utf8data - (code + 33);  
         memmove(code + 1, code + 33, len);  
         code += len + 1;  
3479          }          }
3480          else code = class_utf8data;
3481    
3482        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
3483    
# Line 2334  for (;; ptr++) Line 3486  for (;; ptr++)
3486        }        }
3487  #endif  #endif
3488    
3489      /* If there are no characters > 255, negate the 32-byte map if necessary,      /* If there are no characters > 255, set the opcode to OP_CLASS or
3490      and copy it into the code vector. If this is the first thing in the branch,      OP_NCLASS, depending on whether the whole class was negated and whether
3491      there can be no first char setting, whatever the repeat count. Any reqbyte      there were negative specials such as \S in the class. Then copy the 32-byte
3492      setting must remain unchanged after any kind of repeat. */      map into the code vector, negating it if necessary. */
3493    
3494        *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3495      if (negate_class)      if (negate_class)
3496        {        {
3497        *code++ = OP_NCLASS;        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3498        for (c = 0; c < 32; c++) code[c] = ~classbits[c];          for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3499        }        }
3500      else      else
3501        {        {
       *code++ = OP_CLASS;  
3502        memcpy(code, classbits, 32);        memcpy(code, classbits, 32);
3503        }        }
3504      code += 32;      code += 32;
3505      break;      break;
3506    
3507    
3508        /* ===================================================================*/
3509      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3510      has been tested above. */      has been tested above. */
3511    
# Line 2419  for (;; ptr++) Line 3573  for (;; ptr++)
3573        }        }
3574      else repeat_type = greedy_default;      else repeat_type = greedy_default;
3575    
     /* If previous was a recursion, we need to wrap it inside brackets so that  
     it can be replicated if necessary. */  
   
     if (*previous == OP_RECURSE)  
       {  
       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);  
       code += 1 + LINK_SIZE;  
       *previous = OP_BRA;  
       PUT(previous, 1, code - previous);  
       *code = OP_KET;  
       PUT(code, 1, code - previous);  
       code += 1 + LINK_SIZE;  
       }  
   
3576      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
3577      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
3578      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
# Line 2466  for (;; ptr++) Line 3606  for (;; ptr++)
3606          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3607          }          }
3608    
3609          /* If the repetition is unlimited, it pays to see if the next thing on
3610          the line is something that cannot possibly match this character. If so,
3611          automatically possessifying this item gains some performance in the case
3612          where the match fails. */
3613    
3614          if (!possessive_quantifier &&
3615              repeat_max < 0 &&
3616              check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3617                options, cd))
3618            {
3619            repeat_type = 0;    /* Force greedy */
3620            possessive_quantifier = TRUE;
3621            }
3622    
3623        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3624        }        }
3625    
3626      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
3627      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
3628      character repeats by setting opt_type to add a suitable offset into      character repeats by setting opt_type to add a suitable offset into
3629      repeat_type. OP_NOT is currently used only for single-byte chars. */      repeat_type. We can also test for auto-possessification. OP_NOT is
3630        currently used only for single-byte chars. */
3631    
3632      else if (*previous == OP_NOT)      else if (*previous == OP_NOT)
3633        {        {
3634        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3635        c = previous[1];        c = previous[1];
3636          if (!possessive_quantifier &&
3637              repeat_max < 0 &&
3638              check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3639            {
3640            repeat_type = 0;    /* Force greedy */
3641            possessive_quantifier = TRUE;
3642            }
3643        goto OUTPUT_SINGLE_REPEAT;        goto OUTPUT_SINGLE_REPEAT;
3644        }        }
3645    
# Line 2495  for (;; ptr++) Line 3657  for (;; ptr++)
3657        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3658        c = *previous;        c = *previous;
3659    
3660          if (!possessive_quantifier &&
3661              repeat_max < 0 &&
3662              check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3663            {
3664            repeat_type = 0;    /* Force greedy */
3665            possessive_quantifier = TRUE;
3666            }
3667    
3668        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
3669        if (*previous == OP_PROP || *previous == OP_NOTPROP)        if (*previous == OP_PROP || *previous == OP_NOTPROP)
3670          {          {
# Line 2514  for (;; ptr++) Line 3684  for (;; ptr++)
3684        /* All real repeats make it impossible to handle partial matching (maybe        /* All real repeats make it impossible to handle partial matching (maybe
3685        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
3686    
3687        if (repeat_max != 1) cd->nopartial = TRUE;        if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3688    
3689        /* Combine the op_type with the repeat_type */        /* Combine the op_type with the repeat_type */
3690    
# Line 2535  for (;; ptr++) Line 3705  for (;; ptr++)
3705          }          }
3706    
3707        /* A repeat minimum of 1 is optimized into some special cases. If the        /* A repeat minimum of 1 is optimized into some special cases. If the
3708        maximum is unlimited, we use OP_PLUS. Otherwise, the original item it        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3709        left in place and, if the maximum is greater than 1, we use OP_UPTO with        left in place and, if the maximum is greater than 1, we use OP_UPTO with
3710        one less than the maximum. */        one less than the maximum. */
3711    
# Line 2588  for (;; ptr++) Line 3758  for (;; ptr++)
3758            }            }
3759    
3760          /* Else insert an UPTO if the max is greater than the min, again          /* Else insert an UPTO if the max is greater than the min, again
3761          preceded by the character, for the previously inserted code. */          preceded by the character, for the previously inserted code. If the
3762            UPTO is just for 1 instance, we can use QUERY instead. */
3763    
3764          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
3765            {            {
# Line 2607  for (;; ptr++) Line 3778  for (;; ptr++)
3778              *code++ = prop_value;              *code++ = prop_value;
3779              }              }
3780            repeat_max -= repeat_min;            repeat_max -= repeat_min;
3781            *code++ = OP_UPTO + repeat_type;  
3782            PUT2INC(code, 0, repeat_max);            if (repeat_max == 1)
3783                {
3784                *code++ = OP_QUERY + repeat_type;
3785                }
3786              else
3787                {
3788                *code++ = OP_UPTO + repeat_type;
3789                PUT2INC(code, 0, repeat_max);
3790                }
3791            }            }
3792          }          }
3793    
# Line 2655  for (;; ptr++) Line 3834  for (;; ptr++)
3834        /* All real repeats make it impossible to handle partial matching (maybe        /* All real repeats make it impossible to handle partial matching (maybe
3835        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
3836    
3837        if (repeat_max != 1) cd->nopartial = TRUE;        if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3838    
3839        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
3840          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
# Line 2675  for (;; ptr++) Line 3854  for (;; ptr++)
3854      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
3855      cases. */      cases. */
3856    
3857      else if (*previous >= OP_BRA || *previous == OP_ONCE ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3858               *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
3859        {        {
3860        register int i;        register int i;
3861        int ketoffset = 0;        int ketoffset = 0;
3862        int len = code - previous;        int len = code - previous;
3863        uschar *bralink = NULL;        uschar *bralink = NULL;
3864    
3865          /* Repeating a DEFINE group is pointless */
3866    
3867          if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3868            {
3869            *errorcodeptr = ERR55;
3870            goto FAILED;
3871            }
3872    
3873        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
3874        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
3875        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 2705  for (;; ptr++) Line 3892  for (;; ptr++)
3892    
3893        if (repeat_min == 0)        if (repeat_min == 0)
3894          {          {
3895          /* If the maximum is also zero, we just omit the group from the output          /* If the maximum is also zero, we used to just omit the group from the
3896          altogether. */          output altogether, like this:
   
         if (repeat_max == 0)  
           {  
           code = previous;  
           goto END_REPEAT;  
           }  
3897    
3898          /* If the maximum is 1 or unlimited, we just have to stick in the          ** if (repeat_max == 0)
3899          BRAZERO and do no more at this point. However, we do need to adjust          **   {
3900          any OP_RECURSE calls inside the group that refer to the group itself or          **   code = previous;
3901          any internal group, because the offset is from the start of the whole          **   goto END_REPEAT;
3902          regex. Temporarily terminate the pattern while doing this. */          **   }
3903    
3904            However, that fails when a group is referenced as a subroutine from
3905            elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
3906            so that it is skipped on execution. As we don't have a list of which
3907            groups are referenced, we cannot do this selectively.
3908    
3909            If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
3910            and do no more at this point. However, we do need to adjust any
3911            OP_RECURSE calls inside the group that refer to the group itself or any
3912            internal or forward referenced group, because the offset is from the
3913            start of the whole regex. Temporarily terminate the pattern while doing
3914            this. */
3915    
3916          if (repeat_max <= 1)          if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
3917            {            {
3918            *code = OP_END;            *code = OP_END;
3919            adjust_recurse(previous, 1, utf8, cd);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3920            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3921            code++;            code++;
3922              if (repeat_max == 0)
3923                {
3924                *previous++ = OP_SKIPZERO;
3925                goto END_REPEAT;
3926                }
3927            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
3928            }            }
3929    
# Line 2741  for (;; ptr++) Line 3939  for (;; ptr++)
3939            {            {
3940            int offset;            int offset;
3941            *code = OP_END;            *code = OP_END;
3942            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3943            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
3944            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
3945            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2761  for (;; ptr++) Line 3959  for (;; ptr++)
3959        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
3960        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
3961        copies that we need. If we set a first char from the group, and didn't        copies that we need. If we set a first char from the group, and didn't
3962        set a required char, copy the latter from the former. */        set a required char, copy the latter from the former. If there are any
3963          forward reference subroutine calls in the group, there will be entries on
3964          the workspace list; replicate these with an appropriate increment. */
3965    
3966        else        else
3967          {          {
3968          if (repeat_min > 1)          if (repeat_min > 1)
3969            {            {
3970            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;            /* In the pre-compile phase, we don't actually do the replication. We
3971            for (i = 1; i < repeat_min; i++)            just adjust the length as if we had. Do some paranoid checks for
3972              potential integer overflow. */
3973    
3974              if (lengthptr != NULL)
3975                {
3976                int delta = (repeat_min - 1)*length_prevgroup;
3977                if ((double)(repeat_min - 1)*(double)length_prevgroup >
3978                                                                (double)INT_MAX ||
3979                    OFLOW_MAX - *lengthptr < delta)
3980                  {
3981                  *errorcodeptr = ERR20;
3982                  goto FAILED;
3983                  }
3984                *lengthptr += delta;
3985                }
3986    
3987              /* This is compiling for real */
3988    
3989              else
3990              {              {
3991              memcpy(code, previous, len);              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3992              code += len;              for (i = 1; i < repeat_min; i++)
3993                  {
3994                  uschar *hc;
3995                  uschar *this_hwm = cd->hwm;
3996                  memcpy(code, previous, len);
3997                  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3998                    {
3999                    PUT(cd->hwm, 0, GET(hc, 0) + len);
4000                    cd->hwm += LINK_SIZE;
4001                    }
4002                  save_hwm = this_hwm;
4003                  code += len;
4004                  }
4005              }              }
4006            }            }
4007    
4008          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
4009          }          }
4010    
# Line 2781  for (;; ptr++) Line 4012  for (;; ptr++)
4012        the maximum is limited, it replicates the group in a nested fashion,        the maximum is limited, it replicates the group in a nested fashion,
4013        remembering the bracket starts on a stack. In the case of a zero minimum,        remembering the bracket starts on a stack. In the case of a zero minimum,
4014        the first one was set up above. In all cases the repeat_max now specifies        the first one was set up above. In all cases the repeat_max now specifies
4015        the number of additional copies needed. */        the number of additional copies needed. Again, we must remember to
4016          replicate entries on the forward reference list. */
4017    
4018        if (repeat_max >= 0)        if (repeat_max >= 0)
4019          {          {
4020          for (i = repeat_max - 1; i >= 0; i--)          /* In the pre-compile phase, we don't actually do the replication. We
4021            just adjust the length as if we had. For each repetition we must add 1
4022            to the length for BRAZERO and for all but the last repetition we must
4023          add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
4024            paranoid checks to avoid integer overflow. */
4025    
4026            if (lengthptr != NULL && repeat_max > 0)
4027              {
4028              int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4029                          2 - 2*LINK_SIZE;   /* Last one doesn't nest */
4030              if ((double)repeat_max *
4031                    (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4032                      > (double)INT_MAX ||
4033                  OFLOW_MAX - *lengthptr < delta)
4034                {
4035                *errorcodeptr = ERR20;
4036                goto FAILED;
4037                }
4038              *lengthptr += delta;
4039              }
4040    
4041            /* This is compiling for real */
4042    
4043            else for (i = repeat_max - 1; i >= 0; i--)
4044            {            {
4045              uschar *hc;
4046              uschar *this_hwm = cd->hwm;
4047    
4048            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
4049    
4050            /* All but the final copy start a new nesting, maintaining the            /* All but the final copy start a new nesting, maintaining the
# Line 2802  for (;; ptr++) Line 4060  for (;; ptr++)
4060              }              }
4061    
4062            memcpy(code, previous, len);            memcpy(code, previous, len);
4063              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4064                {
4065                PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4066                cd->hwm += LINK_SIZE;
4067                }
4068              save_hwm = this_hwm;
4069            code += len;            code += len;
4070            }            }
4071    
# Line 2824  for (;; ptr++) Line 4088  for (;; ptr++)
4088        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
4089        can't just offset backwards from the current code point, because we        can't just offset backwards from the current code point, because we
4090        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
4091        correct offset was computed above. */        correct offset was computed above.
4092    
4093          Then, when we are doing the actual compile phase, check to see whether
4094          this group is a non-atomic one that could match an empty string. If so,
4095          convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4096          that runtime checking can be done. [This check is also applied to
4097          atomic groups at runtime, but in a different way.] */
4098    
4099        else code[-ketoffset] = OP_KETRMAX + repeat_type;        else
4100            {
4101            uschar *ketcode = code - ketoffset;
4102            uschar *bracode = ketcode - GET(ketcode, 1);
4103            *ketcode = OP_KETRMAX + repeat_type;
4104            if (lengthptr == NULL && *bracode != OP_ONCE)
4105              {
4106              uschar *scode = bracode;
4107              do
4108                {
4109                if (could_be_empty_branch(scode, ketcode, utf8))
4110                  {
4111                  *bracode += OP_SBRA - OP_BRA;
4112                  break;
4113                  }
4114                scode += GET(scode, 1);
4115                }
4116              while (*scode == OP_ALT);
4117              }
4118            }
4119        }        }
4120    
4121        /* If previous is OP_FAIL, it was generated by an empty class [] in
4122        JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4123        by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4124        error above. We can just ignore the repeat in JS case. */
4125    
4126        else if (*previous == OP_FAIL) goto END_REPEAT;
4127    
4128      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
4129    
# Line 2837  for (;; ptr++) Line 4133  for (;; ptr++)
4133        goto FAILED;        goto FAILED;
4134        }        }
4135    
4136      /* If the character following a repeat is '+', we wrap the entire repeated      /* If the character following a repeat is '+', or if certain optimization
4137      item inside OP_ONCE brackets. This is just syntactic sugar, taken from      tests above succeeded, possessive_quantifier is TRUE. For some of the
4138      Sun's Java package. The repeated item starts at tempcode, not at previous,      simpler opcodes, there is a special alternative opcode for this. For
4139      which might be the first part of a string whose (former) last char we      anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4140      repeated. However, we don't support '+' after a greediness '?'. */      The '+' notation is just syntactic sugar, taken from Sun's Java package,
4141        but the special opcodes can optimize it a bit. The repeated item starts at
4142        tempcode, not at previous, which might be the first part of a string whose
4143        (former) last char we repeated.
4144    
4145        Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4146        an 'upto' may follow. We skip over an 'exact' item, and then test the
4147        length of what remains before proceeding. */
4148    
4149      if (possessive_quantifier)      if (possessive_quantifier)
4150        {        {
4151        int len = code - tempcode;        int len;
4152        memmove(tempcode + 1+LINK_SIZE, tempcode, len);        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4153        code += 1 + LINK_SIZE;            *tempcode == OP_NOTEXACT)
4154        len += 1 + LINK_SIZE;          tempcode += _pcre_OP_lengths[*tempcode] +
4155        tempcode[0] = OP_ONCE;            ((*tempcode == OP_TYPEEXACT &&
4156        *code++ = OP_KET;               (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4157        PUTINC(code, 0, len);        len = code - tempcode;
4158        PUT(tempcode, 1, len);        if (len > 0) switch (*tempcode)
4159            {
4160            case OP_STAR:  *tempcode = OP_POSSTAR; break;
4161            case OP_PLUS:  *tempcode = OP_POSPLUS; break;
4162            case OP_QUERY: *tempcode = OP_POSQUERY; break;
4163            case OP_UPTO:  *tempcode = OP_POSUPTO; break;
4164    
4165            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
4166            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
4167            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4168            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
4169    
4170            case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
4171            case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
4172            case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4173            case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
4174    
4175            default:
4176            memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4177            code += 1 + LINK_SIZE;
4178            len += 1 + LINK_SIZE;
4179            tempcode[0] = OP_ONCE;
4180            *code++ = OP_KET;
4181            PUTINC(code, 0, len);
4182            PUT(tempcode, 1, len);
4183            break;
4184            }
4185        }        }
4186    
4187      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 2865  for (;; ptr++) Line 4194  for (;; ptr++)
4194      break;      break;
4195    
4196    
4197      /* Start of nested bracket sub-expression, or comment or lookahead or      /* ===================================================================*/
4198      lookbehind or option setting or condition. First deal with special things      /* Start of nested parenthesized sub-expression, or comment or lookahead or
4199      that can come after a bracket; all are introduced by ?, and the appearance      lookbehind or option setting or condition or all the other extended
4200      of any of them means that this is not a referencing group. They were      parenthesis forms.  */
     checked for validity in the first pass over the string, so we don't have to  
     check for syntax errors here.  */  
4201    
4202      case '(':      case '(':
4203      newoptions = options;      newoptions = options;
4204      skipbytes = 0;      skipbytes = 0;
4205        bravalue = OP_CBRA;
4206        save_hwm = cd->hwm;
4207        reset_bracount = FALSE;
4208    
4209        /* First deal with various "verbs" that can be introduced by '*'. */
4210    
4211        if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4212          {
4213          int i, namelen;
4214          const char *vn = verbnames;
4215          const uschar *name = ++ptr;
4216          previous = NULL;
4217          while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
4218          if (*ptr == ':')
4219            {
4220            *errorcodeptr = ERR59;   /* Not supported */
4221            goto FAILED;
4222            }
4223          if (*ptr != ')')
4224            {
4225            *errorcodeptr = ERR60;
4226            goto FAILED;
4227            }
4228          namelen = ptr - name;
4229          for (i = 0; i < verbcount; i++)
4230            {
4231            if (namelen == verbs[i].len &&
4232                strncmp((char *)name, vn, namelen) == 0)
4233              {
4234              *code = verbs[i].op;
4235              if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4236              break;
4237              }
4238            vn += verbs[i].len + 1;
4239            }
4240          if (i < verbcount) continue;
4241          *errorcodeptr = ERR60;
4242          goto FAILED;
4243          }
4244    
4245      if (*(++ptr) == '?')      /* Deal with the extended parentheses; all are introduced by '?', and the
4246        appearance of any of them means that this is not a capturing group. */
4247    
4248        else if (*ptr == '?')
4249        {        {
4250        int set, unset;        int i, set, unset, namelen;
4251        int *optset;        int *optset;
4252          const uschar *name;
4253          uschar *slot;
4254    
4255        switch (*(++ptr))        switch (*(++ptr))
4256          {          {
4257          case '#':                 /* Comment; skip to ket */          case '#':                 /* Comment; skip to ket */
4258          ptr++;          ptr++;
4259          while (*ptr != ')') ptr++;          while (*ptr != 0 && *ptr != ')') ptr++;
4260            if (*ptr == 0)
4261              {
4262              *errorcodeptr = ERR18;
4263              goto FAILED;
4264              }
4265          continue;          continue;
4266    
4267          case ':':                 /* Non-extracting bracket */  
4268            /* ------------------------------------------------------------ */
4269            case '|':                 /* Reset capture count for each branch */
4270            reset_bracount = TRUE;
4271            /* Fall through */
4272    
4273            /* ------------------------------------------------------------ */
4274            case ':':                 /* Non-capturing bracket */
4275          bravalue = OP_BRA;          bravalue = OP_BRA;
4276          ptr++;          ptr++;
4277          break;          break;
4278    
4279    
4280            /* ------------------------------------------------------------ */
4281          case '(':          case '(':
4282          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
4283    
4284          /* A condition can be a number, referring to a numbered group, a name,          /* A condition can be an assertion, a number (referring to a numbered
4285          referring to a named group, 'R', referring to recursion, or an          group), a name (referring to a named group), or 'R', referring to
4286          assertion. There are two unfortunate ambiguities, caused by history.          recursion. R<digits> and R&name are also permitted for recursion tests.
4287          (a) 'R' can be the recursive thing or the name 'R', and (b) a number  
4288          could be a name that consists of digits. In both cases, we look for a          There are several syntaxes for testing a named group: (?(name)) is used
4289          name first; if not found, we try the other cases. If the first          by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4290          character after (?( is a word character, we know the rest up to ) will  
4291          also be word characters because the syntax was checked in the first          There are two unfortunate ambiguities, caused by history. (a) 'R' can
4292          pass. */          be the recursive thing or the name 'R' (and similarly for 'R' followed
4293            by digits), and (b) a number could be a name that consists of digits.
4294          if ((cd->ctypes[ptr[1]] & ctype_word) != 0)          In both cases, we look for a name first; if not found, we try the other
4295            {          cases. */
4296            int i, namelen;  
4297            int condref = 0;          /* For conditions that are assertions, check the syntax, and then exit
4298            const uschar *name;          the switch. This will take control down to where bracketed groups,
4299            uschar *slot = cd->name_table;          including assertions, are processed. */
4300    
4301            /* This is needed for all successful cases. */          if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4302              break;
4303    
4304            skipbytes = 3;          /* Most other conditions use OP_CREF (a couple change to OP_RREF
4305            below), and all need to skip 3 bytes at the start of the group. */
4306    
4307            /* Read the name, but also get it as a number if it's all digits */          code[1+LINK_SIZE] = OP_CREF;
4308            skipbytes = 3;
4309            refsign = -1;
4310    
4311            name = ++ptr;          /* Check for a test for recursion in a named group. */
4312            while (*ptr != ')')  
4313              {          if (ptr[1] == 'R' && ptr[2] == '&')
4314              if (condref >= 0)            {
4315                condref = ((digitab[*ptr] & ctype_digit) != 0)?            terminator = -1;
4316                  condref * 10 + *ptr - '0' : -1;            ptr += 2;
4317              ptr++;            code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
4318              }            }
4319            namelen = ptr - name;  
4320            /* Check for a test for a named group's having been set, using the Perl
4321            syntax (?(<name>) or (?('name') */
4322    
4323            else if (ptr[1] == '<')
4324              {
4325              terminator = '>';
4326              ptr++;
4327              }
4328            else if (ptr[1] == '\'')
4329              {
4330              terminator = '\'';
4331            ptr++;            ptr++;
4332              }
4333            else
4334              {
4335              terminator = 0;
4336              if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4337              }
4338    
4339            for (i = 0; i < cd->names_found; i++)          /* We now expect to read a name; anything else is an error */
             {  
             if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;  
             slot += cd->name_entry_size;  
             }  
4340    
4341            /* Found a previous named subpattern */          if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4342              {
4343              ptr += 1;  /* To get the right offset */
4344              *errorcodeptr = ERR28;
4345              goto FAILED;
4346              }
4347    
4348            if (i < cd->names_found)          /* Read the name, but also get it as a number if it's all digits */
             {  
             condref = GET2(slot, 0);  
             code[1+LINK_SIZE] = OP_CREF;  
             PUT2(code, 2+LINK_SIZE, condref);  
             }  
4349    
4350            /* Search the pattern for a forward reference */          recno = 0;
4351            name = ++ptr;
4352            while ((cd->ctypes[*ptr] & ctype_word) != 0)
4353              {
4354              if (recno >= 0)
4355                recno = ((digitab[*ptr] & ctype_digit) != 0)?
4356                  recno * 10 + *ptr - '0' : -1;
4357              ptr++;
4358              }
4359            namelen = ptr - name;
4360    
4361            else if ((i = find_named_parens(ptr, *brackets, name, namelen)) > 0)          if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4362              {            {
4363              code[1+LINK_SIZE] = OP_CREF;            ptr--;      /* Error offset */
4364              PUT2(code, 2+LINK_SIZE, i);            *errorcodeptr = ERR26;
4365              }            goto FAILED;
4366              }
4367    
4368            /* Check for 'R' for recursion */          /* Do no further checking in the pre-compile phase. */
4369    
4370            else if (namelen == 1 && *name == 'R')          if (lengthptr != NULL) break;
             {  
             code[1+LINK_SIZE] = OP_CREF;  
             PUT2(code, 2+LINK_SIZE, CREF_RECURSE);  
             }  
4371    
4372            /* Check for a subpattern number */          /* In the real compile we do the work of looking for the actual
4373            reference. If the string started with "+" or "-" we require the rest to
4374            be digits, in which case recno will be set. */
4375    
4376            else if (condref > 0)          if (refsign > 0)
4377              {
4378              if (recno <= 0)
4379              {              {
4380              code[1+LINK_SIZE] = OP_CREF;              *errorcodeptr = ERR58;
4381              PUT2(code, 2+LINK_SIZE, condref);              goto FAILED;
4382              }              }
4383              recno = (refsign == '-')?
4384            /* Either an unidentified subpattern, or a reference to (?(0) */              cd->bracount - recno + 1 : recno +cd->bracount;
4385              if (recno <= 0 || recno > cd->final_bracount)
           else  
4386              {              {
4387              *errorcodeptr = (condref == 0)? ERR35: ERR15;              *errorcodeptr = ERR15;
4388              goto FAILED;              goto FAILED;
4389              }              }
4390              PUT2(code, 2+LINK_SIZE, recno);
4391              break;
4392            }            }
4393    
4394          /* For conditions that are assertions, we just fall through, having          /* Otherwise (did not start with "+" or "-"), start by looking for the
4395          set bravalue above. */          name. */
4396    
4397          break;          slot = cd->name_table;
4398            for (i = 0; i < cd->names_found; i++)
4399              {
4400              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4401              slot += cd->name_entry_size;
4402              }
4403    
4404          case '=':                 /* Positive lookahead */          /* Found a previous named subpattern */
         bravalue = OP_ASSERT;  
         ptr++;  
         break;  
4405    
4406          case '!':                 /* Negative lookahead */          if (i < cd->names_found)
4407          bravalue = OP_ASSERT_NOT;            {
4408          ptr++;            recno = GET2(slot, 0);
4409          break;            PUT2(code, 2+LINK_SIZE, recno);
4410              }
4411    
4412          case '<':                 /* Lookbehinds */          /* Search the pattern for a forward reference */
4413          switch (*(++ptr))  
4414            else if ((i = find_parens(ptr, cd, name, namelen,
4415                            (options & PCRE_EXTENDED) != 0)) > 0)
4416            {            {
4417            case '=':               /* Positive lookbehind */            PUT2(code, 2+LINK_SIZE, i);
4418            bravalue = OP_ASSERTBACK;            }
           ptr++;  
           break;  
4419    
4420            case '!':               /* Negative lookbehind */          /* If terminator == 0 it means that the name followed directly after
4421            bravalue = OP_ASSERTBACK_NOT;          the opening parenthesis [e.g. (?(abc)...] and in this case there are
4422            ptr++;          some further alternatives to try. For the cases where terminator != 0
4423            break;          [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4424            now checked all the possibilities, so give an error. */
4425    
4426            else if (terminator != 0)
4427              {
4428              *errorcodeptr = ERR15;
4429              goto FAILED;
4430            }            }
         break;  
4431    
4432          case '>':                 /* One-time brackets */          /* Check for (?(R) for recursion. Allow digits after R to specify a
4433          bravalue = OP_ONCE;          specific group number. */
         ptr++;  
         break;  
4434    
4435          case 'C':                 /* Callout - may be followed by digits; */          else if (*name == 'R')
4436          previous_callout = code;  /* Save for later completion */            {
4437              recno = 0;
4438              for (i = 1; i < namelen; i++)
4439                {
4440                if ((digitab[name[i]] & ctype_digit) == 0)
4441                  {
4442                  *errorcodeptr = ERR15;
4443                  goto FAILED;
4444                  }
4445                recno = recno * 10 + name[i] - '0';
4446                }
4447              if (recno == 0) recno = RREF_ANY;
4448              code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
4449              PUT2(code, 2+LINK_SIZE, recno);
4450              }
4451    
4452            /* Similarly, check for the (?(DEFINE) "condition", which is always
4453            false. */
4454    
4455            else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4456              {
4457              code[1+LINK_SIZE] = OP_DEF;
4458              skipbytes = 1;
4459              }
4460    
4461            /* Check for the "name" actually being a subpattern number. We are
4462            in the second pass here, so final_bracount is set. */
4463    
4464            else if (recno > 0 && recno <= cd->final_bracount)
4465              {
4466              PUT2(code, 2+LINK_SIZE, recno);
4467              }
4468    
4469            /* Either an unidentified subpattern, or a reference to (?(0) */
4470    
4471            else
4472              {
4473              *errorcodeptr = (recno == 0)? ERR35: ERR15;
4474              goto FAILED;
4475              }
4476            break;
4477    
4478    
4479            /* ------------------------------------------------------------ */
4480            case '=':                 /* Positive lookahead */
4481            bravalue = OP_ASSERT;
4482            ptr++;
4483            break;
4484    
4485    
4486            /* ------------------------------------------------------------ */
4487            case '!':                 /* Negative lookahead */
4488            ptr++;
4489            if (*ptr == ')')          /* Optimize (?!) */
4490              {
4491              *code++ = OP_FAIL;
4492              previous = NULL;
4493              continue;
4494              }
4495            bravalue = OP_ASSERT_NOT;
4496            break;
4497    
4498    
4499            /* ------------------------------------------------------------ */
4500            case '<':                 /* Lookbehind or named define */
4501            switch (ptr[1])
4502              {
4503              case '=':               /* Positive lookbehind */
4504              bravalue = OP_ASSERTBACK;
4505              ptr += 2;
4506              break;
4507    
4508              case '!':               /* Negative lookbehind */
4509              bravalue = OP_ASSERTBACK_NOT;
4510              ptr += 2;
4511              break;
4512    
4513              default:                /* Could be name define, else bad */
4514              if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4515              ptr++;                  /* Correct offset for error */
4516              *errorcodeptr = ERR24;
4517              goto FAILED;
4518              }
4519            break;
4520    
4521    
4522            /* ------------------------------------------------------------ */
4523            case '>':                 /* One-time brackets */
4524            bravalue = OP_ONCE;
4525            ptr++;
4526            break;
4527    
4528    
4529            /* ------------------------------------------------------------ */
4530            case 'C':                 /* Callout - may be followed by digits; */
4531            previous_callout = code;  /* Save for later completion */
4532          after_manual_callout = 1; /* Skip one item before completing */          after_manual_callout = 1; /* Skip one item before completing */
4533          *code++ = OP_CALLOUT;     /* Already checked that the terminating */          *code++ = OP_CALLOUT;
4534            {                       /* closing parenthesis is present. */            {
4535            int n = 0;            int n = 0;
4536            while ((digitab[*(++ptr)] & ctype_digit) != 0)            while ((digitab[*(++ptr)] & ctype_digit) != 0)
4537              n = n * 10 + *ptr - '0';              n = n * 10 + *ptr - '0';
4538              if (*ptr != ')')
4539                {
4540                *errorcodeptr = ERR39;
4541                goto FAILED;
4542                }
4543            if (n > 255)            if (n > 255)
4544              {              {
4545              *errorcodeptr = ERR38;              *errorcodeptr = ERR38;
# Line 3034  for (;; ptr++) Line 4553  for (;; ptr++)
4553          previous = NULL;          previous = NULL;
4554          continue;          continue;
4555    
4556          case 'P':                 /* Named subpattern handling */  
4557          if (*(++ptr) == '<')      /* Definition */          /* ------------------------------------------------------------ */
4558            case 'P':                 /* Python-style named subpattern handling */
4559            if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
4560              {
4561              is_recurse = *ptr == '>';
4562              terminator = ')';
4563              goto NAMED_REF_OR_RECURSE;
4564              }
4565            else if (*ptr != '<')    /* Test for Python-style definition */
4566              {
4567              *errorcodeptr = ERR41;
4568              goto FAILED;
4569              }
4570            /* Fall through to handle (?P< as (?< is handled */
4571    
4572    
4573            /* ------------------------------------------------------------ */
4574            DEFINE_NAME:    /* Come here from (?< handling */
4575            case '\'':
4576            {            {
4577            int i, namelen;            terminator = (*ptr == '<')? '>' : '\'';
4578            uschar *slot = cd->name_table;            name = ++ptr;
           const uschar *name;     /* Don't amalgamate; some compilers */  
           name = ++ptr;           /* grumble at autoincrement in declaration */  
4579    
4580            while (*ptr++ != '>');            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4581            namelen = ptr - name - 1;            namelen = ptr - name;
4582    
4583            for (i = 0; i < cd->names_found; i++)            /* In the pre-compile phase, just do a syntax check. */
4584    
4585              if (lengthptr != NULL)
4586              {              {
4587              int crc = memcmp(name, slot+2, namelen);              if (*ptr != terminator)
             if (crc == 0)  
4588                {                {
4589                if (slot[2+namelen] == 0)                *errorcodeptr = ERR42;
4590                  goto FAILED;
4591                  }
4592                if (cd->names_found >= MAX_NAME_COUNT)
4593                  {
4594                  *errorcodeptr = ERR49;
4595                  goto FAILED;
4596                  }
4597                if (namelen + 3 > cd->name_entry_size)
4598                  {
4599                  cd->name_entry_size = namelen + 3;
4600                  if (namelen > MAX_NAME_SIZE)
4601                  {                  {
4602                  if ((options & PCRE_DUPNAMES) == 0)                  *errorcodeptr = ERR48;
4603                    {                  goto FAILED;
                   *errorcodeptr = ERR43;  
                   goto FAILED;  
                   }  
4604                  }                  }
               else crc = -1;      /* Current name is substring */  
4605                }                }
4606              if (crc < 0)              }
4607    
4608              /* In the real compile, create the entry in the table */
4609    
4610              else
4611                {
4612                slot = cd->name_table;
4613                for (i = 0; i < cd->names_found; i++)
4614                {                {
4615                memmove(slot + cd->name_entry_size, slot,                int crc = memcmp(name, slot+2, namelen);
4616                  (cd->names_found - i) * cd->name_entry_size);                if (crc == 0)
4617                break;                  {
4618                    if (slot[2+namelen] == 0)
4619                      {
4620                      if ((options & PCRE_DUPNAMES) == 0)
4621                        {
4622                        *errorcodeptr = ERR43;
4623                        goto FAILED;
4624                        }
4625                      }
4626                    else crc = -1;      /* Current name is substring */
4627                    }
4628                  if (crc < 0)
4629                    {
4630                    memmove(slot + cd->name_entry_size, slot,
4631                      (cd->names_found - i) * cd->name_entry_size);
4632                    break;
4633                    }
4634                  slot += cd->name_entry_size;
4635                }                }
             slot += cd->name_entry_size;  
             }  
4636    
4637            PUT2(slot, 0, *brackets + 1);              PUT2(slot, 0, cd->bracount + 1);
4638            memcpy(slot + 2, name, namelen);              memcpy(slot + 2, name, namelen);
4639            slot[2+namelen] = 0;              slot[2+namelen] = 0;
4640            cd->names_found++;              }
           goto NUMBERED_GROUP;  
4641            }            }
4642    
4643          if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */          /* In both cases, count the number of names we've encountered. */
4644    
4645            ptr++;                    /* Move past > or ' */
4646            cd->names_found++;
4647            goto NUMBERED_GROUP;
4648    
4649    
4650            /* ------------------------------------------------------------ */
4651            case '&':                 /* Perl recursion/subroutine syntax */
4652            terminator = ')';
4653            is_recurse = TRUE;
4654            /* Fall through */
4655    
4656            /* We come here from the Python syntax above that handles both
4657            references (?P=name) and recursion (?P>name), as well as falling
4658            through from the Perl recursion syntax (?&name). We also come here from
4659            the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4660            .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
4661    
4662            NAMED_REF_OR_RECURSE:
4663            name = ++ptr;
4664            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4665            namelen = ptr - name;
4666    
4667            /* In the pre-compile phase, do a syntax check and set a dummy
4668            reference number. */
4669    
4670            if (lengthptr != NULL)
4671            {            {
4672            int i, namelen;            if (namelen == 0)
4673            int type = *ptr++;              {
4674            const uschar *name = ptr;              *errorcodeptr = ERR62;
4675            uschar *slot = cd->name_table;              goto FAILED;
4676                }
4677              if (*ptr != terminator)
4678                {
4679                *errorcodeptr = ERR42;
4680                goto FAILED;
4681                }
4682              if (namelen > MAX_NAME_SIZE)
4683                {
4684                *errorcodeptr = ERR48;
4685                goto FAILED;
4686                }
4687              recno = 0;
4688              }
4689    
4690            while (*ptr != ')') ptr++;          /* In the real compile, seek the name in the table. We check the name
4691            namelen = ptr - name;          first, and then check that we have reached the end of the name in the
4692            table. That way, if the name is longer than any in the table,
4693            the comparison will fail without reading beyond the table entry. */
4694    
4695            else
4696              {
4697              slot = cd->name_table;
4698            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
4699              {              {
4700              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;              if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
4701                    slot[2+namelen] == 0)
4702                  break;
4703              slot += cd->name_entry_size;              slot += cd->name_entry_size;
4704              }              }
4705    
# Line 3097  for (;; ptr++) Line 4708  for (;; ptr++)
4708              recno = GET2(slot, 0);              recno = GET2(slot, 0);
4709              }              }
4710            else if ((recno =                /* Forward back reference */            else if ((recno =                /* Forward back reference */
4711                      find_named_parens(ptr, *brackets, name, namelen)) <= 0)                      find_parens(ptr, cd, name, namelen,
4712                          (options & PCRE_EXTENDED) != 0)) <= 0)
4713              {              {
4714              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
4715              goto FAILED;              goto FAILED;
4716              }              }
4717              }
4718    
4719            if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */          /* In both phases, we can now go to the code that handles numerical
4720            recursion or backreferences. */
           /* Back reference */  
4721    
4722            previous = code;          if (is_recurse) goto HANDLE_RECURSION;
4723            *code++ = OP_REF;            else goto HANDLE_REFERENCE;
           PUT2INC(code, 0, recno);  
           cd->backref_map |= (recno < 32)? (1 << recno) : 1;  
           if (recno > cd->top_backref) cd->top_backref = recno;  
           continue;  
           }  
4724    
         /* Should never happen */  
         break;  
4725    
4726          case 'R':                 /* Pattern recursion */          /* ------------------------------------------------------------ */
4727            case 'R':                 /* Recursion */
4728          ptr++;                    /* Same as (?0)      */          ptr++;                    /* Same as (?0)      */
4729          /* Fall through */          /* Fall through */
4730    
         /* Recursion or "subroutine" call */  
4731    
4732          case '0': case '1': case '2': case '3': case '4':          /* ------------------------------------------------------------ */
4733          case '5': case '6': case '7': case '8': case '9':          case '-': case '+':
4734            case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4735            case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4736            {            {
4737            const uschar *called;            const uschar *called;
4738              terminator = ')';
4739    
4740              /* Come here from the \g<...> and \g'...' code (Oniguruma
4741              compatibility). However, the syntax has been checked to ensure that
4742              the ... are a (signed) number, so that neither ERR63 nor ERR29 will
4743              be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
4744              ever be taken. */
4745    
4746              HANDLE_NUMERICAL_RECURSION:
4747    
4748              if ((refsign = *ptr) == '+')
4749                {
4750                ptr++;
4751                if ((digitab[*ptr] & ctype_digit) == 0)
4752                  {
4753                  *errorcodeptr = ERR63;
4754                  goto FAILED;
4755                  }
4756                }
4757              else if (refsign == '-')
4758                {
4759                if ((digitab[ptr[1]] & ctype_digit) == 0)
4760                  goto OTHER_CHAR_AFTER_QUERY;
4761                ptr++;
4762                }
4763    
4764            recno = 0;            recno = 0;
4765            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4766              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4767    
4768              if (*ptr != terminator)
4769                {
4770                *errorcodeptr = ERR29;
4771                goto FAILED;
4772                }
4773    
4774              if (refsign == '-')
4775                {
4776                if (recno == 0)
4777                  {
4778                  *errorcodeptr = ERR58;
4779                  goto FAILED;
4780                  }
4781                recno = cd->bracount - recno + 1;
4782                if (recno <= 0)
4783                  {
4784                  *errorcodeptr = ERR15;
4785                  goto FAILED;
4786                  }
4787                }
4788              else if (refsign == '+')
4789                {
4790                if (recno == 0)
4791                  {
4792                  *errorcodeptr = ERR58;
4793                  goto FAILED;
4794                  }
4795                recno += cd->bracount;
4796                }
4797    
4798            /* Come here from code above that handles a named recursion */            /* Come here from code above that handles a named recursion */
4799    
4800            HANDLE_RECURSION:            HANDLE_RECURSION:
4801    
4802            previous = code;            previous = code;
4803              called = cd->start_code;
4804    
4805            /* Find the bracket that is being referenced. Temporarily end the            /* When we are actually compiling, find the bracket that is being
4806            regex in case it doesn't exist. */            referenced. Temporarily end the regex in case it doesn't exist before
4807              this point. If we end up with a forward reference, first check that
4808              the bracket does occur later so we can give the error (and position)
4809              now. Then remember this forward reference in the workspace so it can
4810              be filled in at the end. */
4811    
4812            *code = OP_END;            if (lengthptr == NULL)
           called = (recno == 0)? cd->start_code :  
             find_bracket(cd->start_code, utf8, recno);