/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 87 by nigel, Sat Feb 24 21:41:21 2007 UTC revision 380 by ph10, Tue Mar 3 12:32:47 2009 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2006 University of Cambridge             Copyright (c) 1997-2008 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45    #ifdef HAVE_CONFIG_H
46    #include "config.h"
47    #endif
48    
49    #define NLBLOCK cd             /* Block containing newline information */
50    #define PSSTART start_pattern  /* Field containing processed string start */
51    #define PSEND   end_pattern    /* Field containing processed string end */
52    
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
# Line 53  used by pcretest. DEBUG is not defined w Line 61  used by pcretest. DEBUG is not defined w
61  #endif  #endif
62    
63    
64    /* Macro for setting individual bits in class bitmaps. */
65    
66    #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
67    
68    /* Maximum length value to check against when making sure that the integer that
69    holds the compiled pattern length does not overflow. We make it a bit less than
70    INT_MAX to allow for adding in group terminating bytes, so that we don't have
71    to check them every time. */
72    
73    #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76  /*************************************************  /*************************************************
77  *      Code parameters and static tables         *  *      Code parameters and static tables         *
78  *************************************************/  *************************************************/
79    
80  /* Maximum number of items on the nested bracket stacks at compile time. This  /* This value specifies the size of stack workspace that is used during the
81  applies to the nesting of all kinds of parentheses. It does not limit  first pre-compile phase that determines how much memory is required. The regex
82  un-nested, non-capturing parentheses. This number can be made bigger if  is partly compiled into this space, but the compiled parts are discarded as
83  necessary - it is used to dimension one int and one unsigned char vector at  soon as they can be, so that hopefully there will never be an overrun. The code
84  compile time. */  does, however, check for an overrun. The largest amount I've seen used is 218,
85    so this number is very generous.
86    
87    The same workspace is used during the second, actual compile phase for
88    remembering forward references to groups so that they can be filled in at the
89    end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90    is 4 there is plenty of room. */
91    
92  #define BRASTACK_SIZE 200  #define COMPILE_WORK_SIZE (4096)
93    
94    
95  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 72  are simple data values; negative values Line 97  are simple data values; negative values
97  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
98  is invalid. */  is invalid. */
99    
100  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
101  static const short int escapes[] = {  static const short int escapes[] = {
102       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
103       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
105       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */  -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
106  -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
107  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
108     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
109       0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */  -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
110  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
111       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
112  };  };
113    
114  #else         /* This is the "abnormal" table for EBCDIC systems */  #else           /* This is the "abnormal" table for EBCDIC systems */
115  static const short int escapes[] = {  static const short int escapes[] = {
116  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
117  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 96  static const short int escapes[] = { Line 121  static const short int escapes[] = {
121  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
122  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
123  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
124  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
125  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
126  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
127  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
128  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
129  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
130  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
131  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
132  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
133  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
134  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
135  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
136  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
137  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
138  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
# Line 115  static const short int escapes[] = { Line 140  static const short int escapes[] = {
140  #endif  #endif
141    
142    
143  /* Tables of names of POSIX character classes and their lengths. The list is  /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144  terminated by a zero length entry. The first three must be alpha, lower, upper,  searched linearly. Put all the names into a single string, in order to reduce
145  as this is assumed for handling case independence. */  the number of relocations when a shared library is dynamically linked. */
146    
147  static const char *const posix_names[] = {  typedef struct verbitem {
148    "alpha", "lower", "upper",    int   len;
149    "alnum", "ascii", "blank", "cntrl", "digit", "graph",    int   op;
150    "print", "punct", "space", "word",  "xdigit" };  } verbitem;
151    
152    static const char verbnames[] =
153      "ACCEPT\0"
154      "COMMIT\0"
155      "F\0"
156      "FAIL\0"
157      "PRUNE\0"
158      "SKIP\0"
159      "THEN";
160    
161    static const verbitem verbs[] = {
162      { 6, OP_ACCEPT },
163      { 6, OP_COMMIT },
164      { 1, OP_FAIL },
165      { 4, OP_FAIL },
166      { 5, OP_PRUNE },
167      { 4, OP_SKIP  },
168      { 4, OP_THEN  }
169    };
170    
171    static const int verbcount = sizeof(verbs)/sizeof(verbitem);
172    
173    
174    /* Tables of names of POSIX character classes and their lengths. The names are
175    now all in a single string, to reduce the number of relocations when a shared
176    library is dynamically loaded. The list of lengths is terminated by a zero
177    length entry. The first three must be alpha, lower, upper, as this is assumed
178    for handling case independence. */
179    
180    static const char posix_names[] =
181      "alpha\0"  "lower\0"  "upper\0"  "alnum\0"  "ascii\0"  "blank\0"
182      "cntrl\0"  "digit\0"  "graph\0"  "print\0"  "punct\0"  "space\0"
183      "word\0"   "xdigit";
184    
185  static const uschar posix_name_lengths[] = {  static const uschar posix_name_lengths[] = {
186    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
# Line 155  static const int posix_class_maps[] = { Line 213  static const int posix_class_maps[] = {
213  };  };
214    
215    
216  /* The texts of compile-time error messages. These are "char *" because they  #define STRING(a)  # a
217  are passed to the outside world. */  #define XSTRING(s) STRING(s)
218    
219  static const char *error_texts[] = {  /* The texts of compile-time error messages. These are "char *" because they
220    "no error",  are passed to the outside world. Do not ever re-use any error number, because
221    "\\ at end of pattern",  they are documented. Always add a new error instead. Messages marked DEAD below
222    "\\c at end of pattern",  are no longer used. This used to be a table of strings, but in order to reduce
223    "unrecognized character follows \\",  the number of relocations needed when a shared library is loaded dynamically,
224    "numbers out of order in {} quantifier",  it is now one long string. We cannot use a table of offsets, because the
225    lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
226    simply count through to the one we want - this isn't a performance issue
227    because these strings are used only when there is a compilation error. */
228    
229    static const char error_texts[] =
230      "no error\0"
231      "\\ at end of pattern\0"
232      "\\c at end of pattern\0"
233      "unrecognized character follows \\\0"
234      "numbers out of order in {} quantifier\0"
235    /* 5 */    /* 5 */
236    "number too big in {} quantifier",    "number too big in {} quantifier\0"
237    "missing terminating ] for character class",    "missing terminating ] for character class\0"
238    "invalid escape sequence in character class",    "invalid escape sequence in character class\0"
239    "range out of order in character class",    "range out of order in character class\0"
240    "nothing to repeat",    "nothing to repeat\0"
241    /* 10 */    /* 10 */
242    "operand of unlimited repeat could match the empty string",    "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
243    "internal error: unexpected repeat",    "internal error: unexpected repeat\0"
244    "unrecognized character after (?",    "unrecognized character after (? or (?-\0"
245    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class\0"
246    "missing )",    "missing )\0"
247    /* 15 */    /* 15 */
248    "reference to non-existent subpattern",    "reference to non-existent subpattern\0"
249    "erroffset passed as NULL",    "erroffset passed as NULL\0"
250    "unknown option bit(s) set",    "unknown option bit(s) set\0"
251    "missing ) after comment",    "missing ) after comment\0"
252    "parentheses nested too deeply",    "parentheses nested too deeply\0"  /** DEAD **/
253    /* 20 */    /* 20 */
254    "regular expression too large",    "regular expression is too large\0"
255    "failed to get memory",    "failed to get memory\0"
256    "unmatched parentheses",    "unmatched parentheses\0"
257    "internal error: code overflow",    "internal error: code overflow\0"
258    "unrecognized character after (?<",    "unrecognized character after (?<\0"
259    /* 25 */    /* 25 */
260    "lookbehind assertion is not fixed length",    "lookbehind assertion is not fixed length\0"
261    "malformed number after (?(",    "malformed number or name after (?(\0"
262    "conditional group contains more than two branches",    "conditional group contains more than two branches\0"
263    "assertion expected after (?(",    "assertion expected after (?(\0"
264    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )\0"
265    /* 30 */    /* 30 */
266    "unknown POSIX class name",    "unknown POSIX class name\0"
267    "POSIX collating elements are not supported",    "POSIX collating elements are not supported\0"
268    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support\0"
269    "spare error",    "spare error\0"  /** DEAD **/
270    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large\0"
271    /* 35 */    /* 35 */
272    "invalid condition (?(0)",    "invalid condition (?(0)\0"
273    "\\C not allowed in lookbehind assertion",    "\\C not allowed in lookbehind assertion\0"
274    "PCRE does not support \\L, \\l, \\N, \\U, or \\u",    "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
275    "number after (?C is > 255",    "number after (?C is > 255\0"
276    "closing ) for (?C expected",    "closing ) for (?C expected\0"
277    /* 40 */    /* 40 */
278    "recursive call could loop indefinitely",    "recursive call could loop indefinitely\0"
279    "unrecognized character after (?P",    "unrecognized character after (?P\0"
280    "syntax error after (?P",    "syntax error in subpattern name (missing terminator)\0"
281    "two named groups have the same name",    "two named subpatterns have the same name\0"
282    "invalid UTF-8 string",    "invalid UTF-8 string\0"
283    /* 45 */    /* 45 */
284    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled\0"
285    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence\0"
286    "unknown property name after \\P or \\p"    "unknown property name after \\P or \\p\0"
287  };    "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
288      "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
289      /* 50 */
290      "repeated subpattern is too long\0"    /** DEAD **/
291      "octal value is greater than \\377 (not in UTF-8 mode)\0"
292      "internal error: overran compiling workspace\0"
293      "internal error: previously-checked referenced subpattern not found\0"
294      "DEFINE group contains more than one branch\0"
295      /* 55 */
296      "repeating a DEFINE group is not allowed\0"
297      "inconsistent NEWLINE options\0"
298      "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
299      "a numbered reference must not be zero\0"
300      "(*VERB) with an argument is not supported\0"
301      /* 60 */
302      "(*VERB) not recognized\0"
303      "number is too big\0"
304      "subpattern name expected\0"
305      "digit expected after (?+\0"
306      "] is an invalid data character in JavaScript compatibility mode";
307    
308    
309  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 235  For convenience, we use the same bit def Line 322  For convenience, we use the same bit def
322    
323  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
324    
325  #if !EBCDIC    /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
326  static const unsigned char digitab[] =  static const unsigned char digitab[] =
327    {    {
328    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 271  static const unsigned char digitab[] = Line 358  static const unsigned char digitab[] =
358    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
359    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
360    
361  #else          /* This is the "abnormal" case, for EBCDIC systems */  #else           /* This is the "abnormal" case, for EBCDIC systems */
362  static const unsigned char digitab[] =  static const unsigned char digitab[] =
363    {    {
364    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 285  static const unsigned char digitab[] = Line 372  static const unsigned char digitab[] =
372    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
373    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
374    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
375    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88-     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
376    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
377    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
378    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
# Line 319  static const unsigned char ebcdic_charta Line 406  static const unsigned char ebcdic_charta
406    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
407    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
408    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
409    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88-  */    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
410    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
411    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
412    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
# Line 346  static const unsigned char ebcdic_charta Line 433  static const unsigned char ebcdic_charta
433  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
434    
435  static BOOL  static BOOL
436    compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
437      int *, int *, branch_chain *, compile_data *);      int *, int *, branch_chain *, compile_data *, int *);
438    
439    
440    
441    /*************************************************
442    *            Find an error text                  *
443    *************************************************/
444    
445    /* The error texts are now all in one long string, to save on relocations. As
446    some of the text is of unknown length, we can't use a table of offsets.
447    Instead, just count through the strings. This is not a performance issue
448    because it happens only when there has been a compilation error.
449    
450    Argument:   the error number
451    Returns:    pointer to the error string
452    */
453    
454    static const char *
455    find_error_text(int n)
456    {
457    const char *s = error_texts;
458    for (; n > 0; n--) while (*s++ != 0) {};
459    return s;
460    }
461    
462    
463  /*************************************************  /*************************************************
# Line 357  static BOOL Line 466  static BOOL
466    
467  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
468  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
469  encodes one of the more complicated things such as \d. When UTF-8 is enabled,  encodes one of the more complicated things such as \d. A backreference to group
470  a positive value greater than 255 may be returned. On entry, ptr is pointing at  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
471  the \. On exit, it is on the final character of the escape sequence.  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
472    ptr is pointing at the \. On exit, it is on the final character of the escape
473    sequence.
474    
475  Arguments:  Arguments:
476    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
# Line 370  Arguments: Line 481  Arguments:
481    
482  Returns:         zero or positive => a data character  Returns:         zero or positive => a data character
483                   negative => a special escape sequence                   negative => a special escape sequence
484                   on error, errorptr is set                   on error, errorcodeptr is set
485  */  */
486    
487  static int  static int
# Line 388  ptr--;                            /* Set Line 499  ptr--;                            /* Set
499    
500  if (c == 0) *errorcodeptr = ERR1;  if (c == 0) *errorcodeptr = ERR1;
501    
502  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in  /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
503  a table. A non-zero result is something that can be returned immediately.  in a table. A non-zero result is something that can be returned immediately.
504  Otherwise further processing may be required. */  Otherwise further processing may be required. */
505    
506  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
507  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < '0' || c > 'z') {}                           /* Not alphanumeric */
508  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - '0']) != 0) c = i;
509    
510  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
511  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
512  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
513  #endif  #endif
514    
# Line 406  else if ((i = escapes[c - 0x48]) != 0) Line 517  else if ((i = escapes[c - 0x48]) != 0)
517  else  else
518    {    {
519    const uschar *oldptr;    const uschar *oldptr;
520      BOOL braced, negated;
521    
522    switch (c)    switch (c)
523      {      {
524      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
# Line 419  else Line 532  else
532      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
533      break;      break;
534    
535        /* \g must be followed by one of a number of specific things:
536    
537        (1) A number, either plain or braced. If positive, it is an absolute
538        backreference. If negative, it is a relative backreference. This is a Perl
539        5.10 feature.
540    
541        (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
542        is part of Perl's movement towards a unified syntax for back references. As
543        this is synonymous with \k{name}, we fudge it up by pretending it really
544        was \k.
545    
546        (3) For Oniguruma compatibility we also support \g followed by a name or a
547        number either in angle brackets or in single quotes. However, these are
548        (possibly recursive) subroutine calls, _not_ backreferences. Just return
549        the -ESC_g code (cf \k). */
550    
551        case 'g':
552        if (ptr[1] == '<' || ptr[1] == '\'')
553          {
554          c = -ESC_g;
555          break;
556          }
557    
558        /* Handle the Perl-compatible cases */
559    
560        if (ptr[1] == '{')
561          {
562          const uschar *p;
563          for (p = ptr+2; *p != 0 && *p != '}'; p++)
564            if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
565          if (*p != 0 && *p != '}')
566            {
567            c = -ESC_k;
568            break;
569            }
570          braced = TRUE;
571          ptr++;
572          }
573        else braced = FALSE;
574    
575        if (ptr[1] == '-')
576          {
577          negated = TRUE;
578          ptr++;
579          }
580        else negated = FALSE;
581    
582        c = 0;
583        while ((digitab[ptr[1]] & ctype_digit) != 0)
584          c = c * 10 + *(++ptr) - '0';
585    
586        if (c < 0)   /* Integer overflow */
587          {
588          *errorcodeptr = ERR61;
589          break;
590          }
591    
592        if (braced && *(++ptr) != '}')
593          {
594          *errorcodeptr = ERR57;
595          break;
596          }
597    
598        if (c == 0)
599          {
600          *errorcodeptr = ERR58;
601          break;
602          }
603    
604        if (negated)
605          {
606          if (c > bracount)
607            {
608            *errorcodeptr = ERR15;
609            break;
610            }
611          c = bracount - (c - 1);
612          }
613    
614        c = -(ESC_REF + c);
615        break;
616    
617      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
618      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
619      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 440  else Line 635  else
635        c -= '0';        c -= '0';
636        while ((digitab[ptr[1]] & ctype_digit) != 0)        while ((digitab[ptr[1]] & ctype_digit) != 0)
637          c = c * 10 + *(++ptr) - '0';          c = c * 10 + *(++ptr) - '0';
638          if (c < 0)    /* Integer overflow */
639            {
640            *errorcodeptr = ERR61;
641            break;
642            }
643        if (c < 10 || c <= bracount)        if (c < 10 || c <= bracount)
644          {          {
645          c = -(ESC_REF + c);          c = -(ESC_REF + c);
# Line 460  else Line 660  else
660        }        }
661    
662      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
663      larger first octal digit. */      larger first octal digit. The original code used just to take the least
664        significant 8 bits of octal numbers (I think this is what early Perls used
665        to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
666        than 3 octal digits. */
667    
668      case '0':      case '0':
669      c -= '0';      c -= '0';
670      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
671          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
672      c &= 255;     /* Take least significant 8 bits */      if (!utf8 && c > 255) *errorcodeptr = ERR51;
673      break;      break;
674    
675      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \x is complicated. \x{ddd} is a character number which can be greater
# Line 486  else Line 689  else
689          if (c == 0 && cc == '0') continue;     /* Leading zeroes */          if (c == 0 && cc == '0') continue;     /* Leading zeroes */
690          count++;          count++;
691    
692  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
693          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= 'a') cc -= 32;               /* Convert to upper case */
694          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
695  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
696          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
697          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
698  #endif  #endif
# Line 513  else Line 716  else
716        {        {
717        int cc;                               /* Some compilers don't like ++ */        int cc;                               /* Some compilers don't like ++ */
718        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                        /* in initializers */
719  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
720        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= 'a') cc -= 32;              /* Convert to upper case */
721        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
722  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
723        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= 'z') cc += 64;              /* Convert to upper case */
724        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
725  #endif  #endif
726        }        }
727      break;      break;
728    
729      /* Other special escapes not starting with a digit are straightforward */      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
730        This coding is ASCII-specific, but then the whole concept of \cx is
731        ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
732    
733      case 'c':      case 'c':
734      c = *(++ptr);      c = *(++ptr);
735      if (c == 0)      if (c == 0)
736        {        {
737        *errorcodeptr = ERR2;        *errorcodeptr = ERR2;
738        return 0;        break;
739        }        }
740    
741      /* A letter is upper-cased; then the 0x40 bit is flipped. This coding  #ifndef EBCDIC  /* ASCII coding */
     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.  
     (However, an EBCDIC equivalent has now been added.) */  
   
 #if !EBCDIC    /* ASCII coding */  
742      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= 'a' && c <= 'z') c -= 32;
743      c ^= 0x40;      c ^= 0x40;
744  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
745      if (c >= 'a' && c <= 'z') c += 64;      if (c >= 'a' && c <= 'z') c += 64;
746      c ^= 0xC0;      c ^= 0xC0;
747  #endif  #endif
748      break;      break;
749    
750      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
751      other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,      other alphanumeric following \ is an error if PCRE_EXTRA was set;
752      for Perl compatibility, it is a literal. This code looks a bit odd, but      otherwise, for Perl compatibility, it is a literal. This code looks a bit
753      there used to be some cases other than the default, and there may be again      odd, but there used to be some cases other than the default, and there may
754      in future, so I haven't "optimized" it. */      be again in future, so I haven't "optimized" it. */
755    
756      default:      default:
757      if ((options & PCRE_EXTRA) != 0) switch(c)      if ((options & PCRE_EXTRA) != 0) switch(c)
# Line 610  if (c == '{') Line 811  if (c == '{')
811      *negptr = TRUE;      *negptr = TRUE;
812      ptr++;      ptr++;
813      }      }
814    for (i = 0; i < sizeof(name) - 1; i++)    for (i = 0; i < (int)sizeof(name) - 1; i++)
815      {      {
816      c = *(++ptr);      c = *(++ptr);
817      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
# Line 639  top = _pcre_utt_size; Line 840  top = _pcre_utt_size;
840  while (bot < top)  while (bot < top)
841    {    {
842    i = (bot + top) >> 1;    i = (bot + top) >> 1;
843    c = strcmp(name, _pcre_utt[i].name);    c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
844    if (c == 0)    if (c == 0)
845      {      {
846      *dptr = _pcre_utt[i].value;      *dptr = _pcre_utt[i].value;
# Line 763  return p; Line 964  return p;
964    
965    
966  /*************************************************  /*************************************************
967    *       Find forward referenced subpattern       *
968    *************************************************/
969    
970    /* This function scans along a pattern's text looking for capturing
971    subpatterns, and counting them. If it finds a named pattern that matches the
972    name it is given, it returns its number. Alternatively, if the name is NULL, it
973    returns when it reaches a given numbered subpattern. This is used for forward
974    references to subpatterns. We know that if (?P< is encountered, the name will
975    be terminated by '>' because that is checked in the first pass.
976    
977    Arguments:
978      ptr          current position in the pattern
979      cd           compile background data
980      name         name to seek, or NULL if seeking a numbered subpattern
981      lorn         name length, or subpattern number if name is NULL
982      xmode        TRUE if we are in /x mode
983    
984    Returns:       the number of the named subpattern, or -1 if not found
985    */
986    
987    static int
988    find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn,
989      BOOL xmode)
990    {
991    const uschar *thisname;
992    int count = cd->bracount;
993    
994    for (; *ptr != 0; ptr++)
995      {
996      int term;
997    
998      /* Skip over backslashed characters and also entire \Q...\E */
999    
1000      if (*ptr == '\\')
1001        {
1002        if (*(++ptr) == 0) return -1;
1003        if (*ptr == 'Q') for (;;)
1004          {
1005          while (*(++ptr) != 0 && *ptr != '\\') {};
1006          if (*ptr == 0) return -1;
1007          if (*(++ptr) == 'E') break;
1008          }
1009        continue;
1010        }
1011    
1012      /* Skip over character classes; this logic must be similar to the way they
1013      are handled for real. If the first character is '^', skip it. Also, if the
1014      first few characters (either before or after ^) are \Q\E or \E we skip them
1015      too. This makes for compatibility with Perl. */
1016    
1017      if (*ptr == '[')
1018        {
1019        BOOL negate_class = FALSE;
1020        for (;;)
1021          {
1022          int c = *(++ptr);
1023          if (c == '\\')
1024            {
1025            if (ptr[1] == 'E') ptr++;
1026              else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
1027                else break;
1028            }
1029          else if (!negate_class && c == '^')
1030            negate_class = TRUE;
1031          else break;
1032          }
1033    
1034        /* If the next character is ']', it is a data character that must be
1035        skipped, except in JavaScript compatibility mode. */
1036    
1037        if (ptr[1] == ']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1038          ptr++;
1039    
1040        while (*(++ptr) != ']')
1041          {
1042          if (*ptr == 0) return -1;
1043          if (*ptr == '\\')
1044            {
1045            if (*(++ptr) == 0) return -1;
1046            if (*ptr == 'Q') for (;;)
1047              {
1048              while (*(++ptr) != 0 && *ptr != '\\') {};
1049              if (*ptr == 0) return -1;
1050              if (*(++ptr) == 'E') break;
1051              }
1052            continue;
1053            }
1054          }
1055        continue;
1056        }
1057    
1058      /* Skip comments in /x mode */
1059    
1060      if (xmode && *ptr == '#')
1061        {
1062        while (*(++ptr) != 0 && *ptr != '\n') {};
1063        if (*ptr == 0) return -1;
1064        continue;
1065        }
1066    
1067      /* An opening parens must now be a real metacharacter */
1068    
1069      if (*ptr != '(') continue;
1070      if (ptr[1] != '?' && ptr[1] != '*')
1071        {
1072        count++;
1073        if (name == NULL && count == lorn) return count;
1074        continue;
1075        }
1076    
1077      ptr += 2;
1078      if (*ptr == 'P') ptr++;                      /* Allow optional P */
1079    
1080      /* We have to disambiguate (?<! and (?<= from (?<name> */
1081    
1082      if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1083           *ptr != '\'')
1084        continue;
1085    
1086      count++;
1087    
1088      if (name == NULL && count == lorn) return count;
1089      term = *ptr++;
1090      if (term == '<') term = '>';
1091      thisname = ptr;
1092      while (*ptr != term) ptr++;
1093      if (name != NULL && lorn == ptr - thisname &&
1094          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1095        return count;
1096      }
1097    
1098    return -1;
1099    }
1100    
1101    
1102    
1103    /*************************************************
1104  *      Find first significant op code            *  *      Find first significant op code            *
1105  *************************************************/  *************************************************/
1106    
# Line 811  for (;;) Line 1149  for (;;)
1149    
1150      case OP_CALLOUT:      case OP_CALLOUT:
1151      case OP_CREF:      case OP_CREF:
1152      case OP_BRANUMBER:      case OP_RREF:
1153        case OP_DEF:
1154      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1155      break;      break;
1156    
# Line 856  for (;;) Line 1195  for (;;)
1195    {    {
1196    int d;    int d;
1197    register int op = *cc;    register int op = *cc;
   if (op >= OP_BRA) op = OP_BRA;  
   
1198    switch (op)    switch (op)
1199      {      {
1200        case OP_CBRA:
1201      case OP_BRA:      case OP_BRA:
1202      case OP_ONCE:      case OP_ONCE:
1203      case OP_COND:      case OP_COND:
1204      d = find_fixedlength(cc, options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1205      if (d < 0) return d;      if (d < 0) return d;
1206      branchlength += d;      branchlength += d;
1207      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 898  for (;;) Line 1236  for (;;)
1236      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1237    
1238      case OP_REVERSE:      case OP_REVERSE:
     case OP_BRANUMBER:  
1239      case OP_CREF:      case OP_CREF:
1240        case OP_RREF:
1241        case OP_DEF:
1242      case OP_OPT:      case OP_OPT:
1243      case OP_CALLOUT:      case OP_CALLOUT:
1244      case OP_SOD:      case OP_SOD:
# Line 917  for (;;) Line 1256  for (;;)
1256    
1257      case OP_CHAR:      case OP_CHAR:
1258      case OP_CHARNC:      case OP_CHARNC:
1259        case OP_NOT:
1260      branchlength++;      branchlength++;
1261      cc += 2;      cc += 2;
1262  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 943  for (;;) Line 1283  for (;;)
1283    
1284      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1285      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1286        if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1287      cc += 4;      cc += 4;
1288      break;      break;
1289    
# Line 960  for (;;) Line 1301  for (;;)
1301      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
1302      case OP_WORDCHAR:      case OP_WORDCHAR:
1303      case OP_ANY:      case OP_ANY:
1304        case OP_ALLANY:
1305      branchlength++;      branchlength++;
1306      cc++;      cc++;
1307      break;      break;
# Line 1031  Returns:      pointer to the opcode for Line 1373  Returns:      pointer to the opcode for
1373  static const uschar *  static const uschar *
1374  find_bracket(const uschar *code, BOOL utf8, int number)  find_bracket(const uschar *code, BOOL utf8, int number)
1375  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1376  for (;;)  for (;;)
1377    {    {
1378    register int c = *code;    register int c = *code;
1379    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1380    else if (c > OP_BRA)  
1381      /* XCLASS is used for classes that cannot be represented just by a bit
1382      map. This includes negated single high-valued characters. The length in
1383      the table is zero; the actual length is stored in the compiled code. */
1384    
1385      if (c == OP_XCLASS) code += GET(code, 1);
1386    
1387      /* Handle capturing bracket */
1388    
1389      else if (c == OP_CBRA)
1390      {      {
1391      int n = c - OP_BRA;      int n = GET2(code, 1+LINK_SIZE);
     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);  
1392      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1393      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[c];
1394      }      }
1395    
1396      /* Otherwise, we can get the item's length from the table, except that for
1397      repeated character types, we have to test for \p and \P, which have an extra
1398      two bytes of parameters. */
1399    
1400    else    else
1401      {      {
1402      code += _pcre_OP_lengths[c];      switch(c)
1403          {
1404          case OP_TYPESTAR:
1405          case OP_TYPEMINSTAR:
1406          case OP_TYPEPLUS:
1407          case OP_TYPEMINPLUS:
1408          case OP_TYPEQUERY:
1409          case OP_TYPEMINQUERY:
1410          case OP_TYPEPOSSTAR:
1411          case OP_TYPEPOSPLUS:
1412          case OP_TYPEPOSQUERY:
1413          if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1414          break;
1415    
1416  #ifdef SUPPORT_UTF8        case OP_TYPEUPTO:
1417          case OP_TYPEMINUPTO:
1418          case OP_TYPEEXACT:
1419          case OP_TYPEPOSUPTO:
1420          if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1421          break;
1422          }
1423    
1424      /* In UTF-8 mode, opcodes that are followed by a character may be followed      /* Add in the fixed length from the table */
1425      by a multi-byte character. The length in the table is a minimum, so we have  
1426      to scan along to skip the extra bytes. All opcodes are less than 128, so we      code += _pcre_OP_lengths[c];
1427      can use relatively efficient code. */  
1428      /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1429      a multi-byte character. The length in the table is a minimum, so we have to
1430      arrange to skip the extra bytes. */
1431    
1432    #ifdef SUPPORT_UTF8
1433      if (utf8) switch(c)      if (utf8) switch(c)
1434        {        {
1435        case OP_CHAR:        case OP_CHAR:
# Line 1064  for (;;) Line 1437  for (;;)
1437        case OP_EXACT:        case OP_EXACT:
1438        case OP_UPTO:        case OP_UPTO:
1439        case OP_MINUPTO:        case OP_MINUPTO:
1440          case OP_POSUPTO:
1441        case OP_STAR:        case OP_STAR:
1442        case OP_MINSTAR:        case OP_MINSTAR:
1443          case OP_POSSTAR:
1444        case OP_PLUS:        case OP_PLUS:
1445        case OP_MINPLUS:        case OP_MINPLUS:
1446          case OP_POSPLUS:
1447        case OP_QUERY:        case OP_QUERY:
1448        case OP_MINQUERY:        case OP_MINQUERY:
1449        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1450        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1451        break;        break;
1452        }        }
1453    #else
1454        (void)(utf8);  /* Keep compiler happy by referencing function argument */
1455  #endif  #endif
1456      }      }
1457    }    }
# Line 1105  Returns:      pointer to the opcode for Line 1476  Returns:      pointer to the opcode for
1476  static const uschar *  static const uschar *
1477  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const uschar *code, BOOL utf8)
1478  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1479  for (;;)  for (;;)
1480    {    {
1481    register int c = *code;    register int c = *code;
1482    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1483    else if (c == OP_RECURSE) return code;    if (c == OP_RECURSE) return code;
1484    else if (c > OP_BRA)  
1485      {    /* XCLASS is used for classes that cannot be represented just by a bit
1486      code += _pcre_OP_lengths[OP_BRA];    map. This includes negated single high-valued characters. The length in
1487      }    the table is zero; the actual length is stored in the compiled code. */
1488    
1489      if (c == OP_XCLASS) code += GET(code, 1);
1490    
1491      /* Otherwise, we can get the item's length from the table, except that for
1492      repeated character types, we have to test for \p and \P, which have an extra
1493      two bytes of parameters. */
1494    
1495    else    else
1496      {      {
1497      code += _pcre_OP_lengths[c];      switch(c)
1498          {
1499          case OP_TYPESTAR:
1500          case OP_TYPEMINSTAR:
1501          case OP_TYPEPLUS:
1502          case OP_TYPEMINPLUS:
1503          case OP_TYPEQUERY:
1504          case OP_TYPEMINQUERY:
1505          case OP_TYPEPOSSTAR:
1506          case OP_TYPEPOSPLUS:
1507          case OP_TYPEPOSQUERY:
1508          if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1509          break;
1510    
1511  #ifdef SUPPORT_UTF8        case OP_TYPEPOSUPTO:
1512          case OP_TYPEUPTO:
1513          case OP_TYPEMINUPTO:
1514          case OP_TYPEEXACT:
1515          if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1516          break;
1517          }
1518    
1519        /* Add in the fixed length from the table */
1520    
1521        code += _pcre_OP_lengths[c];
1522    
1523      /* In UTF-8 mode, opcodes that are followed by a character may be followed      /* In UTF-8 mode, opcodes that are followed by a character may be followed
1524      by a multi-byte character. The length in the table is a minimum, so we have      by a multi-byte character. The length in the table is a minimum, so we have
1525      to scan along to skip the extra bytes. All opcodes are less than 128, so we      to arrange to skip the extra bytes. */
     can use relatively efficient code. */  
1526    
1527    #ifdef SUPPORT_UTF8
1528      if (utf8) switch(c)      if (utf8) switch(c)
1529        {        {
1530        case OP_CHAR:        case OP_CHAR:
# Line 1136  for (;;) Line 1532  for (;;)
1532        case OP_EXACT:        case OP_EXACT:
1533        case OP_UPTO:        case OP_UPTO:
1534        case OP_MINUPTO:        case OP_MINUPTO:
1535          case OP_POSUPTO:
1536        case OP_STAR:        case OP_STAR:
1537        case OP_MINSTAR:        case OP_MINSTAR:
1538          case OP_POSSTAR:
1539        case OP_PLUS:        case OP_PLUS:
1540        case OP_MINPLUS:        case OP_MINPLUS:
1541          case OP_POSPLUS:
1542        case OP_QUERY:        case OP_QUERY:
1543        case OP_MINQUERY:        case OP_MINQUERY:
1544        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1545        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1546        break;        break;
1547        }        }
1548    #else
1549        (void)(utf8);  /* Keep compiler happy by referencing function argument */
1550  #endif  #endif
1551      }      }
1552    }    }
# Line 1165  for (;;) Line 1559  for (;;)
1559  *************************************************/  *************************************************/
1560    
1561  /* This function scans through a branch of a compiled pattern to see whether it  /* This function scans through a branch of a compiled pattern to see whether it
1562  can match the empty string or not. It is called only from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1563  below. Note that first_significant_code() skips over assertions. If we hit an  below and from compile_branch() when checking for an unlimited repeat of a
1564  unclosed bracket, we return "empty" - this means we've struck an inner bracket  group that can match nothing. Note that first_significant_code() skips over
1565  whose current branch will already have been scanned.  backward and negative forward assertions when its final argument is TRUE. If we
1566    hit an unclosed bracket, we return "empty" - this means we've struck an inner
1567    bracket whose current branch will already have been scanned.
1568    
1569  Arguments:  Arguments:
1570    code        points to start of search    code        points to start of search
# Line 1182  static BOOL Line 1578  static BOOL
1578  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1579  {  {
1580  register int c;  register int c;
1581  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1582       code < endcode;       code < endcode;
1583       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1584    {    {
# Line 1190  for (code = first_significant_code(code Line 1586  for (code = first_significant_code(code
1586    
1587    c = *code;    c = *code;
1588    
1589    if (c >= OP_BRA)    /* Skip over forward assertions; the other assertions are skipped by
1590      first_significant_code() with a TRUE final argument. */
1591    
1592      if (c == OP_ASSERT)
1593        {
1594        do code += GET(code, 1); while (*code == OP_ALT);
1595        c = *code;
1596        continue;
1597        }
1598    
1599      /* Groups with zero repeats can of course be empty; skip them. */
1600    
1601      if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1602        {
1603        code += _pcre_OP_lengths[c];
1604        do code += GET(code, 1); while (*code == OP_ALT);
1605        c = *code;
1606        continue;
1607        }
1608    
1609      /* For other groups, scan the branches. */
1610    
1611      if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1612      {      {
1613      BOOL empty_branch;      BOOL empty_branch;
1614      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1206  for (code = first_significant_code(code Line 1624  for (code = first_significant_code(code
1624        }        }
1625      while (*code == OP_ALT);      while (*code == OP_ALT);
1626      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
     code += 1 + LINK_SIZE;  
1627      c = *code;      c = *code;
1628        continue;
1629      }      }
1630    
1631    else switch (c)    /* Handle the other opcodes */
1632    
1633      switch (c)
1634      {      {
1635      /* Check for quantifiers after a class */      /* Check for quantifiers after a class. XCLASS is used for classes that
1636        cannot be represented just by a bit map. This includes negated single
1637        high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1638        actual length is stored in the compiled code, so we must update "code"
1639        here. */
1640    
1641  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1642      case OP_XCLASS:      case OP_XCLASS:
1643      ccode = code + GET(code, 1);      ccode = code += GET(code, 1);
1644      goto CHECK_CLASS_REPEAT;      goto CHECK_CLASS_REPEAT;
1645  #endif  #endif
1646    
# Line 1260  for (code = first_significant_code(code Line 1684  for (code = first_significant_code(code
1684      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
1685      case OP_WORDCHAR:      case OP_WORDCHAR:
1686      case OP_ANY:      case OP_ANY:
1687        case OP_ALLANY:
1688      case OP_ANYBYTE:      case OP_ANYBYTE:
1689      case OP_CHAR:      case OP_CHAR:
1690      case OP_CHARNC:      case OP_CHARNC:
1691      case OP_NOT:      case OP_NOT:
1692      case OP_PLUS:      case OP_PLUS:
1693      case OP_MINPLUS:      case OP_MINPLUS:
1694        case OP_POSPLUS:
1695      case OP_EXACT:      case OP_EXACT:
1696      case OP_NOTPLUS:      case OP_NOTPLUS:
1697      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
1698        case OP_NOTPOSPLUS:
1699      case OP_NOTEXACT:      case OP_NOTEXACT:
1700      case OP_TYPEPLUS:      case OP_TYPEPLUS:
1701      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
1702        case OP_TYPEPOSPLUS:
1703      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1704      return FALSE;      return FALSE;
1705    
1706        /* These are going to continue, as they may be empty, but we have to
1707        fudge the length for the \p and \P cases. */
1708    
1709        case OP_TYPESTAR:
1710        case OP_TYPEMINSTAR:
1711        case OP_TYPEPOSSTAR:
1712        case OP_TYPEQUERY:
1713        case OP_TYPEMINQUERY:
1714        case OP_TYPEPOSQUERY:
1715        if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1716        break;
1717    
1718        /* Same for these */
1719    
1720        case OP_TYPEUPTO:
1721        case OP_TYPEMINUPTO:
1722        case OP_TYPEPOSUPTO:
1723        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1724        break;
1725    
1726      /* End of branch */      /* End of branch */
1727    
1728      case OP_KET:      case OP_KET:
# Line 1283  for (code = first_significant_code(code Line 1731  for (code = first_significant_code(code
1731      case OP_ALT:      case OP_ALT:
1732      return TRUE;      return TRUE;
1733    
1734      /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1735      followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
1736    
1737  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1738      case OP_STAR:      case OP_STAR:
1739      case OP_MINSTAR:      case OP_MINSTAR:
1740        case OP_POSSTAR:
1741      case OP_QUERY:      case OP_QUERY:
1742      case OP_MINQUERY:      case OP_MINQUERY:
1743        case OP_POSQUERY:
1744      case OP_UPTO:      case OP_UPTO:
1745      case OP_MINUPTO:      case OP_MINUPTO:
1746        case OP_POSUPTO:
1747      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1748      break;      break;
1749  #endif  #endif
# Line 1341  return TRUE; Line 1792  return TRUE;
1792  *************************************************/  *************************************************/
1793    
1794  /* This function is called when the sequence "[:" or "[." or "[=" is  /* This function is called when the sequence "[:" or "[." or "[=" is
1795  encountered in a character class. It checks whether this is followed by an  encountered in a character class. It checks whether this is followed by a
1796  optional ^ and then a sequence of letters, terminated by a matching ":]" or  sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1797  ".]" or "=]".  reach an unescaped ']' without the special preceding character, return FALSE.
1798    
1799    Originally, this function only recognized a sequence of letters between the
1800    terminators, but it seems that Perl recognizes any sequence of characters,
1801    though of course unknown POSIX names are subsequently rejected. Perl gives an
1802    "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1803    didn't consider this to be a POSIX class. Likewise for [:1234:].
1804    
1805    The problem in trying to be exactly like Perl is in the handling of escapes. We
1806    have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1807    class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1808    below handles the special case of \], but does not try to do any other escape
1809    processing. This makes it different from Perl for cases such as [:l\ower:]
1810    where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1811    "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
1812    I think.
1813    
1814  Argument:  Arguments:
1815    ptr      pointer to the initial [    ptr      pointer to the initial [
1816    endptr   where to return the end pointer    endptr   where to return the end pointer
   cd       pointer to compile data  
1817    
1818  Returns:   TRUE or FALSE  Returns:   TRUE or FALSE
1819  */  */
1820    
1821  static BOOL  static BOOL
1822  check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)  check_posix_syntax(const uschar *ptr, const uschar **endptr)
1823  {  {
1824  int terminator;          /* Don't combine these lines; the Solaris cc */  int terminator;          /* Don't combine these lines; the Solaris cc */
1825  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1826  if (*(++ptr) == '^') ptr++;  for (++ptr; *ptr != 0; ptr++)
 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;  
 if (*ptr == terminator && ptr[1] == ']')  
1827    {    {
1828    *endptr = ptr;    if (*ptr == '\\' && ptr[1] == ']') ptr++; else
1829    return TRUE;      {
1830        if (*ptr == ']') return FALSE;
1831        if (*ptr == terminator && ptr[1] == ']')
1832          {
1833          *endptr = ptr;
1834          return TRUE;
1835          }
1836        }
1837    }    }
1838  return FALSE;  return FALSE;
1839  }  }
# Line 1388  Returns:     a value representing the na Line 1858  Returns:     a value representing the na
1858  static int  static int
1859  check_posix_name(const uschar *ptr, int len)  check_posix_name(const uschar *ptr, int len)
1860  {  {
1861    const char *pn = posix_names;
1862  register int yield = 0;  register int yield = 0;
1863  while (posix_name_lengths[yield] != 0)  while (posix_name_lengths[yield] != 0)
1864    {    {
1865    if (len == posix_name_lengths[yield] &&    if (len == posix_name_lengths[yield] &&
1866      strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;      strncmp((const char *)ptr, pn, len) == 0) return yield;
1867      pn += posix_name_lengths[yield] + 1;
1868    yield++;    yield++;
1869    }    }
1870  return -1;  return -1;
# Line 1407  return -1; Line 1879  return -1;
1879  that is referenced. This means that groups can be replicated for fixed  that is referenced. This means that groups can be replicated for fixed
1880  repetition simply by copying (because the recursion is allowed to refer to  repetition simply by copying (because the recursion is allowed to refer to
1881  earlier groups that are outside the current group). However, when a group is  earlier groups that are outside the current group). However, when a group is
1882  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
1883  it, after it has been compiled. This means that any OP_RECURSE items within it  inserted before it, after it has been compiled. This means that any OP_RECURSE
1884  that refer to the group itself or any contained groups have to have their  items within it that refer to the group itself or any contained groups have to
1885  offsets adjusted. That is the job of this function. Before it is called, the  have their offsets adjusted. That is one of the jobs of this function. Before it
1886  partially compiled regex must be temporarily terminated with OP_END.  is called, the partially compiled regex must be temporarily terminated with
1887    OP_END.
1888    
1889    This function has been extended with the possibility of forward references for
1890    recursions and subroutine calls. It must also check the list of such references
1891    for the group we are dealing with. If it finds that one of the recursions in
1892    the current group is on this list, it adjusts the offset in the list, not the
1893    value in the reference (which is a group number).
1894    
1895  Arguments:  Arguments:
1896    group      points to the start of the group    group      points to the start of the group
1897    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
1898    utf8       TRUE in UTF-8 mode    utf8       TRUE in UTF-8 mode
1899    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
1900      save_hwm   the hwm forward reference pointer at the start of the group
1901    
1902  Returns:     nothing  Returns:     nothing
1903  */  */
1904    
1905  static void  static void
1906  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1907      uschar *save_hwm)
1908  {  {
1909  uschar *ptr = group;  uschar *ptr = group;
1910    
1911  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1912    {    {
1913    int offset = GET(ptr, 1);    int offset;
1914    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);    uschar *hc;
1915    
1916      /* See if this recursion is on the forward reference list. If so, adjust the
1917      reference. */
1918    
1919      for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1920        {
1921        offset = GET(hc, 0);
1922        if (cd->start_code + offset == ptr + 1)
1923          {
1924          PUT(hc, 0, offset + adjust);
1925          break;
1926          }
1927        }
1928    
1929      /* Otherwise, adjust the recursion offset if it's after the start of this
1930      group. */
1931    
1932      if (hc >= cd->hwm)
1933        {
1934        offset = GET(ptr, 1);
1935        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1936        }
1937    
1938    ptr += 1 + LINK_SIZE;    ptr += 1 + LINK_SIZE;
1939    }    }
1940  }  }
# Line 1508  Yield:        TRUE when range returned; Line 2013  Yield:        TRUE when range returned;
2013  */  */
2014    
2015  static BOOL  static BOOL
2016  get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2017      unsigned int *odptr)
2018  {  {
2019  int c, othercase, next;  unsigned int c, othercase, next;
2020    
2021  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
2022    { if ((othercase = _pcre_ucp_othercase(c)) >= 0) break; }    { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2023    
2024  if (c > d) return FALSE;  if (c > d) return FALSE;
2025    
# Line 1522  next = othercase + 1; Line 2028  next = othercase + 1;
2028    
2029  for (++c; c <= d; c++)  for (++c; c <= d; c++)
2030    {    {
2031    if (_pcre_ucp_othercase(c) != next) break;    if (UCD_OTHERCASE(c) != next) break;
2032    next++;    next++;
2033    }    }
2034    
# Line 1534  return TRUE; Line 2040  return TRUE;
2040  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2041    
2042    
2043    
2044  /*************************************************  /*************************************************
2045  *           Compile one branch                   *  *     Check if auto-possessifying is possible    *
2046  *************************************************/  *************************************************/
2047    
2048  /* Scan the pattern, compiling it into the code vector. If the options are  /* This function is called for unlimited repeats of certain items, to see
2049  changed during the branch, the pointer is used to change the external options  whether the next thing could possibly match the repeated item. If not, it makes
2050  bits.  sense to automatically possessify the repeated item.
2051    
2052  Arguments:  Arguments:
2053    optionsptr     pointer to the option bits    op_code       the repeated op code
2054    brackets       points to number of extracting brackets used    this          data for this item, depends on the opcode
2055    codeptr        points to the pointer to the current code point    utf8          TRUE in UTF-8 mode
2056    ptrptr         points to the current pattern pointer    utf8_char     used for utf8 character bytes, NULL if not relevant
2057    errorcodeptr   points to error code variable    ptr           next character in pattern
2058    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    options       options bits
2059    reqbyteptr     set to the last literal character required, else < 0    cd            contains pointers to tables etc.
   bcptr          points to current branch chain  
   cd             contains pointers to tables etc.  
2060    
2061  Returns:         TRUE on success  Returns:        TRUE if possessifying is wanted
                  FALSE, with *errorcodeptr set non-zero on error  
2062  */  */
2063    
2064  static BOOL  static BOOL
2065  compile_branch(int *optionsptr, int *brackets, uschar **codeptr,  check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2066    const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,    const uschar *ptr, int options, compile_data *cd)
   int *reqbyteptr, branch_chain *bcptr, compile_data *cd)  
2067  {  {
2068  int repeat_type, op_type;  int next;
2069  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  
2070  int bravalue = 0;  /* Skip whitespace and comments in extended mode */
2071  int greedy_default, greedy_non_default;  
2072  int firstbyte, reqbyte;  if ((options & PCRE_EXTENDED) != 0)
2073  int zeroreqbyte, zerofirstbyte;    {
2074  int req_caseopt, reqvary, tempreqvary;    for (;;)
2075  int condcount = 0;      {
2076  int options = *optionsptr;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2077  int after_manual_callout = 0;      if (*ptr == '#')
2078  register int c;        {
2079  register uschar *code = *codeptr;        while (*(++ptr) != 0)
2080  uschar *tempcode;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2081  BOOL inescq = FALSE;        }
2082        else break;
2083        }
2084      }
2085    
2086    /* If the next item is one that we can handle, get its value. A non-negative
2087    value is a character, a negative value is an escape value. */
2088    
2089    if (*ptr == '\\')
2090      {
2091      int temperrorcode = 0;
2092      next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2093      if (temperrorcode != 0) return FALSE;
2094      ptr++;    /* Point after the escape sequence */
2095      }
2096    
2097    else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2098      {
2099    #ifdef SUPPORT_UTF8
2100      if (utf8) { GETCHARINC(next, ptr); } else
2101    #endif
2102      next = *ptr++;
2103      }
2104    
2105    else return FALSE;
2106    
2107    /* Skip whitespace and comments in extended mode */
2108    
2109    if ((options & PCRE_EXTENDED) != 0)
2110      {
2111      for (;;)
2112        {
2113        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2114        if (*ptr == '#')
2115          {
2116          while (*(++ptr) != 0)
2117            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2118          }
2119        else break;
2120        }
2121      }
2122    
2123    /* If the next thing is itself optional, we have to give up. */
2124    
2125    if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2126      return FALSE;
2127    
2128    /* Now compare the next item with the previous opcode. If the previous is a
2129    positive single character match, "item" either contains the character or, if
2130    "item" is greater than 127 in utf8 mode, the character's bytes are in
2131    utf8_char. */
2132    
2133    
2134    /* Handle cases when the next item is a character. */
2135    
2136    if (next >= 0) switch(op_code)
2137      {
2138      case OP_CHAR:
2139    #ifdef SUPPORT_UTF8
2140      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2141    #else
2142      (void)(utf8_char);  /* Keep compiler happy by referencing function argument */
2143    #endif
2144      return item != next;
2145    
2146      /* For CHARNC (caseless character) we must check the other case. If we have
2147      Unicode property support, we can use it to test the other case of
2148      high-valued characters. */
2149    
2150      case OP_CHARNC:
2151    #ifdef SUPPORT_UTF8
2152      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2153    #endif
2154      if (item == next) return FALSE;
2155    #ifdef SUPPORT_UTF8
2156      if (utf8)
2157        {
2158        unsigned int othercase;
2159        if (next < 128) othercase = cd->fcc[next]; else
2160    #ifdef SUPPORT_UCP
2161        othercase = UCD_OTHERCASE((unsigned int)next);
2162    #else
2163        othercase = NOTACHAR;
2164    #endif
2165        return (unsigned int)item != othercase;
2166        }
2167      else
2168    #endif  /* SUPPORT_UTF8 */
2169      return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
2170    
2171      /* For OP_NOT, "item" must be a single-byte character. */
2172    
2173      case OP_NOT:
2174      if (item == next) return TRUE;
2175      if ((options & PCRE_CASELESS) == 0) return FALSE;
2176    #ifdef SUPPORT_UTF8
2177      if (utf8)
2178        {
2179        unsigned int othercase;
2180        if (next < 128) othercase = cd->fcc[next]; else
2181    #ifdef SUPPORT_UCP
2182        othercase = UCD_OTHERCASE(next);
2183    #else
2184        othercase = NOTACHAR;
2185    #endif
2186        return (unsigned int)item == othercase;
2187        }
2188      else
2189    #endif  /* SUPPORT_UTF8 */
2190      return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
2191    
2192      case OP_DIGIT:
2193      return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2194    
2195      case OP_NOT_DIGIT:
2196      return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2197    
2198      case OP_WHITESPACE:
2199      return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2200    
2201      case OP_NOT_WHITESPACE:
2202      return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2203    
2204      case OP_WORDCHAR:
2205      return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2206    
2207      case OP_NOT_WORDCHAR:
2208      return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2209    
2210      case OP_HSPACE:
2211      case OP_NOT_HSPACE:
2212      switch(next)
2213        {
2214        case 0x09:
2215        case 0x20:
2216        case 0xa0:
2217        case 0x1680:
2218        case 0x180e:
2219        case 0x2000:
2220        case 0x2001:
2221        case 0x2002:
2222        case 0x2003:
2223        case 0x2004:
2224        case 0x2005:
2225        case 0x2006:
2226        case 0x2007:
2227        case 0x2008:
2228        case 0x2009:
2229        case 0x200A:
2230        case 0x202f:
2231        case 0x205f:
2232        case 0x3000:
2233        return op_code != OP_HSPACE;
2234        default:
2235        return op_code == OP_HSPACE;
2236        }
2237    
2238      case OP_VSPACE:
2239      case OP_NOT_VSPACE:
2240      switch(next)
2241        {
2242        case 0x0a:
2243        case 0x0b:
2244        case 0x0c:
2245        case 0x0d:
2246        case 0x85:
2247        case 0x2028:
2248        case 0x2029:
2249        return op_code != OP_VSPACE;
2250        default:
2251        return op_code == OP_VSPACE;
2252        }
2253    
2254      default:
2255      return FALSE;
2256      }
2257    
2258    
2259    /* Handle the case when the next item is \d, \s, etc. */
2260    
2261    switch(op_code)
2262      {
2263      case OP_CHAR:
2264      case OP_CHARNC:
2265    #ifdef SUPPORT_UTF8
2266      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2267    #endif
2268      switch(-next)
2269        {
2270        case ESC_d:
2271        return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2272    
2273        case ESC_D:
2274        return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2275    
2276        case ESC_s:
2277        return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2278    
2279        case ESC_S:
2280        return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2281    
2282        case ESC_w:
2283        return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2284    
2285        case ESC_W:
2286        return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2287    
2288        case ESC_h:
2289        case ESC_H:
2290        switch(item)
2291          {
2292          case 0x09:
2293          case 0x20:
2294          case 0xa0:
2295          case 0x1680:
2296          case 0x180e:
2297          case 0x2000:
2298          case 0x2001:
2299          case 0x2002:
2300          case 0x2003:
2301          case 0x2004:
2302          case 0x2005:
2303          case 0x2006:
2304          case 0x2007:
2305          case 0x2008:
2306          case 0x2009:
2307          case 0x200A:
2308          case 0x202f:
2309          case 0x205f:
2310          case 0x3000:
2311          return -next != ESC_h;
2312          default:
2313          return -next == ESC_h;
2314          }
2315    
2316        case ESC_v:
2317        case ESC_V:
2318        switch(item)
2319          {
2320          case 0x0a:
2321          case 0x0b:
2322          case 0x0c:
2323          case 0x0d:
2324          case 0x85:
2325          case 0x2028:
2326          case 0x2029:
2327          return -next != ESC_v;
2328          default:
2329          return -next == ESC_v;
2330          }
2331    
2332        default:
2333        return FALSE;
2334        }
2335    
2336      case OP_DIGIT:
2337      return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2338             next == -ESC_h || next == -ESC_v;
2339    
2340      case OP_NOT_DIGIT:
2341      return next == -ESC_d;
2342    
2343      case OP_WHITESPACE:
2344      return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2345    
2346      case OP_NOT_WHITESPACE:
2347      return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2348    
2349      case OP_HSPACE:
2350      return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2351    
2352      case OP_NOT_HSPACE:
2353      return next == -ESC_h;
2354    
2355      /* Can't have \S in here because VT matches \S (Perl anomaly) */
2356      case OP_VSPACE:
2357      return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2358    
2359      case OP_NOT_VSPACE:
2360      return next == -ESC_v;
2361    
2362      case OP_WORDCHAR:
2363      return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2364    
2365      case OP_NOT_WORDCHAR:
2366      return next == -ESC_w || next == -ESC_d;
2367    
2368      default:
2369      return FALSE;
2370      }
2371    
2372    /* Control does not reach here */
2373    }
2374    
2375    
2376    
2377    /*************************************************
2378    *           Compile one branch                   *
2379    *************************************************/
2380    
2381    /* Scan the pattern, compiling it into the code vector. If the options are
2382    changed during the branch, the pointer is used to change the external options
2383    bits. This function is used during the pre-compile phase when we are trying
2384    to find out the amount of memory needed, as well as during the real compile
2385    phase. The value of lengthptr distinguishes the two phases.
2386    
2387    Arguments:
2388      optionsptr     pointer to the option bits
2389      codeptr        points to the pointer to the current code point
2390      ptrptr         points to the current pattern pointer
2391      errorcodeptr   points to error code variable
2392      firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2393      reqbyteptr     set to the last literal character required, else < 0
2394      bcptr          points to current branch chain
2395      cd             contains pointers to tables etc.
2396      lengthptr      NULL during the real compile phase
2397                     points to length accumulator during pre-compile phase
2398    
2399    Returns:         TRUE on success
2400                     FALSE, with *errorcodeptr set non-zero on error
2401    */
2402    
2403    static BOOL
2404    compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2405      int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2406      compile_data *cd, int *lengthptr)
2407    {
2408    int repeat_type, op_type;
2409    int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2410    int bravalue = 0;
2411    int greedy_default, greedy_non_default;
2412    int firstbyte, reqbyte;
2413    int zeroreqbyte, zerofirstbyte;
2414    int req_caseopt, reqvary, tempreqvary;
2415    int options = *optionsptr;
2416    int after_manual_callout = 0;
2417    int length_prevgroup = 0;
2418    register int c;
2419    register uschar *code = *codeptr;
2420    uschar *last_code = code;
2421    uschar *orig_code = code;
2422    uschar *tempcode;
2423    BOOL inescq = FALSE;
2424  BOOL groupsetfirstbyte = FALSE;  BOOL groupsetfirstbyte = FALSE;
2425  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
2426  const uschar *tempptr;  const uschar *tempptr;
2427  uschar *previous = NULL;  uschar *previous = NULL;
2428  uschar *previous_callout = NULL;  uschar *previous_callout = NULL;
2429    uschar *save_hwm = NULL;
2430  uschar classbits[32];  uschar classbits[32];
2431    
2432  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2433  BOOL class_utf8;  BOOL class_utf8;
2434  BOOL utf8 = (options & PCRE_UTF8) != 0;  BOOL utf8 = (options & PCRE_UTF8) != 0;
2435  uschar *class_utf8data;  uschar *class_utf8data;
2436    uschar *class_utf8data_base;
2437  uschar utf8_char[6];  uschar utf8_char[6];
2438  #else  #else
2439  BOOL utf8 = FALSE;  BOOL utf8 = FALSE;
2440    uschar *utf8_char = NULL;
2441    #endif
2442    
2443    #ifdef DEBUG
2444    if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2445  #endif  #endif
2446    
2447  /* Set up the default and non-default settings for greediness */  /* Set up the default and non-default settings for greediness */
# Line 1621  req_caseopt = ((options & PCRE_CASELESS) Line 2473  req_caseopt = ((options & PCRE_CASELESS)
2473  for (;; ptr++)  for (;; ptr++)
2474    {    {
2475    BOOL negate_class;    BOOL negate_class;
2476      BOOL should_flip_negation;
2477    BOOL possessive_quantifier;    BOOL possessive_quantifier;
2478    BOOL is_quantifier;    BOOL is_quantifier;
2479      BOOL is_recurse;
2480      BOOL reset_bracount;
2481    int class_charcount;    int class_charcount;
2482    int class_lastchar;    int class_lastchar;
2483    int newoptions;    int newoptions;
2484    int recno;    int recno;
2485      int refsign;
2486    int skipbytes;    int skipbytes;
2487    int subreqbyte;    int subreqbyte;
2488    int subfirstbyte;    int subfirstbyte;
2489      int terminator;
2490    int mclength;    int mclength;
2491    uschar mcbuffer[8];    uschar mcbuffer[8];
2492    
2493    /* Next byte in the pattern */    /* Get next byte in the pattern */
2494    
2495    c = *ptr;    c = *ptr;
2496    
2497      /* If we are in the pre-compile phase, accumulate the length used for the
2498      previous cycle of this loop. */
2499    
2500      if (lengthptr != NULL)
2501        {
2502    #ifdef DEBUG
2503        if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2504    #endif
2505        if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2506          {
2507          *errorcodeptr = ERR52;
2508          goto FAILED;
2509          }
2510    
2511        /* There is at least one situation where code goes backwards: this is the
2512        case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2513        the class is simply eliminated. However, it is created first, so we have to
2514        allow memory for it. Therefore, don't ever reduce the length at this point.
2515        */
2516    
2517        if (code < last_code) code = last_code;
2518    
2519        /* Paranoid check for integer overflow */
2520    
2521        if (OFLOW_MAX - *lengthptr < code - last_code)
2522          {
2523          *errorcodeptr = ERR20;
2524          goto FAILED;
2525          }
2526    
2527        *lengthptr += code - last_code;
2528        DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2529    
2530        /* If "previous" is set and it is not at the start of the work space, move
2531        it back to there, in order to avoid filling up the work space. Otherwise,
2532        if "previous" is NULL, reset the current code pointer to the start. */
2533    
2534        if (previous != NULL)
2535          {
2536          if (previous > orig_code)
2537            {
2538            memmove(orig_code, previous, code - previous);
2539            code -= previous - orig_code;
2540            previous = orig_code;
2541            }
2542          }
2543        else code = orig_code;
2544    
2545        /* Remember where this code item starts so we can pick up the length
2546        next time round. */
2547    
2548        last_code = code;
2549        }
2550    
2551      /* In the real compile phase, just check the workspace used by the forward
2552      reference list. */
2553    
2554      else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2555        {
2556        *errorcodeptr = ERR52;
2557        goto FAILED;
2558        }
2559    
2560    /* If in \Q...\E, check for the end; if not, we have a literal */    /* If in \Q...\E, check for the end; if not, we have a literal */
2561    
2562    if (inescq && c != 0)    if (inescq && c != 0)
# Line 1651  for (;; ptr++) Line 2571  for (;; ptr++)
2571        {        {
2572        if (previous_callout != NULL)        if (previous_callout != NULL)
2573          {          {
2574          complete_callout(previous_callout, ptr, cd);          if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2575              complete_callout(previous_callout, ptr, cd);
2576          previous_callout = NULL;          previous_callout = NULL;
2577          }          }
2578        if ((options & PCRE_AUTO_CALLOUT) != 0)        if ((options & PCRE_AUTO_CALLOUT) != 0)
# Line 1672  for (;; ptr++) Line 2593  for (;; ptr++)
2593    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2594         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
2595      {      {
2596      complete_callout(previous_callout, ptr, cd);      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2597          complete_callout(previous_callout, ptr, cd);
2598      previous_callout = NULL;      previous_callout = NULL;
2599      }      }
2600    
# Line 1683  for (;; ptr++) Line 2605  for (;; ptr++)
2605      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2606      if (c == '#')      if (c == '#')
2607        {        {
2608        /* The space before the ; is to avoid a warning on a silly compiler        while (*(++ptr) != 0)
2609        on the Macintosh. */          {
2610        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2611        if (c != 0) continue;   /* Else fall through to handle end of string */          }
2612          if (*ptr != 0) continue;
2613    
2614          /* Else fall through to handle end of string */
2615          c = 0;
2616        }        }
2617      }      }
2618    
# Line 1700  for (;; ptr++) Line 2626  for (;; ptr++)
2626    
2627    switch(c)    switch(c)
2628      {      {
2629      /* The branch terminates at end of string, |, or ). */      /* ===================================================================*/
2630        case 0:                        /* The branch terminates at string end */
2631      case 0:      case '|':                      /* or | or ) */
     case '|':  
2632      case ')':      case ')':
2633      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2634      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2635      *codeptr = code;      *codeptr = code;
2636      *ptrptr = ptr;      *ptrptr = ptr;
2637        if (lengthptr != NULL)
2638          {
2639          if (OFLOW_MAX - *lengthptr < code - last_code)
2640            {
2641            *errorcodeptr = ERR20;
2642            goto FAILED;
2643            }
2644          *lengthptr += code - last_code;   /* To include callout length */
2645          DPRINTF((">> end branch\n"));
2646          }
2647      return TRUE;      return TRUE;
2648    
2649    
2650        /* ===================================================================*/
2651      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2652      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2653    
# Line 1736  for (;; ptr++) Line 2673  for (;; ptr++)
2673      zerofirstbyte = firstbyte;      zerofirstbyte = firstbyte;
2674      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
2675      previous = code;      previous = code;
2676      *code++ = OP_ANY;      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2677      break;      break;
2678    
2679    
2680        /* ===================================================================*/
2681      /* Character classes. If the included characters are all < 256, we build a      /* Character classes. If the included characters are all < 256, we build a
2682      32-byte bitmap of the permitted characters, except in the special case      32-byte bitmap of the permitted characters, except in the special case
2683      where there is only one such character. For negated classes, we build the      where there is only one such character. For negated classes, we build the
# Line 1749  for (;; ptr++) Line 2688  for (;; ptr++)
2688      opcode is compiled. It may optionally have a bit map for characters < 256,      opcode is compiled. It may optionally have a bit map for characters < 256,
2689      but those above are explicitly listed afterwards. A flag byte tells      but those above are explicitly listed afterwards. A flag byte tells
2690      whether the bitmap is present, and whether this is a negated class or not.      whether the bitmap is present, and whether this is a negated class or not.
2691      */  
2692        In JavaScript compatibility mode, an isolated ']' causes an error. In
2693        default (Perl) mode, it is treated as a data character. */
2694    
2695        case ']':
2696        if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2697          {
2698          *errorcodeptr = ERR64;
2699          goto FAILED;
2700          }
2701        goto NORMAL_CHAR;
2702    
2703      case '[':      case '[':
2704      previous = code;      previous = code;
# Line 1758  for (;; ptr++) Line 2707  for (;; ptr++)
2707      they are encountered at the top level, so we'll do that too. */      they are encountered at the top level, so we'll do that too. */
2708    
2709      if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&      if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2710          check_posix_syntax(ptr, &tempptr, cd))          check_posix_syntax(ptr, &tempptr))
2711        {        {
2712        *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;        *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2713        goto FAILED;        goto FAILED;
2714        }        }
2715    
2716      /* If the first character is '^', set the negation flag and skip it. */      /* If the first character is '^', set the negation flag and skip it. Also,
2717        if the first few characters (either before or after ^) are \Q\E or \E we
2718        skip them too. This makes for compatibility with Perl. */
2719    
2720      if ((c = *(++ptr)) == '^')      negate_class = FALSE;
2721        for (;;)
2722        {        {
       negate_class = TRUE;  
2723        c = *(++ptr);        c = *(++ptr);
2724          if (c == '\\')
2725            {
2726            if (ptr[1] == 'E') ptr++;
2727              else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2728                else break;
2729            }
2730          else if (!negate_class && c == '^')
2731            negate_class = TRUE;
2732          else break;
2733        }        }
2734      else  
2735        /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2736        an initial ']' is taken as a data character -- the code below handles
2737        that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2738        [^] must match any character, so generate OP_ALLANY. */
2739    
2740        if (c ==']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2741        {        {
2742        negate_class = FALSE;        *code++ = negate_class? OP_ALLANY : OP_FAIL;
2743          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2744          zerofirstbyte = firstbyte;
2745          break;
2746        }        }
2747    
2748        /* If a class contains a negative special such as \S, we need to flip the
2749        negation flag at the end, so that support for characters > 255 works
2750        correctly (they are all included in the class). */
2751    
2752        should_flip_negation = FALSE;
2753    
2754      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
2755      of just a single character (as long as it's < 256). For higher valued UTF-8      of just a single character (as long as it's < 256). However, for higher
2756      characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
2757    
2758      class_charcount = 0;      class_charcount = 0;
2759      class_lastchar = -1;      class_lastchar = -1;
2760    
2761        /* Initialize the 32-char bit map to all zeros. We build the map in a
2762        temporary bit of memory, in case the class contains only 1 character (less
2763        than 256), because in that case the compiled code doesn't use the bit map.
2764        */
2765    
2766        memset(classbits, 0, 32 * sizeof(uschar));
2767    
2768  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2769      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2770      class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2771        class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
2772  #endif  #endif
2773    
     /* Initialize the 32-char bit map to all zeros. We have to build the  
     map in a temporary bit of store, in case the class contains only 1  
     character (< 256), because in that case the compiled code doesn't use the  
     bit map. */  
   
     memset(classbits, 0, 32 * sizeof(uschar));  
   
2774      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
2775      means that an initial ] is taken as a data character. The first pass      means that an initial ] is taken as a data character. At the start of the
2776      through the regex checked the overall syntax, so we don't need to be very      loop, c contains the first byte of the character. */
     strict here. At the start of the loop, c contains the first byte of the  
     character. */  
2777    
2778      do      if (c != 0) do
2779        {        {
2780          const uschar *oldptr;
2781    
2782  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2783        if (utf8 && c > 127)        if (utf8 && c > 127)
2784          {                           /* Braces are required because the */          {                           /* Braces are required because the */
2785          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
2786          }          }
2787    
2788          /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2789          data and reset the pointer. This is so that very large classes that
2790          contain a zillion UTF-8 characters no longer overwrite the work space
2791          (which is on the stack). */
2792    
2793          if (lengthptr != NULL)
2794            {
2795            *lengthptr += class_utf8data - class_utf8data_base;
2796            class_utf8data = class_utf8data_base;
2797            }
2798    
2799  #endif  #endif
2800    
2801        /* Inside \Q...\E everything is literal except \E */        /* Inside \Q...\E everything is literal except \E */
2802    
2803        if (inescq)        if (inescq)
2804          {          {
2805          if (c == '\\' && ptr[1] == 'E')          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2806            {            {
2807            inescq = FALSE;            inescq = FALSE;                   /* Reset literal state */
2808            ptr++;            ptr++;                            /* Skip the 'E' */
2809            continue;            continue;                         /* Carry on with next */
2810            }            }
2811          else goto LONE_SINGLE_CHARACTER;          goto CHECK_RANGE;                   /* Could be range if \E follows */
2812          }          }
2813    
2814        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
# Line 1831  for (;; ptr++) Line 2819  for (;; ptr++)
2819    
2820        if (c == '[' &&        if (c == '[' &&
2821            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2822            check_posix_syntax(ptr, &tempptr, cd))            check_posix_syntax(ptr, &tempptr))
2823          {          {
2824          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
2825          int posix_class, taboffset, tabopt;          int posix_class, taboffset, tabopt;
# Line 1848  for (;; ptr++) Line 2836  for (;; ptr++)
2836          if (*ptr == '^')          if (*ptr == '^')
2837            {            {
2838            local_negate = TRUE;            local_negate = TRUE;
2839              should_flip_negation = TRUE;  /* Note negative special */
2840            ptr++;            ptr++;
2841            }            }
2842    
# Line 1911  for (;; ptr++) Line 2900  for (;; ptr++)
2900          }          }
2901    
2902        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
2903        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. The sequence \b is a special
2904        validity in the pre-compiling pass. The sequence \b is a special case.        case. Inside a class (and only there) it is treated as backspace.
2905        Inside a class (and only there) it is treated as backspace. Elsewhere        Elsewhere it marks a word boundary. Other escapes have preset maps ready
2906        it marks a word boundary. Other escapes have preset maps ready to        to 'or' into the one we are building. We assume they have more than one
       or into the one we are building. We assume they have more than one  
2907        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
2908    
2909        if (c == '\\')        if (c == '\\')
2910          {          {
2911          c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2912            if (*errorcodeptr != 0) goto FAILED;
2913    
2914          if (-c == ESC_b) c = '\b';       /* \b is backslash in a class */          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2915          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2916            else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2917          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
2918            {            {
2919            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == '\\' && ptr[2] == 'E')
# Line 1933  for (;; ptr++) Line 2923  for (;; ptr++)
2923            else inescq = TRUE;            else inescq = TRUE;
2924            continue;            continue;
2925            }            }
2926            else if (-c == ESC_E) continue;  /* Ignore orphan \E */
2927    
2928          if (c < 0)          if (c < 0)
2929            {            {
2930            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
2931            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
2932            switch (-c)  
2933              /* Save time by not doing this in the pre-compile phase. */
2934    
2935              if (lengthptr == NULL) switch (-c)
2936              {              {
2937              case ESC_d:              case ESC_d:
2938              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2939              continue;              continue;
2940    
2941              case ESC_D:              case ESC_D:
2942                should_flip_negation = TRUE;
2943              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2944              continue;              continue;
2945    
# Line 1953  for (;; ptr++) Line 2948  for (;; ptr++)
2948              continue;              continue;
2949    
2950              case ESC_W:              case ESC_W:
2951                should_flip_negation = TRUE;
2952              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2953              continue;              continue;
2954    
# Line 1962  for (;; ptr++) Line 2958  for (;; ptr++)
2958              continue;              continue;
2959    
2960              case ESC_S:              case ESC_S:
2961                should_flip_negation = TRUE;
2962              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2963              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2964              continue;              continue;
2965    
2966  #ifdef SUPPORT_UCP              default:    /* Not recognized; fall through */
2967              case ESC_p:              break;      /* Need "default" setting to stop compiler warning. */
2968              case ESC_P:              }
2969    
2970              /* In the pre-compile phase, just do the recognition. */
2971    
2972              else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2973                       c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2974    
2975              /* We need to deal with \H, \h, \V, and \v in both phases because
2976              they use extra memory. */
2977    
2978              if (-c == ESC_h)
2979                {
2980                SETBIT(classbits, 0x09); /* HT */
2981                SETBIT(classbits, 0x20); /* SPACE */
2982                SETBIT(classbits, 0xa0); /* NBSP */
2983    #ifdef SUPPORT_UTF8
2984                if (utf8)
2985                {                {
               BOOL negated;  
               int pdata;  
               int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);  
               if (ptype < 0) goto FAILED;  
2986                class_utf8 = TRUE;                class_utf8 = TRUE;
2987                *class_utf8data++ = ((-c == ESC_p) != negated)?                *class_utf8data++ = XCL_SINGLE;
2988                  XCL_PROP : XCL_NOTPROP;                class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2989                *class_utf8data++ = ptype;                *class_utf8data++ = XCL_SINGLE;
2990                *class_utf8data++ = pdata;                class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2991                class_charcount -= 2;   /* Not a < 256 character */                *class_utf8data++ = XCL_RANGE;
2992                  class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2993                  class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2994                  *class_utf8data++ = XCL_SINGLE;
2995                  class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2996                  *class_utf8data++ = XCL_SINGLE;
2997                  class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2998                  *class_utf8data++ = XCL_SINGLE;
2999                  class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3000                }                }
             continue;  
3001  #endif  #endif
3002                continue;
3003                }
3004    
3005              /* Unrecognized escapes are faulted if PCRE is running in its            if (-c == ESC_H)
3006              strict mode. By default, for compatibility with Perl, they are              {
3007              treated as literals. */              for (c = 0; c < 32; c++)
3008                  {
3009                  int x = 0xff;
3010                  switch (c)
3011                    {
3012                    case 0x09/8: x ^= 1 << (0x09%8); break;
3013                    case 0x20/8: x ^= 1 << (0x20%8); break;
3014                    case 0xa0/8: x ^= 1 << (0xa0%8); break;
3015                    default: break;
3016                    }
3017                  classbits[c] |= x;
3018                  }
3019    
3020              default:  #ifdef SUPPORT_UTF8
3021              if ((options & PCRE_EXTRA) != 0)              if (utf8)
3022                {                {
3023                *errorcodeptr = ERR7;                class_utf8 = TRUE;
3024                goto FAILED;                *class_utf8data++ = XCL_RANGE;
3025                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3026                  class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3027                  *class_utf8data++ = XCL_RANGE;
3028                  class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3029                  class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3030                  *class_utf8data++ = XCL_RANGE;
3031                  class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3032                  class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3033                  *class_utf8data++ = XCL_RANGE;
3034                  class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3035                  class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3036                  *class_utf8data++ = XCL_RANGE;
3037                  class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3038                  class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3039                  *class_utf8data++ = XCL_RANGE;
3040                  class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3041                  class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3042                  *class_utf8data++ = XCL_RANGE;
3043                  class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3044                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3045                }                }
3046              c = *ptr;              /* The final character */  #endif
3047              class_charcount -= 2;  /* Undo the default count from above */              continue;
3048              }              }
           }  
   
         /* Fall through if we have a single character (c >= 0). This may be  
         > 256 in UTF-8 mode. */  
3049    
3050          }   /* End of backslash handling */            if (-c == ESC_v)
3051                {
3052        /* A single character may be followed by '-' to form a range. However,              SETBIT(classbits, 0x0a); /* LF */
3053        Perl does not permit ']' to be the end of the range. A '-' character              SETBIT(classbits, 0x0b); /* VT */
3054        here is treated as a literal. */              SETBIT(classbits, 0x0c); /* FF */
3055                SETBIT(classbits, 0x0d); /* CR */
3056                SETBIT(classbits, 0x85); /* NEL */
3057    #ifdef SUPPORT_UTF8
3058                if (utf8)
3059                  {
3060                  class_utf8 = TRUE;
3061                  *class_utf8data++ = XCL_RANGE;
3062                  class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3063                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3064                  }
3065    #endif
3066                continue;
3067                }
3068    
3069              if (-c == ESC_V)
3070                {
3071                for (c = 0; c < 32; c++)
3072                  {
3073                  int x = 0xff;
3074                  switch (c)
3075                    {
3076                    case 0x0a/8: x ^= 1 << (0x0a%8);
3077                                 x ^= 1 << (0x0b%8);
3078                                 x ^= 1 << (0x0c%8);
3079                                 x ^= 1 << (0x0d%8);
3080                                 break;
3081                    case 0x85/8: x ^= 1 << (0x85%8); break;
3082                    default: break;
3083                    }
3084                  classbits[c] |= x;
3085                  }
3086    
3087    #ifdef SUPPORT_UTF8
3088                if (utf8)
3089                  {
3090                  class_utf8 = TRUE;
3091                  *class_utf8data++ = XCL_RANGE;
3092                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3093                  class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3094                  *class_utf8data++ = XCL_RANGE;
3095                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3096                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3097                  }
3098    #endif
3099                continue;
3100                }
3101    
3102              /* We need to deal with \P and \p in both phases. */
3103    
3104    #ifdef SUPPORT_UCP
3105              if (-c == ESC_p || -c == ESC_P)
3106                {
3107                BOOL negated;
3108                int pdata;
3109                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3110                if (ptype < 0) goto FAILED;
3111                class_utf8 = TRUE;
3112                *class_utf8data++ = ((-c == ESC_p) != negated)?
3113                  XCL_PROP : XCL_NOTPROP;
3114                *class_utf8data++ = ptype;
3115                *class_utf8data++ = pdata;
3116                class_charcount -= 2;   /* Not a < 256 character */
3117                continue;
3118                }
3119    #endif
3120              /* Unrecognized escapes are faulted if PCRE is running in its
3121              strict mode. By default, for compatibility with Perl, they are
3122              treated as literals. */
3123    
3124        if (ptr[1] == '-' && ptr[2] != ']')            if ((options & PCRE_EXTRA) != 0)
3125                {
3126                *errorcodeptr = ERR7;
3127                goto FAILED;
3128                }
3129    
3130              class_charcount -= 2;  /* Undo the default count from above */
3131              c = *ptr;              /* Get the final character and fall through */
3132              }
3133    
3134            /* Fall through if we have a single character (c >= 0). This may be
3135            greater than 256 in UTF-8 mode. */
3136    
3137            }   /* End of backslash handling */
3138    
3139          /* A single character may be followed by '-' to form a range. However,
3140          Perl does not permit ']' to be the end of the range. A '-' character
3141          at the end is treated as a literal. Perl ignores orphaned \E sequences
3142          entirely. The code for handling \Q and \E is messy. */
3143    
3144          CHECK_RANGE:
3145          while (ptr[1] == '\\' && ptr[2] == 'E')
3146            {
3147            inescq = FALSE;
3148            ptr += 2;
3149            }
3150    
3151          oldptr = ptr;
3152    
3153          /* Remember \r or \n */
3154    
3155          if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3156    
3157          /* Check for range */
3158    
3159          if (!inescq && ptr[1] == '-')
3160          {          {
3161          int d;          int d;
3162          ptr += 2;          ptr += 2;
3163            while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3164    
3165            /* If we hit \Q (not followed by \E) at this point, go into escaped
3166            mode. */
3167    
3168            while (*ptr == '\\' && ptr[1] == 'Q')
3169              {
3170              ptr += 2;
3171              if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3172              inescq = TRUE;
3173              break;
3174              }
3175    
3176            if (*ptr == 0 || (!inescq && *ptr == ']'))
3177              {
3178              ptr = oldptr;
3179              goto LONE_SINGLE_CHARACTER;
3180              }
3181    
3182  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3183          if (utf8)          if (utf8)
# Line 2026  for (;; ptr++) Line 3192  for (;; ptr++)
3192          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3193          in such circumstances. */          in such circumstances. */
3194    
3195          if (d == '\\')          if (!inescq && d == '\\')
3196            {            {
3197            const uschar *oldptr = ptr;            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3198            d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);            if (*errorcodeptr != 0) goto FAILED;
3199    
3200            /* \b is backslash; \X is literal X; any other special means the '-'            /* \b is backspace; \X is literal X; \R is literal R; any other
3201            was literal */            special means the '-' was literal */
3202    
3203            if (d < 0)            if (d < 0)
3204              {              {
3205              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = '\b';
3206              else if (d == -ESC_X) d = 'X'; else              else if (d == -ESC_X) d = 'X';
3207                else if (d == -ESC_R) d = 'R'; else
3208                {                {
3209                ptr = oldptr - 2;                ptr = oldptr;
3210                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3211                }                }
3212              }              }
3213            }            }
3214    
3215          /* The check that the two values are in the correct order happens in          /* Check that the two values are in the correct order. Optimize
3216          the pre-pass. Optimize one-character ranges */          one-character ranges */
3217    
3218            if (d < c)
3219              {
3220              *errorcodeptr = ERR8;
3221              goto FAILED;
3222              }
3223    
3224          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3225    
3226            /* Remember \r or \n */
3227    
3228            if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3229    
3230          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3231          matching, we have to use an XCLASS with extra data items. Caseless          matching, we have to use an XCLASS with extra data items. Caseless
3232          matching for characters > 127 is available only if UCP support is          matching for characters > 127 is available only if UCP support is
# Line 2067  for (;; ptr++) Line 3244  for (;; ptr++)
3244  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3245            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
3246              {              {
3247              int occ, ocd;              unsigned int occ, ocd;
3248              int cc = c;              unsigned int cc = c;
3249              int origd = d;              unsigned int origd = d;
3250              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
3251                {                {
3252                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= (unsigned int)c &&
3253                      ocd <= (unsigned int)d)
3254                    continue;                          /* Skip embedded ranges */
3255    
3256                if (occ < c  && ocd >= c - 1)        /* Extend the basic range */                if (occ < (unsigned int)c  &&
3257                      ocd >= (unsigned int)c - 1)      /* Extend the basic range */
3258                  {                                  /* if there is overlap,   */                  {                                  /* if there is overlap,   */
3259                  c = occ;                           /* noting that if occ < c */                  c = occ;                           /* noting that if occ < c */
3260                  continue;                          /* we can't have ocd > d  */                  continue;                          /* we can't have ocd > d  */
3261                  }                                  /* because a subrange is  */                  }                                  /* because a subrange is  */
3262                if (ocd > d && occ <= d + 1)         /* always shorter than    */                if (ocd > (unsigned int)d &&
3263                      occ <= (unsigned int)d + 1)      /* always shorter than    */
3264                  {                                  /* the basic range.       */                  {                                  /* the basic range.       */
3265                  d = ocd;                  d = ocd;
3266                  continue;                  continue;
# Line 2127  for (;; ptr++) Line 3308  for (;; ptr++)
3308          ranges that lie entirely within 0-127 when there is UCP support; else          ranges that lie entirely within 0-127 when there is UCP support; else
3309          for partial ranges without UCP support. */          for partial ranges without UCP support. */
3310    
3311          for (; c <= d; c++)          class_charcount += d - c + 1;
3312            class_lastchar = d;
3313    
3314            /* We can save a bit of time by skipping this in the pre-compile. */
3315    
3316            if (lengthptr == NULL) for (; c <= d; c++)
3317            {            {
3318            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
3319            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
# Line 2135  for (;; ptr++) Line 3321  for (;; ptr++)
3321              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
3322              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
3323              }              }
           class_charcount++;                /* in case a one-char range */  
           class_lastchar = c;  
3324            }            }
3325    
3326          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
# Line 2160  for (;; ptr++) Line 3344  for (;; ptr++)
3344  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3345          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
3346            {            {
3347            int othercase;            unsigned int othercase;
3348            if ((othercase = _pcre_ucp_othercase(c)) >= 0)            if ((othercase = UCD_OTHERCASE(c)) != c)
3349              {              {
3350              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
3351              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 2186  for (;; ptr++) Line 3370  for (;; ptr++)
3370          }          }
3371        }        }
3372    
3373      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached. This "while" is the end of the "do" above. */
3374      loop. This "while" is the end of the "do" above. */  
3375        while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3376    
3377        if (c == 0)                          /* Missing terminating ']' */
3378          {
3379          *errorcodeptr = ERR6;
3380          goto FAILED;
3381          }
3382    
3383    
3384    /* This code has been disabled because it would mean that \s counts as
3385    an explicit \r or \n reference, and that's not really what is wanted. Now
3386    we set the flag only if there is a literal "\r" or "\n" in the class. */
3387    
3388    #if 0
3389        /* Remember whether \r or \n are in this class */
3390    
3391        if (negate_class)
3392          {
3393          if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3394          }
3395        else
3396          {
3397          if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3398          }
3399    #endif
3400    
     while ((c = *(++ptr)) != ']' || inescq);  
3401    
3402      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
3403      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. As long as there were no characters >= 128 and there was no
3404      can optimize the negative case only if there were no characters >= 128      use of \p or \P, in other words, no use of any XCLASS features, we can
3405      because OP_NOT and the related opcodes like OP_NOTSTAR operate on      optimize.
3406      single-bytes only. This is an historical hangover. Maybe one day we can  
3407      tidy these opcodes to handle multi-byte characters.      In UTF-8 mode, we can optimize the negative case only if there were no
3408        characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3409        operate on single-bytes only. This is an historical hangover. Maybe one day
3410        we can tidy these opcodes to handle multi-byte characters.
3411    
3412      The optimization throws away the bit map. We turn the item into a      The optimization throws away the bit map. We turn the item into a
3413      1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note      1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
# Line 2206  for (;; ptr++) Line 3417  for (;; ptr++)
3417      reqbyte, save the previous value for reinstating. */      reqbyte, save the previous value for reinstating. */
3418    
3419  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3420      if (class_charcount == 1 &&      if (class_charcount == 1 && !class_utf8 &&
3421            (!utf8 ||        (!utf8 || !negate_class || class_lastchar < 128))
           (!class_utf8 && (!negate_class || class_lastchar < 128))))  
   
3422  #else  #else
3423      if (class_charcount == 1)      if (class_charcount == 1)
3424  #endif  #endif
# Line 2252  for (;; ptr++) Line 3461  for (;; ptr++)
3461      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
3462    
3463      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
3464      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode, unless there was a negated special
3465      we can omit the bitmap. */      such as \S in the class, because in that case all characters > 255 are in
3466        the class, so any that were explicitly given as well can be ignored. If
3467        (when there are explicit characters > 255 that must be listed) there are no
3468        characters < 256, we can omit the bitmap in the actual compiled code. */
3469    
3470  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3471      if (class_utf8)      if (class_utf8 && !should_flip_negation)
3472        {        {
3473        *class_utf8data++ = XCL_END;    /* Marks the end of extra data */        *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
3474        *code++ = OP_XCLASS;        *code++ = OP_XCLASS;
3475        code += LINK_SIZE;        code += LINK_SIZE;
3476        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT : 0;
3477    
3478        /* If the map is required, install it, and move on to the end of        /* If the map is required, move up the extra data to make room for it;
3479        the extra data */        otherwise just move the code pointer to the end of the extra data. */
3480    
3481        if (class_charcount > 0)        if (class_charcount > 0)
3482          {          {
3483          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
3484            memmove(code + 32, code, class_utf8data - code);
3485          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
3486          code = class_utf8data;          code = class_utf8data + 32;
         }  
   
       /* If the map is not required, slide down the extra data. */  
   
       else  
         {  
         int len = class_utf8data - (code + 33);  
         memmove(code + 1, code + 33, len);  
         code += len + 1;  
3487          }          }
3488          else code = class_utf8data;
3489    
3490        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
3491    
# Line 2289  for (;; ptr++) Line 3494  for (;; ptr++)
3494        }        }
3495  #endif  #endif
3496    
3497      /* If there are no characters > 255, negate the 32-byte map if necessary,      /* If there are no characters > 255, set the opcode to OP_CLASS or
3498      and copy it into the code vector. If this is the first thing in the branch,      OP_NCLASS, depending on whether the whole class was negated and whether
3499      there can be no first char setting, whatever the repeat count. Any reqbyte      there were negative specials such as \S in the class. Then copy the 32-byte
3500      setting must remain unchanged after any kind of repeat. */      map into the code vector, negating it if necessary. */
3501    
3502        *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3503      if (negate_class)      if (negate_class)
3504        {        {
3505        *code++ = OP_NCLASS;        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3506        for (c = 0; c < 32; c++) code[c] = ~classbits[c];          for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3507        }        }
3508      else      else
3509        {        {
       *code++ = OP_CLASS;  
3510        memcpy(code, classbits, 32);        memcpy(code, classbits, 32);
3511        }        }
3512      code += 32;      code += 32;
3513      break;      break;
3514    
3515    
3516        /* ===================================================================*/
3517      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3518      has been tested above. */      has been tested above. */
3519    
# Line 2374  for (;; ptr++) Line 3581  for (;; ptr++)
3581        }        }
3582      else repeat_type = greedy_default;      else repeat_type = greedy_default;
3583    
     /* If previous was a recursion, we need to wrap it inside brackets so that  
     it can be replicated if necessary. */  
   
     if (*previous == OP_RECURSE)  
       {  
       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);  
       code += 1 + LINK_SIZE;  
       *previous = OP_BRA;  
       PUT(previous, 1, code - previous);  
       *code = OP_KET;  
       PUT(code, 1, code - previous);  
       code += 1 + LINK_SIZE;  
       }  
   
3584      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
3585      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
3586      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
# Line 2421  for (;; ptr++) Line 3614  for (;; ptr++)
3614          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3615          }          }
3616    
3617          /* If the repetition is unlimited, it pays to see if the next thing on
3618          the line is something that cannot possibly match this character. If so,
3619          automatically possessifying this item gains some performance in the case
3620          where the match fails. */
3621    
3622          if (!possessive_quantifier &&
3623              repeat_max < 0 &&
3624              check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3625                options, cd))
3626            {
3627            repeat_type = 0;    /* Force greedy */
3628            possessive_quantifier = TRUE;
3629            }
3630    
3631        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3632        }        }
3633    
3634      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
3635      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
3636      character repeats by setting opt_type to add a suitable offset into      character repeats by setting opt_type to add a suitable offset into
3637      repeat_type. OP_NOT is currently used only for single-byte chars. */      repeat_type. We can also test for auto-possessification. OP_NOT is
3638        currently used only for single-byte chars. */
3639    
3640      else if (*previous == OP_NOT)      else if (*previous == OP_NOT)
3641        {        {
3642        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3643        c = previous[1];        c = previous[1];
3644          if (!possessive_quantifier &&
3645              repeat_max < 0 &&
3646              check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3647            {
3648            repeat_type = 0;    /* Force greedy */
3649            possessive_quantifier = TRUE;
3650            }
3651        goto OUTPUT_SINGLE_REPEAT;        goto OUTPUT_SINGLE_REPEAT;
3652        }        }
3653    
# Line 2450  for (;; ptr++) Line 3665  for (;; ptr++)
3665        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3666        c = *previous;        c = *previous;
3667    
3668          if (!possessive_quantifier &&
3669              repeat_max < 0 &&
3670              check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3671            {
3672            repeat_type = 0;    /* Force greedy */
3673            possessive_quantifier = TRUE;
3674            }
3675    
3676        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
3677        if (*previous == OP_PROP || *previous == OP_NOTPROP)        if (*previous == OP_PROP || *previous == OP_NOTPROP)
3678          {          {
# Line 2469  for (;; ptr++) Line 3692  for (;; ptr++)
3692        /* All real repeats make it impossible to handle partial matching (maybe        /* All real repeats make it impossible to handle partial matching (maybe
3693        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
3694    
3695        if (repeat_max != 1) cd->nopartial = TRUE;        if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3696    
3697        /* Combine the op_type with the repeat_type */        /* Combine the op_type with the repeat_type */
3698    
# Line 2490  for (;; ptr++) Line 3713  for (;; ptr++)
3713          }          }
3714    
3715        /* A repeat minimum of 1 is optimized into some special cases. If the        /* A repeat minimum of 1 is optimized into some special cases. If the
3716        maximum is unlimited, we use OP_PLUS. Otherwise, the original item it        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3717        left in place and, if the maximum is greater than 1, we use OP_UPTO with        left in place and, if the maximum is greater than 1, we use OP_UPTO with
3718        one less than the maximum. */        one less than the maximum. */
3719    
# Line 2543  for (;; ptr++) Line 3766  for (;; ptr++)
3766            }            }
3767    
3768          /* Else insert an UPTO if the max is greater than the min, again          /* Else insert an UPTO if the max is greater than the min, again
3769          preceded by the character, for the previously inserted code. */          preceded by the character, for the previously inserted code. If the
3770            UPTO is just for 1 instance, we can use QUERY instead. */
3771    
3772          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
3773            {            {
# Line 2562  for (;; ptr++) Line 3786  for (;; ptr++)
3786              *code++ = prop_value;              *code++ = prop_value;
3787              }              }
3788            repeat_max -= repeat_min;            repeat_max -= repeat_min;
3789            *code++ = OP_UPTO + repeat_type;  
3790            PUT2INC(code, 0, repeat_max);            if (repeat_max == 1)
3791                {
3792                *code++ = OP_QUERY + repeat_type;
3793                }
3794              else
3795                {
3796                *code++ = OP_UPTO + repeat_type;
3797                PUT2INC(code, 0, repeat_max);
3798                }
3799            }            }
3800          }          }
3801    
# Line 2610  for (;; ptr++) Line 3842  for (;; ptr++)
3842        /* All real repeats make it impossible to handle partial matching (maybe        /* All real repeats make it impossible to handle partial matching (maybe
3843        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
3844    
3845        if (repeat_max != 1) cd->nopartial = TRUE;        if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3846    
3847        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
3848          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
# Line 2630  for (;; ptr++) Line 3862  for (;; ptr++)
3862      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
3863      cases. */      cases. */
3864    
3865      else if (*previous >= OP_BRA || *previous == OP_ONCE ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3866               *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
3867        {        {
3868        register int i;        register int i;
3869        int ketoffset = 0;        int ketoffset = 0;
3870        int len = code - previous;        int len = code - previous;
3871        uschar *bralink = NULL;        uschar *bralink = NULL;
3872    
3873          /* Repeating a DEFINE group is pointless */
3874    
3875          if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3876            {
3877            *errorcodeptr = ERR55;
3878            goto FAILED;
3879            }
3880    
3881        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
3882        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
3883        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 2660  for (;; ptr++) Line 3900  for (;; ptr++)
3900    
3901        if (repeat_min == 0)        if (repeat_min == 0)
3902          {          {
3903          /* If the maximum is also zero, we just omit the group from the output          /* If the maximum is also zero, we used to just omit the group from the
3904          altogether. */          output altogether, like this:
   
         if (repeat_max == 0)  
           {  
           code = previous;  
           goto END_REPEAT;  
           }  
3905    
3906          /* If the maximum is 1 or unlimited, we just have to stick in the          ** if (repeat_max == 0)
3907          BRAZERO and do no more at this point. However, we do need to adjust          **   {
3908          any OP_RECURSE calls inside the group that refer to the group itself or          **   code = previous;
3909          any internal group, because the offset is from the start of the whole          **   goto END_REPEAT;
3910          regex. Temporarily terminate the pattern while doing this. */          **   }
3911    
3912            However, that fails when a group is referenced as a subroutine from
3913            elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
3914            so that it is skipped on execution. As we don't have a list of which
3915            groups are referenced, we cannot do this selectively.
3916    
3917            If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
3918            and do no more at this point. However, we do need to adjust any
3919            OP_RECURSE calls inside the group that refer to the group itself or any
3920            internal or forward referenced group, because the offset is from the
3921            start of the whole regex. Temporarily terminate the pattern while doing
3922            this. */
3923    
3924          if (repeat_max <= 1)          if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
3925            {            {
3926            *code = OP_END;            *code = OP_END;
3927            adjust_recurse(previous, 1, utf8, cd);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3928            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3929            code++;            code++;
3930              if (repeat_max == 0)
3931                {
3932                *previous++ = OP_SKIPZERO;
3933                goto END_REPEAT;
3934                }
3935            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
3936            }            }
3937    
# Line 2696  for (;; ptr++) Line 3947  for (;; ptr++)
3947            {            {
3948            int offset;            int offset;
3949            *code = OP_END;            *code = OP_END;
3950            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3951            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
3952            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
3953            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2716  for (;; ptr++) Line 3967  for (;; ptr++)
3967        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
3968        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
3969        copies that we need. If we set a first char from the group, and didn't        copies that we need. If we set a first char from the group, and didn't
3970        set a required char, copy the latter from the former. */        set a required char, copy the latter from the former. If there are any
3971          forward reference subroutine calls in the group, there will be entries on
3972          the workspace list; replicate these with an appropriate increment. */
3973    
3974        else        else
3975          {          {
3976          if (repeat_min > 1)          if (repeat_min > 1)
3977            {            {
3978            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;            /* In the pre-compile phase, we don't actually do the replication. We
3979            for (i = 1; i < repeat_min; i++)            just adjust the length as if we had. Do some paranoid checks for
3980              potential integer overflow. */
3981    
3982              if (lengthptr != NULL)
3983                {
3984                int delta = (repeat_min - 1)*length_prevgroup;
3985                if ((double)(repeat_min - 1)*(double)length_prevgroup >
3986                                                                (double)INT_MAX ||
3987                    OFLOW_MAX - *lengthptr < delta)
3988                  {
3989                  *errorcodeptr = ERR20;
3990                  goto FAILED;
3991                  }
3992                *lengthptr += delta;
3993                }
3994    
3995              /* This is compiling for real */
3996    
3997              else
3998              {              {
3999              memcpy(code, previous, len);              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
4000              code += len;              for (i = 1; i < repeat_min; i++)
4001                  {
4002                  uschar *hc;
4003                  uschar *this_hwm = cd->hwm;
4004                  memcpy(code, previous, len);
4005                  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4006                    {
4007                    PUT(cd->hwm, 0, GET(hc, 0) + len);
4008                    cd->hwm += LINK_SIZE;
4009                    }
4010                  save_hwm = this_hwm;
4011                  code += len;
4012                  }
4013              }              }
4014            }            }
4015    
4016          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
4017          }          }
4018    
# Line 2736  for (;; ptr++) Line 4020  for (;; ptr++)
4020        the maximum is limited, it replicates the group in a nested fashion,        the maximum is limited, it replicates the group in a nested fashion,
4021        remembering the bracket starts on a stack. In the case of a zero minimum,        remembering the bracket starts on a stack. In the case of a zero minimum,
4022        the first one was set up above. In all cases the repeat_max now specifies        the first one was set up above. In all cases the repeat_max now specifies
4023        the number of additional copies needed. */        the number of additional copies needed. Again, we must remember to
4024          replicate entries on the forward reference list. */
4025    
4026        if (repeat_max >= 0)        if (repeat_max >= 0)
4027          {          {
4028          for (i = repeat_max - 1; i >= 0; i--)          /* In the pre-compile phase, we don't actually do the replication. We
4029            just adjust the length as if we had. For each repetition we must add 1
4030            to the length for BRAZERO and for all but the last repetition we must
4031            add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
4032            paranoid checks to avoid integer overflow. */
4033    
4034            if (lengthptr != NULL && repeat_max > 0)
4035              {
4036              int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4037                          2 - 2*LINK_SIZE;   /* Last one doesn't nest */
4038              if ((double)repeat_max *
4039                    (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4040                      > (double)INT_MAX ||
4041                  OFLOW_MAX - *lengthptr < delta)
4042                {
4043                *errorcodeptr = ERR20;
4044                goto FAILED;
4045                }
4046              *lengthptr += delta;
4047              }
4048    
4049            /* This is compiling for real */
4050    
4051            else for (i = repeat_max - 1; i >= 0; i--)
4052            {            {
4053              uschar *hc;
4054              uschar *this_hwm = cd->hwm;
4055    
4056            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
4057    
4058            /* All but the final copy start a new nesting, maintaining the            /* All but the final copy start a new nesting, maintaining the
# Line 2757  for (;; ptr++) Line 4068  for (;; ptr++)
4068              }              }
4069    
4070            memcpy(code, previous, len);            memcpy(code, previous, len);
4071              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4072                {
4073                PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4074                cd->hwm += LINK_SIZE;
4075                }
4076              save_hwm = this_hwm;
4077            code += len;            code += len;
4078            }            }
4079    
# Line 2779  for (;; ptr++) Line 4096  for (;; ptr++)
4096        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
4097        can't just offset backwards from the current code point, because we        can't just offset backwards from the current code point, because we
4098        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
4099        correct offset was computed above. */        correct offset was computed above.
4100    
4101          Then, when we are doing the actual compile phase, check to see whether
4102          this group is a non-atomic one that could match an empty string. If so,
4103          convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4104          that runtime checking can be done. [This check is also applied to
4105          atomic groups at runtime, but in a different way.] */
4106    
4107        else code[-ketoffset] = OP_KETRMAX + repeat_type;        else
4108            {
4109            uschar *ketcode = code - ketoffset;
4110            uschar *bracode = ketcode - GET(ketcode, 1);
4111            *ketcode = OP_KETRMAX + repeat_type;
4112            if (lengthptr == NULL && *bracode != OP_ONCE)
4113              {
4114              uschar *scode = bracode;
4115              do
4116                {
4117                if (could_be_empty_branch(scode, ketcode, utf8))
4118                  {
4119                  *bracode += OP_SBRA - OP_BRA;
4120                  break;
4121                  }
4122                scode += GET(scode, 1);
4123                }
4124              while (*scode == OP_ALT);
4125              }
4126            }
4127        }        }
4128    
4129        /* If previous is OP_FAIL, it was generated by an empty class [] in
4130        JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4131        by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4132        error above. We can just ignore the repeat in JS case. */
4133    
4134        else if (*previous == OP_FAIL) goto END_REPEAT;
4135    
4136      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
4137    
4138      else      else
# Line 2792  for (;; ptr++) Line 4141  for (;; ptr++)
4141        goto FAILED;        goto FAILED;
4142        }        }
4143    
4144      /* If the character following a repeat is '+', we wrap the entire repeated      /* If the character following a repeat is '+', or if certain optimization
4145      item inside OP_ONCE brackets. This is just syntactic sugar, taken from      tests above succeeded, possessive_quantifier is TRUE. For some of the
4146      Sun's Java package. The repeated item starts at tempcode, not at previous,      simpler opcodes, there is a special alternative opcode for this. For
4147      which might be the first part of a string whose (former) last char we      anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4148      repeated. However, we don't support '+' after a greediness '?'. */      The '+' notation is just syntactic sugar, taken from Sun's Java package,
4149        but the special opcodes can optimize it a bit. The repeated item starts at
4150        tempcode, not at previous, which might be the first part of a string whose
4151        (former) last char we repeated.
4152    
4153        Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4154        an 'upto' may follow. We skip over an 'exact' item, and then test the
4155        length of what remains before proceeding. */
4156    
4157      if (possessive_quantifier)      if (possessive_quantifier)
4158        {        {
4159        int len = code - tempcode;        int len;
4160        memmove(tempcode + 1+LINK_SIZE, tempcode, len);        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4161        code += 1 + LINK_SIZE;            *tempcode == OP_NOTEXACT)
4162        len += 1 + LINK_SIZE;          tempcode += _pcre_OP_lengths[*tempcode] +
4163        tempcode[0] = OP_ONCE;            ((*tempcode == OP_TYPEEXACT &&
4164        *code++ = OP_KET;               (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4165        PUTINC(code, 0, len);        len = code - tempcode;
4166        PUT(tempcode, 1, len);        if (len > 0) switch (*tempcode)
4167            {
4168            case OP_STAR:  *tempcode = OP_POSSTAR; break;
4169            case OP_PLUS:  *tempcode = OP_POSPLUS; break;
4170            case OP_QUERY: *tempcode = OP_POSQUERY; break;
4171            case OP_UPTO:  *tempcode = OP_POSUPTO; break;
4172    
4173            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
4174            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
4175            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4176            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
4177    
4178            case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
4179            case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
4180            case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4181            case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
4182    
4183            default:
4184            memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4185            code += 1 + LINK_SIZE;
4186            len += 1 + LINK_SIZE;
4187            tempcode[0] = OP_ONCE;
4188            *code++ = OP_KET;
4189            PUTINC(code, 0, len);
4190            PUT(tempcode, 1, len);
4191            break;
4192            }
4193        }        }
4194    
4195      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 2820  for (;; ptr++) Line 4202  for (;; ptr++)
4202      break;      break;
4203    
4204    
4205      /* Start of nested bracket sub-expression, or comment or lookahead or      /* ===================================================================*/
4206      lookbehind or option setting or condition. First deal with special things      /* Start of nested parenthesized sub-expression, or comment or lookahead or
4207      that can come after a bracket; all are introduced by ?, and the appearance      lookbehind or option setting or condition or all the other extended
4208      of any of them means that this is not a referencing group. They were      parenthesis forms.  */
     checked for validity in the first pass over the string, so we don't have to  
     check for syntax errors here.  */  
4209    
4210      case '(':      case '(':
4211      newoptions = options;      newoptions = options;
4212      skipbytes = 0;      skipbytes = 0;
4213        bravalue = OP_CBRA;
4214        save_hwm = cd->hwm;
4215        reset_bracount = FALSE;
4216    
4217        /* First deal with various "verbs" that can be introduced by '*'. */
4218    
4219        if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4220          {
4221          int i, namelen;
4222          const char *vn = verbnames;
4223          const uschar *name = ++ptr;
4224          previous = NULL;
4225          while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
4226          if (*ptr == ':')
4227            {
4228            *errorcodeptr = ERR59;   /* Not supported */
4229            goto FAILED;
4230            }
4231          if (*ptr != ')')
4232            {
4233            *errorcodeptr = ERR60;
4234            goto FAILED;
4235            }
4236          namelen = ptr - name;
4237          for (i = 0; i < verbcount; i++)
4238            {
4239            if (namelen == verbs[i].len &&
4240                strncmp((char *)name, vn, namelen) == 0)
4241              {
4242              *code = verbs[i].op;
4243              if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4244              break;
4245              }
4246            vn += verbs[i].len + 1;
4247            }
4248          if (i < verbcount) continue;
4249          *errorcodeptr = ERR60;
4250          goto FAILED;
4251          }
4252    
4253      if (*(++ptr) == '?')      /* Deal with the extended parentheses; all are introduced by '?', and the
4254        appearance of any of them means that this is not a capturing group. */
4255    
4256        else if (*ptr == '?')
4257        {        {
4258        int set, unset;        int i, set, unset, namelen;
4259        int *optset;        int *optset;
4260          const uschar *name;
4261          uschar *slot;
4262    
4263        switch (*(++ptr))        switch (*(++ptr))
4264          {          {
4265          case '#':                 /* Comment; skip to ket */          case '#':                 /* Comment; skip to ket */
4266          ptr++;          ptr++;
4267          while (*ptr != ')') ptr++;          while (*ptr != 0 && *ptr != ')') ptr++;
4268            if (*ptr == 0)
4269              {
4270              *errorcodeptr = ERR18;
4271              goto FAILED;
4272              }
4273          continue;          continue;
4274    
4275          case ':':                 /* Non-extracting bracket */  
4276            /* ------------------------------------------------------------ */
4277            case '|':                 /* Reset capture count for each branch */
4278            reset_bracount = TRUE;
4279            /* Fall through */
4280    
4281            /* ------------------------------------------------------------ */
4282            case ':':                 /* Non-capturing bracket */
4283          bravalue = OP_BRA;          bravalue = OP_BRA;
4284          ptr++;          ptr++;
4285          break;          break;
4286    
4287    
4288            /* ------------------------------------------------------------ */
4289          case '(':          case '(':
4290          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
4291    
4292          /* Condition to test for recursion */          /* A condition can be an assertion, a number (referring to a numbered
4293            group), a name (referring to a named group), or 'R', referring to
4294            recursion. R<digits> and R&name are also permitted for recursion tests.
4295    
4296            There are several syntaxes for testing a named group: (?(name)) is used
4297            by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4298    
4299            There are two unfortunate ambiguities, caused by history. (a) 'R' can
4300            be the recursive thing or the name 'R' (and similarly for 'R' followed
4301            by digits), and (b) a number could be a name that consists of digits.
4302            In both cases, we look for a name first; if not found, we try the other
4303            cases. */
4304    
4305            /* For conditions that are assertions, check the syntax, and then exit
4306            the switch. This will take control down to where bracketed groups,
4307            including assertions, are processed. */
4308    
4309            if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4310              break;
4311    
4312            /* Most other conditions use OP_CREF (a couple change to OP_RREF
4313            below), and all need to skip 3 bytes at the start of the group. */
4314    
4315            code[1+LINK_SIZE] = OP_CREF;
4316            skipbytes = 3;
4317            refsign = -1;
4318    
4319            /* Check for a test for recursion in a named group. */
4320    
4321          if (ptr[1] == 'R')          if (ptr[1] == 'R' && ptr[2] == '&')
4322            {            {
4323            code[1+LINK_SIZE] = OP_CREF;            terminator = -1;
4324            PUT2(code, 2+LINK_SIZE, CREF_RECURSE);            ptr += 2;
4325            skipbytes = 3;            code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
           ptr += 3;  
4326            }            }
4327    
4328          /* Condition to test for a numbered subpattern match. We know that          /* Check for a test for a named group's having been set, using the Perl
4329          if a digit follows ( then there will just be digits until ) because          syntax (?(<name>) or (?('name') */
         the syntax was checked in the first pass. */  
4330    
4331          else if ((digitab[ptr[1]] && ctype_digit) != 0)          else if (ptr[1] == '<')
4332            {            {
4333            int condref;                 /* Don't amalgamate; some compilers */            terminator = '>';
           condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */  
           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';  
           if (condref == 0)  
             {  
             *errorcodeptr = ERR35;  
             goto FAILED;  
             }  
4334            ptr++;            ptr++;
           code[1+LINK_SIZE] = OP_CREF;  
           PUT2(code, 2+LINK_SIZE, condref);  
           skipbytes = 3;  
4335            }            }
4336          /* For conditions that are assertions, we just fall through, having          else if (ptr[1] == '\'')
4337          set bravalue above. */            {
4338          break;            terminator = '\'';
4339              ptr++;
4340          case '=':                 /* Positive lookahead */            }
4341          bravalue = OP_ASSERT;          else
4342          ptr++;            {
4343          break;            terminator = 0;
4344              if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4345              }
4346    
4347          case '!':                 /* Negative lookahead */          /* We now expect to read a name; anything else is an error */
         bravalue = OP_ASSERT_NOT;  
         ptr++;  
         break;  
4348    
4349          case '<':                 /* Lookbehinds */          if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
         switch (*(++ptr))  
4350            {            {
4351            case '=':               /* Positive lookbehind */            ptr += 1;  /* To get the right offset */
4352            bravalue = OP_ASSERTBACK;            *errorcodeptr = ERR28;
4353            ptr++;            goto FAILED;
4354            break;            }
4355    
4356            case '!':               /* Negative lookbehind */          /* Read the name, but also get it as a number if it's all digits */
4357            bravalue = OP_ASSERTBACK_NOT;  
4358            recno = 0;
4359            name = ++ptr;
4360            while ((cd->ctypes[*ptr] & ctype_word) != 0)
4361              {
4362              if (recno >= 0)
4363                recno = ((digitab[*ptr] & ctype_digit) != 0)?
4364                  recno * 10 + *ptr - '0' : -1;
4365            ptr++;            ptr++;
           break;  
4366            }            }
4367          break;          namelen = ptr - name;
4368    
4369          case '>':                 /* One-time brackets */          if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4370          bravalue = OP_ONCE;            {
4371          ptr++;            ptr--;      /* Error offset */
4372          break;            *errorcodeptr = ERR26;
4373              goto FAILED;
4374              }
4375    
4376          case 'C':                 /* Callout - may be followed by digits; */          /* Do no further checking in the pre-compile phase. */
4377          previous_callout = code;  /* Save for later completion */  
4378          after_manual_callout = 1; /* Skip one item before completing */          if (lengthptr != NULL) break;
4379          *code++ = OP_CALLOUT;     /* Already checked that the terminating */  
4380            {                       /* closing parenthesis is present. */          /* In the real compile we do the work of looking for the actual
4381            int n = 0;          reference. If the string started with "+" or "-" we require the rest to
4382            while ((digitab[*(++ptr)] & ctype_digit) != 0)          be digits, in which case recno will be set. */
4383              n = n * 10 + *ptr - '0';  
4384            if (n > 255)          if (refsign > 0)
4385              {
4386              if (recno <= 0)
4387              {              {
4388              *errorcodeptr = ERR38;              *errorcodeptr = ERR58;
4389              goto FAILED;              goto FAILED;
4390              }              }
4391            *code++ = n;            recno = (refsign == '-')?
4392            PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */              cd->bracount - recno + 1 : recno +cd->bracount;
4393            PUT(code, LINK_SIZE, 0);                    /* Default length */            if (recno <= 0 || recno > cd->final_bracount)
4394            code += 2 * LINK_SIZE;              {
4395                *errorcodeptr = ERR15;
4396                goto FAILED;
4397                }
4398              PUT2(code, 2+LINK_SIZE, recno);
4399              break;
4400            }            }
         previous = NULL;  
         continue;  
4401    
4402          case 'P':                 /* Named subpattern handling */          /* Otherwise (did not start with "+" or "-"), start by looking for the
4403          if (*(++ptr) == '<')      /* Definition */          name. */
4404    
4405            slot = cd->name_table;
4406            for (i = 0; i < cd->names_found; i++)
4407            {            {
4408            int i, namelen;            if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4409            uschar *slot = cd->name_table;            slot += cd->name_entry_size;
4410            const uschar *name;     /* Don't amalgamate; some compilers */            }
           name = ++ptr;           /* grumble at autoincrement in declaration */  
4411    
4412            while (*ptr++ != '>');          /* Found a previous named subpattern */
           namelen = ptr - name - 1;  
4413    
4414            for (i = 0; i < cd->names_found; i++)          if (i < cd->names_found)
4415              {
4416              recno = GET2(slot, 0);
4417              PUT2(code, 2+LINK_SIZE, recno);
4418              }
4419    
4420            /* Search the pattern for a forward reference */
4421    
4422            else if ((i = find_parens(ptr, cd, name, namelen,
4423                            (options & PCRE_EXTENDED) != 0)) > 0)
4424              {
4425              PUT2(code, 2+LINK_SIZE, i);
4426              }
4427    
4428            /* If terminator == 0 it means that the name followed directly after
4429            the opening parenthesis [e.g. (?(abc)...] and in this case there are
4430            some further alternatives to try. For the cases where terminator != 0
4431            [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4432            now checked all the possibilities, so give an error. */
4433    
4434            else if (terminator != 0)
4435              {
4436              *errorcodeptr = ERR15;
4437              goto FAILED;
4438              }
4439    
4440            /* Check for (?(R) for recursion. Allow digits after R to specify a
4441            specific group number. */
4442    
4443            else if (*name == 'R')
4444              {
4445              recno = 0;
4446              for (i = 1; i < namelen; i++)
4447                {
4448                if ((digitab[name[i]] & ctype_digit) == 0)
4449                  {
4450                  *errorcodeptr = ERR15;
4451                  goto FAILED;
4452                  }
4453                recno = recno * 10 + name[i] - '0';
4454                }
4455              if (recno == 0) recno = RREF_ANY;
4456              code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
4457              PUT2(code, 2+LINK_SIZE, recno);
4458              }
4459    
4460            /* Similarly, check for the (?(DEFINE) "condition", which is always
4461            false. */
4462    
4463            else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4464              {
4465              code[1+LINK_SIZE] = OP_DEF;
4466              skipbytes = 1;
4467              }
4468    
4469            /* Check for the "name" actually being a subpattern number. We are
4470            in the second pass here, so final_bracount is set. */
4471    
4472            else if (recno > 0 && recno <= cd->final_bracount)
4473              {
4474              PUT2(code, 2+LINK_SIZE, recno);
4475              }
4476    
4477            /* Either an unidentified subpattern, or a reference to (?(0) */
4478    
4479            else
4480              {
4481              *errorcodeptr = (recno == 0)? ERR35: ERR15;
4482              goto FAILED;
4483              }
4484            break;
4485    
4486    
4487            /* ------------------------------------------------------------ */
4488            case '=':                 /* Positive lookahead */
4489            bravalue = OP_ASSERT;
4490            ptr++;
4491            break;
4492    
4493    
4494            /* ------------------------------------------------------------ */
4495            case '!':                 /* Negative lookahead */
4496            ptr++;
4497            if (*ptr == ')')          /* Optimize (?!) */
4498              {
4499              *code++ = OP_FAIL;
4500              previous = NULL;
4501              continue;
4502              }
4503            bravalue = OP_ASSERT_NOT;
4504            break;
4505    
4506    
4507            /* ------------------------------------------------------------ */
4508            case '<':                 /* Lookbehind or named define */
4509            switch (ptr[1])
4510              {
4511              case '=':               /* Positive lookbehind */
4512              bravalue = OP_ASSERTBACK;
4513              ptr += 2;
4514              break;
4515    
4516              case '!':               /* Negative lookbehind */
4517              bravalue = OP_ASSERTBACK_NOT;
4518              ptr += 2;
4519              break;
4520    
4521              default:                /* Could be name define, else bad */
4522              if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4523              ptr++;                  /* Correct offset for error */
4524              *errorcodeptr = ERR24;
4525              goto FAILED;
4526              }
4527            break;
4528    
4529    
4530            /* ------------------------------------------------------------ */
4531            case '>':                 /* One-time brackets */
4532            bravalue = OP_ONCE;
4533            ptr++;
4534            break;
4535    
4536    
4537            /* ------------------------------------------------------------ */
4538            case 'C':                 /* Callout - may be followed by digits; */
4539            previous_callout = code;  /* Save for later completion */
4540            after_manual_callout = 1; /* Skip one item before completing */
4541            *code++ = OP_CALLOUT;
4542              {
4543              int n = 0;
4544              while ((digitab[*(++ptr)] & ctype_digit) != 0)
4545                n = n * 10 + *ptr - '0';
4546              if (*ptr != ')')
4547                {
4548                *errorcodeptr = ERR39;
4549                goto FAILED;
4550                }
4551              if (n > 255)
4552                {
4553                *errorcodeptr = ERR38;
4554                goto FAILED;
4555                }
4556              *code++ = n;
4557              PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
4558              PUT(code, LINK_SIZE, 0);                    /* Default length */
4559              code += 2 * LINK_SIZE;
4560              }
4561            previous = NULL;
4562            continue;
4563    
4564    
4565            /* ------------------------------------------------------------ */
4566            case 'P':                 /* Python-style named subpattern handling */
4567            if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
4568              {
4569              is_recurse = *ptr == '>';
4570              terminator = ')';
4571              goto NAMED_REF_OR_RECURSE;
4572              }
4573            else if (*ptr != '<')    /* Test for Python-style definition */
4574              {
4575              *errorcodeptr = ERR41;
4576              goto FAILED;
4577              }
4578            /* Fall through to handle (?P< as (?< is handled */
4579    
4580    
4581            /* ------------------------------------------------------------ */
4582            DEFINE_NAME:    /* Come here from (?< handling */
4583            case '\'':
4584              {
4585              terminator = (*ptr == '<')? '>' : '\'';
4586              name = ++ptr;
4587    
4588              while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4589              namelen = ptr - name;
4590    
4591              /* In the pre-compile phase, just do a syntax check. */
4592    
4593              if (lengthptr != NULL)
4594              {              {
4595              int crc = memcmp(name, slot+2, namelen);