/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 87 by nigel, Sat Feb 24 21:41:21 2007 UTC | revision 488 by ph10, Mon Jan 11 15:29:42 2010 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2006 University of Cambridge             Copyright (c) 1997-2010 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45    #ifdef HAVE_CONFIG_H
46    #include "config.h"
47    #endif
48    
49    #define NLBLOCK cd             /* Block containing newline information */
50    #define PSSTART start_pattern  /* Field containing processed string start */
51    #define PSEND   end_pattern    /* Field containing processed string end */
52    
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
56  /* When DEBUG is defined, we need the pcre_printint() function, which is also  /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57  used by pcretest. DEBUG is not defined when building a production library. */  also used by pcretest. PCRE_DEBUG is not defined when building a production
58    library. */
59    
60  #ifdef DEBUG  #ifdef PCRE_DEBUG
61  #include "pcre_printint.src"  #include "pcre_printint.src"
62  #endif  #endif
63    
64    
65    /* Macro for setting individual bits in class bitmaps. */
66    
67    #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
68    
69    /* Maximum length value to check against when making sure that the integer that
70    holds the compiled pattern length does not overflow. We make it a bit less than
71    INT_MAX to allow for adding in group terminating bytes, so that we don't have
72    to check them every time. */
73    
74    #define OFLOW_MAX (INT_MAX - 20)
75    
76    
77  /*************************************************  /*************************************************
78  *      Code parameters and static tables         *  *      Code parameters and static tables         *
79  *************************************************/  *************************************************/
80    
81  /* Maximum number of items on the nested bracket stacks at compile time. This  /* This value specifies the size of stack workspace that is used during the
82  applies to the nesting of all kinds of parentheses. It does not limit  first pre-compile phase that determines how much memory is required. The regex
83  un-nested, non-capturing parentheses. This number can be made bigger if  is partly compiled into this space, but the compiled parts are discarded as
84  necessary - it is used to dimension one int and one unsigned char vector at  soon as they can be, so that hopefully there will never be an overrun. The code
85  compile time. */  does, however, check for an overrun. The largest amount I've seen used is 218,
86    so this number is very generous.
87    
88    The same workspace is used during the second, actual compile phase for
89    remembering forward references to groups so that they can be filled in at the
90    end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
91    is 4 there is plenty of room. */
92    
93  #define BRASTACK_SIZE 200  #define COMPILE_WORK_SIZE (4096)
94    
95    
96  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 72  are simple data values; negative values Line 98  are simple data values; negative values
98  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
99  is invalid. */  is invalid. */
100    
101  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC
102    
103    /* This is the "normal" table for ASCII systems or for EBCDIC systems running
104    in UTF-8 mode. */
105    
106  static const short int escapes[] = {  static const short int escapes[] = {
107       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,                       0,
108       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,                       0,
109     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */       0,                       0,
110       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */       0,                       0,
111  -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */       0,                       0,
112  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */       CHAR_COLON,              CHAR_SEMICOLON,
113     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */       CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
114       0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */       CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
115  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */       CHAR_COMMERCIAL_AT,      -ESC_A,
116       0,      0, -ESC_z                                            /* x - z */       -ESC_B,                  -ESC_C,
117         -ESC_D,                  -ESC_E,
118         0,                       -ESC_G,
119         -ESC_H,                  0,
120         0,                       -ESC_K,
121         0,                       0,
122         0,                       0,
123         -ESC_P,                  -ESC_Q,
124         -ESC_R,                  -ESC_S,
125         0,                       0,
126         -ESC_V,                  -ESC_W,
127         -ESC_X,                  0,
128         -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
129         CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
130         CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
131         CHAR_GRAVE_ACCENT,       7,
132         -ESC_b,                  0,
133         -ESC_d,                  ESC_e,
134         ESC_f,                   0,
135         -ESC_h,                  0,
136         0,                       -ESC_k,
137         0,                       0,
138         ESC_n,                   0,
139         -ESC_p,                  0,
140         ESC_r,                   -ESC_s,
141         ESC_tee,                 0,
142         -ESC_v,                  -ESC_w,
143         0,                       0,
144         -ESC_z
145  };  };
146    
147  #else         /* This is the "abnormal" table for EBCDIC systems */  #else
148    
149    /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
150    
151  static const short int escapes[] = {  static const short int escapes[] = {
152  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
153  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 96  static const short int escapes[] = { Line 157  static const short int escapes[] = {
157  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
158  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
159  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
160  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
161  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
162  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
163  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
164  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
165  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
166  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
167  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
168  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
169  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
170  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
171  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
172  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
173  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
174  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
# Line 115  static const short int escapes[] = { Line 176  static const short int escapes[] = {
176  #endif  #endif
177    
178    
179  /* Tables of names of POSIX character classes and their lengths. The list is  /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
180  terminated by a zero length entry. The first three must be alpha, lower, upper,  searched linearly. Put all the names into a single string, in order to reduce
181  as this is assumed for handling case independence. */  the number of relocations when a shared library is dynamically linked. The
182    string is built from string macros so that it works in UTF-8 mode on EBCDIC
183  static const char *const posix_names[] = {  platforms. */
184    "alpha", "lower", "upper",  
185    "alnum", "ascii", "blank", "cntrl", "digit", "graph",  typedef struct verbitem {
186    "print", "punct", "space", "word",  "xdigit" };    int   len;
187      int   op;
188    } verbitem;
189    
190    static const char verbnames[] =
191      STRING_ACCEPT0
192      STRING_COMMIT0
193      STRING_F0
194      STRING_FAIL0
195      STRING_PRUNE0
196      STRING_SKIP0
197      STRING_THEN;
198    
199    static const verbitem verbs[] = {
200      { 6, OP_ACCEPT },
201      { 6, OP_COMMIT },
202      { 1, OP_FAIL },
203      { 4, OP_FAIL },
204      { 5, OP_PRUNE },
205      { 4, OP_SKIP  },
206      { 4, OP_THEN  }
207    };
208    
209    static const int verbcount = sizeof(verbs)/sizeof(verbitem);
210    
211    
212    /* Tables of names of POSIX character classes and their lengths. The names are
213    now all in a single string, to reduce the number of relocations when a shared
214    library is dynamically loaded. The list of lengths is terminated by a zero
215    length entry. The first three must be alpha, lower, upper, as this is assumed
216    for handling case independence. */
217    
218    static const char posix_names[] =
219      STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
220      STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
221      STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
222      STRING_word0  STRING_xdigit;
223    
224  static const uschar posix_name_lengths[] = {  static const uschar posix_name_lengths[] = {
225    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
# Line 155  static const int posix_class_maps[] = { Line 252  static const int posix_class_maps[] = {
252  };  };
253    
254    
255  /* The texts of compile-time error messages. These are "char *" because they  #define STRING(a)  # a
256  are passed to the outside world. */  #define XSTRING(s) STRING(s)
257    
258  static const char *error_texts[] = {  /* The texts of compile-time error messages. These are "char *" because they
259    "no error",  are passed to the outside world. Do not ever re-use any error number, because
260    "\\ at end of pattern",  they are documented. Always add a new error instead. Messages marked DEAD below
261    "\\c at end of pattern",  are no longer used. This used to be a table of strings, but in order to reduce
262    "unrecognized character follows \\",  the number of relocations needed when a shared library is loaded dynamically,
263    "numbers out of order in {} quantifier",  it is now one long string. We cannot use a table of offsets, because the
264    lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
265    simply count through to the one we want - this isn't a performance issue
266    because these strings are used only when there is a compilation error. */
267    
268    static const char error_texts[] =
269      "no error\0"
270      "\\ at end of pattern\0"
271      "\\c at end of pattern\0"
272      "unrecognized character follows \\\0"
273      "numbers out of order in {} quantifier\0"
274    /* 5 */    /* 5 */
275    "number too big in {} quantifier",    "number too big in {} quantifier\0"
276    "missing terminating ] for character class",    "missing terminating ] for character class\0"
277    "invalid escape sequence in character class",    "invalid escape sequence in character class\0"
278    "range out of order in character class",    "range out of order in character class\0"
279    "nothing to repeat",    "nothing to repeat\0"
280    /* 10 */    /* 10 */
281    "operand of unlimited repeat could match the empty string",    "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
282    "internal error: unexpected repeat",    "internal error: unexpected repeat\0"
283    "unrecognized character after (?",    "unrecognized character after (? or (?-\0"
284    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class\0"
285    "missing )",    "missing )\0"
286    /* 15 */    /* 15 */
287    "reference to non-existent subpattern",    "reference to non-existent subpattern\0"
288    "erroffset passed as NULL",    "erroffset passed as NULL\0"
289    "unknown option bit(s) set",    "unknown option bit(s) set\0"
290    "missing ) after comment",    "missing ) after comment\0"
291    "parentheses nested too deeply",    "parentheses nested too deeply\0"  /** DEAD **/
292    /* 20 */    /* 20 */
293    "regular expression too large",    "regular expression is too large\0"
294    "failed to get memory",    "failed to get memory\0"
295    "unmatched parentheses",    "unmatched parentheses\0"
296    "internal error: code overflow",    "internal error: code overflow\0"
297    "unrecognized character after (?<",    "unrecognized character after (?<\0"
298    /* 25 */    /* 25 */
299    "lookbehind assertion is not fixed length",    "lookbehind assertion is not fixed length\0"
300    "malformed number after (?(",    "malformed number or name after (?(\0"
301    "conditional group contains more than two branches",    "conditional group contains more than two branches\0"
302    "assertion expected after (?(",    "assertion expected after (?(\0"
303    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )\0"
304    /* 30 */    /* 30 */
305    "unknown POSIX class name",    "unknown POSIX class name\0"
306    "POSIX collating elements are not supported",    "POSIX collating elements are not supported\0"
307    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support\0"
308    "spare error",    "spare error\0"  /** DEAD **/
309    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large\0"
310    /* 35 */    /* 35 */
311    "invalid condition (?(0)",    "invalid condition (?(0)\0"
312    "\\C not allowed in lookbehind assertion",    "\\C not allowed in lookbehind assertion\0"
313    "PCRE does not support \\L, \\l, \\N, \\U, or \\u",    "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
314    "number after (?C is > 255",    "number after (?C is > 255\0"
315    "closing ) for (?C expected",    "closing ) for (?C expected\0"
316    /* 40 */    /* 40 */
317    "recursive call could loop indefinitely",    "recursive call could loop indefinitely\0"
318    "unrecognized character after (?P",    "unrecognized character after (?P\0"
319    "syntax error after (?P",    "syntax error in subpattern name (missing terminator)\0"
320    "two named groups have the same name",    "two named subpatterns have the same name\0"
321    "invalid UTF-8 string",    "invalid UTF-8 string\0"
322    /* 45 */    /* 45 */
323    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled\0"
324    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence\0"
325    "unknown property name after \\P or \\p"    "unknown property name after \\P or \\p\0"
326  };    "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
327      "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
328      /* 50 */
329      "repeated subpattern is too long\0"    /** DEAD **/
330      "octal value is greater than \\377 (not in UTF-8 mode)\0"
331      "internal error: overran compiling workspace\0"
332      "internal error: previously-checked referenced subpattern not found\0"
333      "DEFINE group contains more than one branch\0"
334      /* 55 */
335      "repeating a DEFINE group is not allowed\0"
336      "inconsistent NEWLINE options\0"
337      "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
338      "a numbered reference must not be zero\0"
339      "(*VERB) with an argument is not supported\0"
340      /* 60 */
341      "(*VERB) not recognized\0"
342      "number is too big\0"
343      "subpattern name expected\0"
344      "digit expected after (?+\0"
345      "] is an invalid data character in JavaScript compatibility mode\0"
346      /* 65 */
347      "different names for subpatterns of the same number are not allowed";
348    
349    
350  /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
# Line 235  For convenience, we use the same bit def Line 363  For convenience, we use the same bit def
363    
364  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
365    
366  #if !EBCDIC    /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC
367    
368    /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
369    UTF-8 mode. */
370    
371  static const unsigned char digitab[] =  static const unsigned char digitab[] =
372    {    {
373    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 271  static const unsigned char digitab[] = Line 403  static const unsigned char digitab[] =
403    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
404    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
405    
406  #else          /* This is the "abnormal" case, for EBCDIC systems */  #else
407    
408    /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
409    
410  static const unsigned char digitab[] =  static const unsigned char digitab[] =
411    {    {
412    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 285  static const unsigned char digitab[] = Line 420  static const unsigned char digitab[] =
420    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
421    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
422    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
423    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88-     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
424    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
425    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
426    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
# Line 319  static const unsigned char ebcdic_charta Line 454  static const unsigned char ebcdic_charta
454    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
455    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
456    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
457    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88-  */    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
458    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
459    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
460    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
# Line 346  static const unsigned char ebcdic_charta Line 481  static const unsigned char ebcdic_charta
481  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
482    
483  static BOOL  static BOOL
484    compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
485      int *, int *, branch_chain *, compile_data *);      int *, int *, branch_chain *, compile_data *, int *);
486    
487    
488    
489    /*************************************************
490    *            Find an error text                  *
491    *************************************************/
492    
493    /* The error texts are now all in one long string, to save on relocations. As
494    some of the text is of unknown length, we can't use a table of offsets.
495    Instead, just count through the strings. This is not a performance issue
496    because it happens only when there has been a compilation error.
497    
498    Argument:   the error number
499    Returns:    pointer to the error string
500    */
501    
502    static const char *
503    find_error_text(int n)
504    {
505    const char *s = error_texts;
506    for (; n > 0; n--) while (*s++ != 0) {};
507    return s;
508    }
509    
510    
511  /*************************************************  /*************************************************
# Line 357  static BOOL Line 514  static BOOL
514    
515  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
516  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
517  encodes one of the more complicated things such as \d. When UTF-8 is enabled,  encodes one of the more complicated things such as \d. A backreference to group
518  a positive value greater than 255 may be returned. On entry, ptr is pointing at  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
519  the \. On exit, it is on the final character of the escape sequence.  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
520    ptr is pointing at the \. On exit, it is on the final character of the escape
521    sequence.
522    
523  Arguments:  Arguments:
524    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
# Line 370  Arguments: Line 529  Arguments:
529    
530  Returns:         zero or positive => a data character  Returns:         zero or positive => a data character
531                   negative => a special escape sequence                   negative => a special escape sequence
532                   on error, errorptr is set                   on error, errorcodeptr is set
533  */  */
534    
535  static int  static int
# Line 388  ptr--;                            /* Set Line 547  ptr--;                            /* Set
547    
548  if (c == 0) *errorcodeptr = ERR1;  if (c == 0) *errorcodeptr = ERR1;
549    
550  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in  /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
551  a table. A non-zero result is something that can be returned immediately.  in a table. A non-zero result is something that can be returned immediately.
552  Otherwise further processing may be required. */  Otherwise further processing may be required. */
553    
554  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
555  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < CHAR_0 || c > CHAR_z) {}                     /* Not alphanumeric */
556  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - CHAR_0]) != 0) c = i;
557    
558  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
559  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
560  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
561  #endif  #endif
562    
# Line 406  else if ((i = escapes[c - 0x48]) != 0) Line 565  else if ((i = escapes[c - 0x48]) != 0)
565  else  else
566    {    {
567    const uschar *oldptr;    const uschar *oldptr;
568      BOOL braced, negated;
569    
570    switch (c)    switch (c)
571      {      {
572      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
573      error. */      error. */
574    
575      case 'l':      case CHAR_l:
576      case 'L':      case CHAR_L:
577      case 'N':      case CHAR_N:
578      case 'u':      case CHAR_u:
579      case 'U':      case CHAR_U:
580      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
581      break;      break;
582    
583        /* \g must be followed by one of a number of specific things:
584    
585        (1) A number, either plain or braced. If positive, it is an absolute
586        backreference. If negative, it is a relative backreference. This is a Perl
587        5.10 feature.
588    
589        (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
590        is part of Perl's movement towards a unified syntax for back references. As
591        this is synonymous with \k{name}, we fudge it up by pretending it really
592        was \k.
593    
594        (3) For Oniguruma compatibility we also support \g followed by a name or a
595        number either in angle brackets or in single quotes. However, these are
596        (possibly recursive) subroutine calls, _not_ backreferences. Just return
597        the -ESC_g code (cf \k). */
598    
599        case CHAR_g:
600        if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
601          {
602          c = -ESC_g;
603          break;
604          }
605    
606        /* Handle the Perl-compatible cases */
607    
608        if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
609          {
610          const uschar *p;
611          for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
612            if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
613          if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
614            {
615            c = -ESC_k;
616            break;
617            }
618          braced = TRUE;
619          ptr++;
620          }
621        else braced = FALSE;
622    
623        if (ptr[1] == CHAR_MINUS)
624          {
625          negated = TRUE;
626          ptr++;
627          }
628        else negated = FALSE;
629    
630        c = 0;
631        while ((digitab[ptr[1]] & ctype_digit) != 0)
632          c = c * 10 + *(++ptr) - CHAR_0;
633    
634        if (c < 0)   /* Integer overflow */
635          {
636          *errorcodeptr = ERR61;
637          break;
638          }
639    
640        if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
641          {
642          *errorcodeptr = ERR57;
643          break;
644          }
645    
646        if (c == 0)
647          {
648          *errorcodeptr = ERR58;
649          break;
650          }
651    
652        if (negated)
653          {
654          if (c > bracount)
655            {
656            *errorcodeptr = ERR15;
657            break;
658            }
659          c = bracount - (c - 1);
660          }
661    
662        c = -(ESC_REF + c);
663        break;
664    
665      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
666      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
667      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 431  else Line 674  else
674      value is greater than 377, the least significant 8 bits are taken. Inside a      value is greater than 377, the least significant 8 bits are taken. Inside a
675      character class, \ followed by a digit is always an octal number. */      character class, \ followed by a digit is always an octal number. */
676    
677      case '1': case '2': case '3': case '4': case '5':      case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
678      case '6': case '7': case '8': case '9':      case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
679    
680      if (!isclass)      if (!isclass)
681        {        {
682        oldptr = ptr;        oldptr = ptr;
683        c -= '0';        c -= CHAR_0;
684        while ((digitab[ptr[1]] & ctype_digit) != 0)        while ((digitab[ptr[1]] & ctype_digit) != 0)
685          c = c * 10 + *(++ptr) - '0';          c = c * 10 + *(++ptr) - CHAR_0;
686          if (c < 0)    /* Integer overflow */
687            {
688            *errorcodeptr = ERR61;
689            break;
690            }
691        if (c < 10 || c <= bracount)        if (c < 10 || c <= bracount)
692          {          {
693          c = -(ESC_REF + c);          c = -(ESC_REF + c);
# Line 452  else Line 700  else
700      generates a binary zero byte and treats the digit as a following literal.      generates a binary zero byte and treats the digit as a following literal.
701      Thus we have to pull back the pointer by one. */      Thus we have to pull back the pointer by one. */
702    
703      if ((c = *ptr) >= '8')      if ((c = *ptr) >= CHAR_8)
704        {        {
705        ptr--;        ptr--;
706        c = 0;        c = 0;
# Line 460  else Line 708  else
708        }        }
709    
710      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
711      larger first octal digit. */      larger first octal digit. The original code used just to take the least
712        significant 8 bits of octal numbers (I think this is what early Perls used
713      case '0':      to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
714      c -= '0';      than 3 octal digits. */
715      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')  
716          c = c * 8 + *(++ptr) - '0';      case CHAR_0:
717      c &= 255;     /* Take least significant 8 bits */      c -= CHAR_0;
718        while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
719            c = c * 8 + *(++ptr) - CHAR_0;
720        if (!utf8 && c > 255) *errorcodeptr = ERR51;
721      break;      break;
722    
723      /* \x is complicated. \x{ddd} is a character number which can be greater      /* \x is complicated. \x{ddd} is a character number which can be greater
724      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
725      treated as a data character. */      treated as a data character. */
726    
727      case 'x':      case CHAR_x:
728      if (ptr[1] == '{')      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
729        {        {
730        const uschar *pt = ptr + 2;        const uschar *pt = ptr + 2;
731        int count = 0;        int count = 0;
# Line 483  else Line 734  else
734        while ((digitab[*pt] & ctype_xdigit) != 0)        while ((digitab[*pt] & ctype_xdigit) != 0)
735          {          {
736          register int cc = *pt++;          register int cc = *pt++;
737          if (c == 0 && cc == '0') continue;     /* Leading zeroes */          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
738          count++;          count++;
739    
740  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
741          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
742          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
743  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
744          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
745          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
746  #endif  #endif
747          }          }
748    
749        if (*pt == '}')        if (*pt == CHAR_RIGHT_CURLY_BRACKET)
750          {          {
751          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
752          ptr = pt;          ptr = pt;
# Line 511  else Line 762  else
762      c = 0;      c = 0;
763      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
764        {        {
765        int cc;                               /* Some compilers don't like ++ */        int cc;                                  /* Some compilers don't like */
766        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                           /* ++ in initializers */
767  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
768        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
769        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
770  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
771        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
772        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
773  #endif  #endif
774        }        }
775      break;      break;
776    
777      /* Other special escapes not starting with a digit are straightforward */      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
778        This coding is ASCII-specific, but then the whole concept of \cx is
779        ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
780    
781      case 'c':      case CHAR_c:
782      c = *(++ptr);      c = *(++ptr);
783      if (c == 0)      if (c == 0)
784        {        {
785        *errorcodeptr = ERR2;        *errorcodeptr = ERR2;
786        return 0;        break;
787        }        }
788    
789      /* A letter is upper-cased; then the 0x40 bit is flipped. This coding  #ifndef EBCDIC  /* ASCII/UTF-8 coding */
790      is ASCII-specific, but then the whole concept of \cx is ASCII-specific.      if (c >= CHAR_a && c <= CHAR_z) c -= 32;
     (However, an EBCDIC equivalent has now been added.) */  
   
 #if !EBCDIC    /* ASCII coding */  
     if (c >= 'a' && c <= 'z') c -= 32;  
791      c ^= 0x40;      c ^= 0x40;
792  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
793      if (c >= 'a' && c <= 'z') c += 64;      if (c >= CHAR_a && c <= CHAR_z) c += 64;
794      c ^= 0xC0;      c ^= 0xC0;
795  #endif  #endif
796      break;      break;
797    
798      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any      /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
799      other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,      other alphanumeric following \ is an error if PCRE_EXTRA was set;
800      for Perl compatibility, it is a literal. This code looks a bit odd, but      otherwise, for Perl compatibility, it is a literal. This code looks a bit
801      there used to be some cases other than the default, and there may be again      odd, but there used to be some cases other than the default, and there may
802      in future, so I haven't "optimized" it. */      be again in future, so I haven't "optimized" it. */
803    
804      default:      default:
805      if ((options & PCRE_EXTRA) != 0) switch(c)      if ((options & PCRE_EXTRA) != 0) switch(c)
# Line 603  if (c == 0) goto ERROR_RETURN; Line 852  if (c == 0) goto ERROR_RETURN;
852  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
853  negation. */  negation. */
854    
855  if (c == '{')  if (c == CHAR_LEFT_CURLY_BRACKET)
856    {    {
857    if (ptr[1] == '^')    if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
858      {      {
859      *negptr = TRUE;      *negptr = TRUE;
860      ptr++;      ptr++;
861      }      }
862    for (i = 0; i < sizeof(name) - 1; i++)    for (i = 0; i < (int)sizeof(name) - 1; i++)
863      {      {
864      c = *(++ptr);      c = *(++ptr);
865      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
866      if (c == '}') break;      if (c == CHAR_RIGHT_CURLY_BRACKET) break;
867      name[i] = c;      name[i] = c;
868      }      }
869    if (c !='}') goto ERROR_RETURN;    if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
870    name[i] = 0;    name[i] = 0;
871    }    }
872    
# Line 639  top = _pcre_utt_size; Line 888  top = _pcre_utt_size;
888  while (bot < top)  while (bot < top)
889    {    {
890    i = (bot + top) >> 1;    i = (bot + top) >> 1;
891    c = strcmp(name, _pcre_utt[i].name);    c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
892    if (c == 0)    if (c == 0)
893      {      {
894      *dptr = _pcre_utt[i].value;      *dptr = _pcre_utt[i].value;
# Line 682  is_counted_repeat(const uschar *p) Line 931  is_counted_repeat(const uschar *p)
931  {  {
932  if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
933  while ((digitab[*p] & ctype_digit) != 0) p++;  while ((digitab[*p] & ctype_digit) != 0) p++;
934  if (*p == '}') return TRUE;  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
935    
936  if (*p++ != ',') return FALSE;  if (*p++ != CHAR_COMMA) return FALSE;
937  if (*p == '}') return TRUE;  if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
938    
939  if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
940  while ((digitab[*p] & ctype_digit) != 0) p++;  while ((digitab[*p] & ctype_digit) != 0) p++;
941    
942  return (*p == '}');  return (*p == CHAR_RIGHT_CURLY_BRACKET);
943  }  }
944    
945    
# Line 723  int max = -1; Line 972  int max = -1;
972  /* Read the minimum value and do a paranoid check: a negative value indicates  /* Read the minimum value and do a paranoid check: a negative value indicates
973  an integer overflow. */  an integer overflow. */
974    
975  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
976  if (min < 0 || min > 65535)  if (min < 0 || min > 65535)
977    {    {
978    *errorcodeptr = ERR5;    *errorcodeptr = ERR5;
# Line 733  if (min < 0 || min > 65535) Line 982  if (min < 0 || min > 65535)
982  /* Read the maximum value if there is one, and again do a paranoid check on its size.  /* Read the maximum value if there is one, and again do a paranoid check on its size.
983  Also, max must not be less than min. */  Also, max must not be less than min. */
984    
985  if (*p == '}') max = min; else  if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
986    {    {
987    if (*(++p) != '}')    if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
988      {      {
989      max = 0;      max = 0;
990      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
991      if (max < 0 || max > 65535)      if (max < 0 || max > 65535)
992        {        {
993        *errorcodeptr = ERR5;        *errorcodeptr = ERR5;
# Line 763  return p; Line 1012  return p;
1012    
1013    
1014  /*************************************************  /*************************************************
1015    *  Subroutine for finding forward reference      *
1016    *************************************************/
1017    
1018    /* This recursive function is called only from find_parens() below. The
1019    top-level call starts at the beginning of the pattern. All other calls must
1020    start at a parenthesis. It scans along a pattern's text looking for capturing
1021    subpatterns, and counting them. If it finds a named pattern that matches the
1022    name it is given, it returns its number. Alternatively, if the name is NULL, it
1023    returns when it reaches a given numbered subpattern. We know that if (?P< is
1024    encountered, the name will be terminated by '>' because that is checked in the
1025    first pass. Recursion is used to keep track of subpatterns that reset the
1026    capturing group numbers - the (?| feature.
1027    
1028    Arguments:
1029      ptrptr       address of the current character pointer (updated)
1030      cd           compile background data
1031      name         name to seek, or NULL if seeking a numbered subpattern
1032      lorn         name length, or subpattern number if name is NULL
1033      xmode        TRUE if we are in /x mode
1034      count        pointer to the current capturing subpattern number (updated)
1035    
1036    Returns:       the number of the found subpattern, or -1 if not found
1037    */
1038    
1039    static int
1040    find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1041      BOOL xmode, int *count)
1042    {
1043    uschar *ptr = *ptrptr;
1044    int start_count = *count;
1045    int hwm_count = start_count;
1046    BOOL dup_parens = FALSE;
1047    
1048    /* If the first character is a parenthesis, check on the type of group we are
1049    dealing with. The very first call may not start with a parenthesis. */
1050    
1051    if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1052      {
1053      if (ptr[1] == CHAR_QUESTION_MARK &&
1054          ptr[2] == CHAR_VERTICAL_LINE)
1055        {
1056        ptr += 3;
1057        dup_parens = TRUE;
1058        }
1059    
1060      /* Handle a normal, unnamed capturing parenthesis */
1061    
1062      else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1063        {
1064        *count += 1;
1065        if (name == NULL && *count == lorn) return *count;
1066        ptr++;
1067        }
1068    
1069      /* Handle a condition. If it is an assertion, just carry on so that it
1070      is processed as normal. If not, skip to the closing parenthesis of the
1071      condition (there can't be any nested parens). */
1072    
1073      else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1074        {
1075        ptr += 2;
1076        if (ptr[1] != CHAR_QUESTION_MARK)
1077          {
1078          while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1079          if (*ptr != 0) ptr++;
1080          }
1081        }
1082    
1083      /* We have either (? or (* and not a condition */
1084    
1085      else
1086        {
1087        ptr += 2;
1088        if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */
1089    
1090        /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1091    
1092        if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1093            ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1094          {
1095          int term;
1096          const uschar *thisname;
1097          *count += 1;
1098          if (name == NULL && *count == lorn) return *count;
1099          term = *ptr++;
1100          if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1101          thisname = ptr;
1102          while (*ptr != term) ptr++;
1103          if (name != NULL && lorn == ptr - thisname &&
1104              strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1105            return *count;
1106          term++;
1107          }
1108        }
1109      }
1110    
1111    /* Past any initial parenthesis handling, scan for parentheses or vertical
1112    bars. */
1113    
1114    for (; *ptr != 0; ptr++)
1115      {
1116      /* Skip over backslashed characters and also entire \Q...\E */
1117    
1118      if (*ptr == CHAR_BACKSLASH)
1119        {
1120        if (*(++ptr) == 0) goto FAIL_EXIT;
1121        if (*ptr == CHAR_Q) for (;;)
1122          {
1123          while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1124          if (*ptr == 0) goto FAIL_EXIT;
1125          if (*(++ptr) == CHAR_E) break;
1126          }
1127        continue;
1128        }
1129    
1130      /* Skip over character classes; this logic must be similar to the way they
1131      are handled for real. If the first character is '^', skip it. Also, if the
1132      first few characters (either before or after ^) are \Q\E or \E we skip them
1133      too. This makes for compatibility with Perl. Note the use of STR macros to
1134      encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1135    
1136      if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1137        {
1138        BOOL negate_class = FALSE;
1139        for (;;)
1140          {
1141          if (ptr[1] == CHAR_BACKSLASH)
1142            {
1143            if (ptr[2] == CHAR_E)
1144              ptr+= 2;
1145            else if (strncmp((const char *)ptr+2,
1146                     STR_Q STR_BACKSLASH STR_E, 3) == 0)
1147              ptr += 4;
1148            else
1149              break;
1150            }
1151          else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1152            {
1153            negate_class = TRUE;
1154            ptr++;
1155            }
1156          else break;
1157          }
1158    
1159        /* If the next character is ']', it is a data character that must be
1160        skipped, except in JavaScript compatibility mode. */
1161    
1162        if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1163            (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1164          ptr++;
1165    
1166        while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1167          {
1168          if (*ptr == 0) return -1;
1169          if (*ptr == CHAR_BACKSLASH)
1170            {
1171            if (*(++ptr) == 0) goto FAIL_EXIT;
1172            if (*ptr == CHAR_Q) for (;;)
1173              {
1174              while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1175              if (*ptr == 0) goto FAIL_EXIT;
1176              if (*(++ptr) == CHAR_E) break;
1177              }
1178            continue;
1179            }
1180          }
1181        continue;
1182        }
1183    
1184      /* Skip comments in /x mode */
1185    
1186      if (xmode && *ptr == CHAR_NUMBER_SIGN)
1187        {
1188        while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1189        if (*ptr == 0) goto FAIL_EXIT;
1190        continue;
1191        }
1192    
1193      /* Check for the special metacharacters */
1194    
1195      if (*ptr == CHAR_LEFT_PARENTHESIS)
1196        {
1197        int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1198        if (rc > 0) return rc;
1199        if (*ptr == 0) goto FAIL_EXIT;
1200        }
1201    
1202      else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1203        {
1204        if (dup_parens && *count < hwm_count) *count = hwm_count;
1205        *ptrptr = ptr;
1206        return -1;
1207        }
1208    
1209      else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1210        {
1211        if (*count > hwm_count) hwm_count = *count;
1212        *count = start_count;
1213        }
1214      }
1215    
1216    FAIL_EXIT:
1217    *ptrptr = ptr;
1218    return -1;
1219    }
1220    
1221    
1222    
1223    
1224    /*************************************************
1225    *       Find forward referenced subpattern       *
1226    *************************************************/
1227    
1228    /* This function scans along a pattern's text looking for capturing
1229    subpatterns, and counting them. If it finds a named pattern that matches the
1230    name it is given, it returns its number. Alternatively, if the name is NULL, it
1231    returns when it reaches a given numbered subpattern. This is used for forward
1232    references to subpatterns. We used to be able to start this scan from the
1233    current compiling point, using the current count value from cd->bracount, and
1234    do it all in a single loop, but the addition of the possibility of duplicate
1235    subpattern numbers means that we have to scan from the very start, in order to
1236    take account of such duplicates, and to use a recursive function to keep track
1237    of the different types of group.
1238    
1239    Arguments:
1240      cd           compile background data
1241      name         name to seek, or NULL if seeking a numbered subpattern
1242      lorn         name length, or subpattern number if name is NULL
1243      xmode        TRUE if we are in /x mode
1244    
1245    Returns:       the number of the found subpattern, or -1 if not found
1246    */
1247    
1248    static int
1249    find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1250    {
1251    uschar *ptr = (uschar *)cd->start_pattern;
1252    int count = 0;
1253    int rc;
1254    
1255    /* If the pattern does not start with an opening parenthesis, the first call
1256    to find_parens_sub() will scan right to the end (if necessary). However, if it
1257    does start with a parenthesis, find_parens_sub() will return when it hits the
1258    matching closing parens. That is why we have to have a loop. */
1259    
1260    for (;;)
1261      {
1262      rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1263      if (rc > 0 || *ptr++ == 0) break;
1264      }
1265    
1266    return rc;
1267    }
1268    
1269    
1270    
1271    
1272    /*************************************************
1273  *      Find first significant op code            *  *      Find first significant op code            *
1274  *************************************************/  *************************************************/
1275    
# Line 811  for (;;) Line 1318  for (;;)
1318    
1319      case OP_CALLOUT:      case OP_CALLOUT:
1320      case OP_CREF:      case OP_CREF:
1321      case OP_BRANUMBER:      case OP_NCREF:
1322        case OP_RREF:
1323        case OP_NRREF:
1324        case OP_DEF:
1325      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1326      break;      break;
1327    
# Line 826  for (;;) Line 1336  for (;;)
1336    
1337    
1338  /*************************************************  /*************************************************
1339  *        Find the fixed length of a pattern      *  *        Find the fixed length of a branch       *
1340  *************************************************/  *************************************************/
1341    
1342  /* Scan a pattern and compute the fixed length of subject that will match it,  /* Scan a branch and compute the fixed length of subject that will match it,
1343  if the length is fixed. This is needed for dealing with backward assertions.  if the length is fixed. This is needed for dealing with backward assertions.
1344  In UTF8 mode, the result is in characters rather than bytes.  In UTF8 mode, the result is in characters rather than bytes. The branch is
1345    temporarily terminated with OP_END when this function is called.
1346    
1347    This function is called when a backward assertion is encountered, so that if it
1348    fails, the error message can point to the correct place in the pattern.
1349    However, we cannot do this when the assertion contains subroutine calls,
1350    because they can be forward references. We solve this by remembering this case
1351    and doing the check at the end; a flag specifies which mode we are running in.
1352    
1353  Arguments:  Arguments:
1354    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
1355    options  the compiling options    options  the compiling options
1356      atend    TRUE if called when the pattern is complete
1357      cd       the "compile data" structure
1358    
1359  Returns:   the fixed length, or -1 if there is no fixed length,  Returns:   the fixed length,
1360                 or -1 if there is no fixed length,
1361               or -2 if \C was encountered               or -2 if \C was encountered
1362                 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1363  */  */
1364    
1365  static int  static int
1366  find_fixedlength(uschar *code, int options)  find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1367  {  {
1368  int length = -1;  int length = -1;
1369    
# Line 855  branch, check the length against that of Line 1376  branch, check the length against that of
1376  for (;;)  for (;;)
1377    {    {
1378    int d;    int d;
1379      uschar *ce, *cs;
1380    register int op = *cc;    register int op = *cc;
   if (op >= OP_BRA) op = OP_BRA;  
   
1381    switch (op)    switch (op)
1382      {      {
1383        case OP_CBRA:
1384      case OP_BRA:      case OP_BRA:
1385      case OP_ONCE:      case OP_ONCE:
1386      case OP_COND:      case OP_COND:
1387      d = find_fixedlength(cc, options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1388      if (d < 0) return d;      if (d < 0) return d;
1389      branchlength += d;      branchlength += d;
1390      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 886  for (;;) Line 1407  for (;;)
1407      branchlength = 0;      branchlength = 0;
1408      break;      break;
1409    
1410        /* A true recursion implies not fixed length, but a subroutine call may
1411        be OK. If the subroutine is a forward reference, we can't deal with
1412        it until the end of the pattern, so return -3. */
1413    
1414        case OP_RECURSE:
1415        if (!atend) return -3;
1416        cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1417        do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */
1418        if (cc > cs && cc < ce) return -1;                /* Recursion */
1419        d = find_fixedlength(cs + 2, options, atend, cd);
1420        if (d < 0) return d;
1421        branchlength += d;
1422        cc += 1 + LINK_SIZE;
1423        break;
1424    
1425      /* Skip over assertive subpatterns */      /* Skip over assertive subpatterns */
1426    
1427      case OP_ASSERT:      case OP_ASSERT:
# Line 898  for (;;) Line 1434  for (;;)
1434      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1435    
1436      case OP_REVERSE:      case OP_REVERSE:
     case OP_BRANUMBER:  
1437      case OP_CREF:      case OP_CREF:
1438        case OP_NCREF:
1439        case OP_RREF:
1440        case OP_NRREF:
1441        case OP_DEF:
1442      case OP_OPT:      case OP_OPT:
1443      case OP_CALLOUT:      case OP_CALLOUT:
1444      case OP_SOD:      case OP_SOD:
# Line 917  for (;;) Line 1456  for (;;)
1456    
1457      case OP_CHAR:      case OP_CHAR:
1458      case OP_CHARNC:      case OP_CHARNC:
1459        case OP_NOT:
1460      branchlength++;      branchlength++;
1461      cc += 2;      cc += 2;
1462  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1463      if ((options & PCRE_UTF8) != 0)      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1464        {        cc += _pcre_utf8_table4[cc[-1] & 0x3f];
       while ((*cc & 0xc0) == 0x80) cc++;  
       }  
1465  #endif  #endif
1466      break;      break;
1467    
# Line 934  for (;;) Line 1472  for (;;)
1472      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1473      cc += 4;      cc += 4;
1474  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1475      if ((options & PCRE_UTF8) != 0)      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1476        {        cc += _pcre_utf8_table4[cc[-1] & 0x3f];
       while((*cc & 0x80) == 0x80) cc++;  
       }  
1477  #endif  #endif
1478      break;      break;
1479    
1480      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1481      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1482        if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1483      cc += 4;      cc += 4;
1484      break;      break;
1485    
# Line 960  for (;;) Line 1497  for (;;)
1497      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
1498      case OP_WORDCHAR:      case OP_WORDCHAR:
1499      case OP_ANY:      case OP_ANY:
1500        case OP_ALLANY:
1501      branchlength++;      branchlength++;
1502      cc++;      cc++;
1503      break;      break;
# Line 1014  for (;;) Line 1552  for (;;)
1552    
1553    
1554  /*************************************************  /*************************************************
1555  *    Scan compiled regex for numbered bracket    *  *    Scan compiled regex for specific bracket    *
1556  *************************************************/  *************************************************/
1557    
1558  /* This little function scans through a compiled pattern until it finds a  /* This little function scans through a compiled pattern until it finds a
1559  capturing bracket with the given number.  capturing bracket with the given number, or, if the number is negative, an
1560    instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1561    so that it can be called from pcre_study() when finding the minimum matching
1562    length.
1563    
1564  Arguments:  Arguments:
1565    code        points to start of expression    code        points to start of expression
1566    utf8        TRUE in UTF-8 mode    utf8        TRUE in UTF-8 mode
1567    number      the required bracket number    number      the required bracket number or negative to find a lookbehind
1568    
1569  Returns:      pointer to the opcode for the bracket, or NULL if not found  Returns:      pointer to the opcode for the bracket, or NULL if not found
1570  */  */
1571    
1572  static const uschar *  const uschar *
1573  find_bracket(const uschar *code, BOOL utf8, int number)  _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1574  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1575  for (;;)  for (;;)
1576    {    {
1577    register int c = *code;    register int c = *code;
1578    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1579    else if (c > OP_BRA)  
1580      /* XCLASS is used for classes that cannot be represented just by a bit
1581      map. This includes negated single high-valued characters. The length in
1582      the table is zero; the actual length is stored in the compiled code. */
1583    
1584      if (c == OP_XCLASS) code += GET(code, 1);
1585    
1586      /* Handle lookbehind (OP_REVERSE), which is what a negative number seeks */
1587    
1588      else if (c == OP_REVERSE)
1589        {
1590        if (number < 0) return (uschar *)code;
1591        code += _pcre_OP_lengths[c];
1592        }
1593    
1594      /* Handle capturing bracket */
1595    
1596      else if (c == OP_CBRA)
1597      {      {
1598      int n = c - OP_BRA;      int n = GET2(code, 1+LINK_SIZE);
     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);  
1599      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1600      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[c];
1601      }      }
1602    
1603      /* Otherwise, we can get the item's length from the table, except that for
1604      repeated character types, we have to test for \p and \P, which have an extra
1605      two bytes of parameters. */
1606    
1607    else    else
1608      {      {
1609      code += _pcre_OP_lengths[c];      switch(c)
1610          {
1611          case OP_TYPESTAR:
1612          case OP_TYPEMINSTAR:
1613          case OP_TYPEPLUS:
1614          case OP_TYPEMINPLUS:
1615          case OP_TYPEQUERY:
1616          case OP_TYPEMINQUERY:
1617          case OP_TYPEPOSSTAR:
1618          case OP_TYPEPOSPLUS:
1619          case OP_TYPEPOSQUERY:
1620          if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1621          break;
1622    
1623  #ifdef SUPPORT_UTF8        case OP_TYPEUPTO:
1624          case OP_TYPEMINUPTO:
1625          case OP_TYPEEXACT:
1626          case OP_TYPEPOSUPTO:
1627          if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1628          break;
1629          }
1630    
1631      /* In UTF-8 mode, opcodes that are followed by a character may be followed      /* Add in the fixed length from the table */
1632      by a multi-byte character. The length in the table is a minimum, so we have  
1633      to scan along to skip the extra bytes. All opcodes are less than 128, so we      code += _pcre_OP_lengths[c];
     can use relatively efficient code. */  
1634    
1635      /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1636      a multi-byte character. The length in the table is a minimum, so we have to
1637      arrange to skip the extra bytes. */
1638    
1639    #ifdef SUPPORT_UTF8
1640      if (utf8) switch(c)      if (utf8) switch(c)
1641        {        {
1642        case OP_CHAR:        case OP_CHAR:
# Line 1064  for (;;) Line 1644  for (;;)
1644        case OP_EXACT:        case OP_EXACT:
1645        case OP_UPTO:        case OP_UPTO:
1646        case OP_MINUPTO:        case OP_MINUPTO:
1647          case OP_POSUPTO:
1648        case OP_STAR:        case OP_STAR:
1649        case OP_MINSTAR:        case OP_MINSTAR:
1650          case OP_POSSTAR:
1651        case OP_PLUS:        case OP_PLUS:
1652        case OP_MINPLUS:        case OP_MINPLUS:
1653          case OP_POSPLUS:
1654        case OP_QUERY:        case OP_QUERY:
1655        case OP_MINQUERY:        case OP_MINQUERY:
1656        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1657        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1658        break;        break;
1659        }        }
1660    #else
1661        (void)(utf8);  /* Keep compiler happy by referencing function argument */
1662  #endif  #endif
1663      }      }
1664    }    }
# Line 1105  Returns:      pointer to the opcode for Line 1683  Returns:      pointer to the opcode for
1683  static const uschar *  static const uschar *
1684  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const uschar *code, BOOL utf8)
1685  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1686  for (;;)  for (;;)
1687    {    {
1688    register int c = *code;    register int c = *code;
1689    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1690    else if (c == OP_RECURSE) return code;    if (c == OP_RECURSE) return code;
   else if (c > OP_BRA)  
     {  
     code += _pcre_OP_lengths[OP_BRA];  
     }  
   else  
     {  
     code += _pcre_OP_lengths[c];  
1691    
1692  #ifdef SUPPORT_UTF8    /* XCLASS is used for classes that cannot be represented just by a bit
1693      map. This includes negated single high-valued characters. The length in
1694      the table is zero; the actual length is stored in the compiled code. */
1695    
1696      /* In UTF-8 mode, opcodes that are followed by a character may be followed    if (c == OP_XCLASS) code += GET(code, 1);
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
1697    
1698      if (utf8) switch(c)    /* Otherwise, we can get the item's length from the table, except that for
1699      repeated character types, we have to test for \p and \P, which have an extra
1700      two bytes of parameters. */
1701    
1702      else
1703        {
1704        switch(c)
1705          {
1706          case OP_TYPESTAR:
1707          case OP_TYPEMINSTAR:
1708          case OP_TYPEPLUS:
1709          case OP_TYPEMINPLUS:
1710          case OP_TYPEQUERY:
1711          case OP_TYPEMINQUERY:
1712          case OP_TYPEPOSSTAR:
1713          case OP_TYPEPOSPLUS:
1714          case OP_TYPEPOSQUERY:
1715          if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1716          break;
1717    
1718          case OP_TYPEPOSUPTO:
1719          case OP_TYPEUPTO:
1720          case OP_TYPEMINUPTO:
1721          case OP_TYPEEXACT:
1722          if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1723          break;
1724          }
1725    
1726        /* Add in the fixed length from the table */
1727    
1728        code += _pcre_OP_lengths[c];
1729    
1730        /* In UTF-8 mode, opcodes that are followed by a character may be followed
1731        by a multi-byte character. The length in the table is a minimum, so we have
1732        to arrange to skip the extra bytes. */
1733    
1734    #ifdef SUPPORT_UTF8
1735        if (utf8) switch(c)
1736        {        {
1737        case OP_CHAR:        case OP_CHAR:
1738        case OP_CHARNC:        case OP_CHARNC:
1739        case OP_EXACT:        case OP_EXACT:
1740        case OP_UPTO:        case OP_UPTO:
1741        case OP_MINUPTO:        case OP_MINUPTO:
1742          case OP_POSUPTO:
1743        case OP_STAR:        case OP_STAR:
1744        case OP_MINSTAR:        case OP_MINSTAR:
1745          case OP_POSSTAR:
1746        case OP_PLUS:        case OP_PLUS:
1747        case OP_MINPLUS:        case OP_MINPLUS:
1748          case OP_POSPLUS:
1749        case OP_QUERY:        case OP_QUERY:
1750        case OP_MINQUERY:        case OP_MINQUERY:
1751        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1752        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1753        break;        break;
1754        }        }
1755    #else
1756        (void)(utf8);  /* Keep compiler happy by referencing function argument */
1757  #endif  #endif
1758      }      }
1759    }    }
# Line 1165  for (;;) Line 1766  for (;;)
1766  *************************************************/  *************************************************/
1767    
1768  /* This function scans through a branch of a compiled pattern to see whether it  /* This function scans through a branch of a compiled pattern to see whether it
1769  can match the empty string or not. It is called only from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1770  below. Note that first_significant_code() skips over assertions. If we hit an  below and from compile_branch() when checking for an unlimited repeat of a
1771  unclosed bracket, we return "empty" - this means we've struck an inner bracket  group that can match nothing. Note that first_significant_code() skips over
1772  whose current branch will already have been scanned.  backward and negative forward assertions when its final argument is TRUE. If we
1773    hit an unclosed bracket, we return "empty" - this means we've struck an inner
1774    bracket whose current branch will already have been scanned.
1775    
1776  Arguments:  Arguments:
1777    code        points to start of search    code        points to start of search
# Line 1182  static BOOL Line 1785  static BOOL
1785  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1786  {  {
1787  register int c;  register int c;
1788  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1789       code < endcode;       code < endcode;
1790       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1791    {    {
# Line 1190  for (code = first_significant_code(code Line 1793  for (code = first_significant_code(code
1793    
1794    c = *code;    c = *code;
1795    
1796    if (c >= OP_BRA)    /* Skip over forward assertions; the other assertions are skipped by
1797      first_significant_code() with a TRUE final argument. */
1798    
1799      if (c == OP_ASSERT)
1800        {
1801        do code += GET(code, 1); while (*code == OP_ALT);
1802        c = *code;
1803        continue;
1804        }
1805    
1806      /* Groups with zero repeats can of course be empty; skip them. */
1807    
1808      if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1809        {
1810        code += _pcre_OP_lengths[c];
1811        do code += GET(code, 1); while (*code == OP_ALT);
1812        c = *code;
1813        continue;
1814        }
1815    
1816      /* For other groups, scan the branches. */
1817    
1818      if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1819      {      {
1820      BOOL empty_branch;      BOOL empty_branch;
1821      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1822    
1823      /* Scan a closed bracket */      /* If a conditional group has only one branch, there is a second, implied,
1824        empty branch, so just skip over the conditional, because it could be empty.
1825        Otherwise, scan the individual branches of the group. */
1826    
1827      empty_branch = FALSE;      if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
     do  
       {  
       if (!empty_branch && could_be_empty_branch(code, endcode, utf8))  
         empty_branch = TRUE;  
1828        code += GET(code, 1);        code += GET(code, 1);
1829        else
1830          {
1831          empty_branch = FALSE;
1832          do
1833            {
1834            if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1835              empty_branch = TRUE;
1836            code += GET(code, 1);
1837            }
1838          while (*code == OP_ALT);
1839          if (!empty_branch) return FALSE;   /* All branches are non-empty */
1840        }        }
1841      while (*code == OP_ALT);  
     if (!empty_branch) return FALSE;   /* All branches are non-empty */  
     code += 1 + LINK_SIZE;  
1842      c = *code;      c = *code;
1843        continue;
1844      }      }
1845    
1846    else switch (c)    /* Handle the other opcodes */
1847    
1848      switch (c)
1849      {      {
1850      /* Check for quantifiers after a class */      /* Check for quantifiers after a class. XCLASS is used for classes that
1851        cannot be represented just by a bit map. This includes negated single
1852        high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1853        actual length is stored in the compiled code, so we must update "code"
1854        here. */
1855    
1856  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1857      case OP_XCLASS:      case OP_XCLASS:
1858      ccode = code + GET(code, 1);      ccode = code += GET(code, 1);
1859      goto CHECK_CLASS_REPEAT;      goto CHECK_CLASS_REPEAT;
1860  #endif  #endif
1861    
# Line 1260  for (code = first_significant_code(code Line 1899  for (code = first_significant_code(code
1899      case OP_NOT_WORDCHAR:      case OP_NOT_WORDCHAR:
1900      case OP_WORDCHAR:      case OP_WORDCHAR:
1901      case OP_ANY:      case OP_ANY:
1902        case OP_ALLANY:
1903      case OP_ANYBYTE:      case OP_ANYBYTE:
1904      case OP_CHAR:      case OP_CHAR:
1905      case OP_CHARNC:      case OP_CHARNC:
1906      case OP_NOT:      case OP_NOT:
1907      case OP_PLUS:      case OP_PLUS:
1908      case OP_MINPLUS:      case OP_MINPLUS:
1909        case OP_POSPLUS:
1910      case OP_EXACT:      case OP_EXACT:
1911      case OP_NOTPLUS:      case OP_NOTPLUS:
1912      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
1913        case OP_NOTPOSPLUS:
1914      case OP_NOTEXACT:      case OP_NOTEXACT:
1915      case OP_TYPEPLUS:      case OP_TYPEPLUS:
1916      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
1917        case OP_TYPEPOSPLUS:
1918      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1919      return FALSE;      return FALSE;
1920    
1921        /* These are going to continue, as they may be empty, but we have to
1922        fudge the length for the \p and \P cases. */
1923    
1924        case OP_TYPESTAR:
1925        case OP_TYPEMINSTAR:
1926        case OP_TYPEPOSSTAR:
1927        case OP_TYPEQUERY:
1928        case OP_TYPEMINQUERY:
1929        case OP_TYPEPOSQUERY:
1930        if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1931        break;
1932    
1933        /* Same for these */
1934    
1935        case OP_TYPEUPTO:
1936        case OP_TYPEMINUPTO:
1937        case OP_TYPEPOSUPTO:
1938        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1939        break;
1940    
1941      /* End of branch */      /* End of branch */
1942    
1943      case OP_KET:      case OP_KET:
# Line 1283  for (code = first_significant_code(code Line 1946  for (code = first_significant_code(code
1946      case OP_ALT:      case OP_ALT:
1947      return TRUE;      return TRUE;
1948    
1949      /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1950      followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
1951    
1952  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1953      case OP_STAR:      case OP_STAR:
1954      case OP_MINSTAR:      case OP_MINSTAR:
1955        case OP_POSSTAR:
1956      case OP_QUERY:      case OP_QUERY:
1957      case OP_MINQUERY:      case OP_MINQUERY:
1958        case OP_POSQUERY:
1959        if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
1960        break;
1961    
1962      case OP_UPTO:      case OP_UPTO:
1963      case OP_MINUPTO:      case OP_MINUPTO:
1964      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      case OP_POSUPTO:
1965        if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
1966      break;      break;
1967  #endif  #endif
1968      }      }
# Line 1326  static BOOL Line 1995  static BOOL
1995  could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,  could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1996    BOOL utf8)    BOOL utf8)
1997  {  {
1998  while (bcptr != NULL && bcptr->current >= code)  while (bcptr != NULL && bcptr->current_branch >= code)
1999    {    {
2000    if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;    if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8))
2001        return FALSE;
2002    bcptr = bcptr->outer;    bcptr = bcptr->outer;
2003    }    }
2004  return TRUE;  return TRUE;
# Line 1341  return TRUE; Line 2011  return TRUE;
2011  *************************************************/  *************************************************/
2012    
2013  /* This function is called when the sequence "[:" or "[." or "[=" is  /* This function is called when the sequence "[:" or "[." or "[=" is
2014  encountered in a character class. It checks whether this is followed by an  encountered in a character class. It checks whether this is followed by a
2015  optional ^ and then a sequence of letters, terminated by a matching ":]" or  sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2016  ".]" or "=]".  reach an unescaped ']' without the special preceding character, return FALSE.
2017    
2018    Originally, this function only recognized a sequence of letters between the
2019    terminators, but it seems that Perl recognizes any sequence of characters,
2020    though of course unknown POSIX names are subsequently rejected. Perl gives an
2021    "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2022    didn't consider this to be a POSIX class. Likewise for [:1234:].
2023    
2024    The problem in trying to be exactly like Perl is in the handling of escapes. We
2025    have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2026    class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2027    below handles the special case of \], but does not try to do any other escape
2028    processing. This makes it different from Perl for cases such as [:l\ower:]
2029    where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2030    "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2031    I think.
2032    
2033  Argument:  Arguments:
2034    ptr      pointer to the initial [    ptr      pointer to the initial [
2035    endptr   where to return the end pointer    endptr   where to return the end pointer
   cd       pointer to compile data  
2036    
2037  Returns:   TRUE or FALSE  Returns:   TRUE or FALSE
2038  */  */
2039    
2040  static BOOL  static BOOL
2041  check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)  check_posix_syntax(const uschar *ptr, const uschar **endptr)
2042  {  {
2043  int terminator;          /* Don't combine these lines; the Solaris cc */  int terminator;          /* Don't combine these lines; the Solaris cc */
2044  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */  terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
2045  if (*(++ptr) == '^') ptr++;  for (++ptr; *ptr != 0; ptr++)
 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;  
 if (*ptr == terminator && ptr[1] == ']')  
2046    {    {
2047    *endptr = ptr;    if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2048    return TRUE;      {
2049        if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2050        if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2051          {
2052          *endptr = ptr;
2053          return TRUE;
2054          }
2055        }
2056    }    }
2057  return FALSE;  return FALSE;
2058  }  }
# Line 1388  Returns:     a value representing the na Line 2077  Returns:     a value representing the na
2077  static int  static int
2078  check_posix_name(const uschar *ptr, int len)  check_posix_name(const uschar *ptr, int len)
2079  {  {
2080    const char *pn = posix_names;
2081  register int yield = 0;  register int yield = 0;
2082  while (posix_name_lengths[yield] != 0)  while (posix_name_lengths[yield] != 0)
2083    {    {
2084    if (len == posix_name_lengths[yield] &&    if (len == posix_name_lengths[yield] &&
2085      strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;      strncmp((const char *)ptr, pn, len) == 0) return yield;
2086      pn += posix_name_lengths[yield] + 1;
2087    yield++;    yield++;
2088    }    }
2089  return -1;  return -1;
# Line 1407  return -1; Line 2098  return -1;
2098  that is referenced. This means that groups can be replicated for fixed  that is referenced. This means that groups can be replicated for fixed
2099  repetition simply by copying (because the recursion is allowed to refer to  repetition simply by copying (because the recursion is allowed to refer to
2100  earlier groups that are outside the current group). However, when a group is  earlier groups that are outside the current group). However, when a group is
2101  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2102  it, after it has been compiled. This means that any OP_RECURSE items within it  inserted before it, after it has been compiled. This means that any OP_RECURSE
2103  that refer to the group itself or any contained groups have to have their  items within it that refer to the group itself or any contained groups have to
2104  offsets adjusted. That is the job of this function. Before it is called, the  have their offsets adjusted. That is one of the jobs of this function. Before it
2105  partially compiled regex must be temporarily terminated with OP_END.  is called, the partially compiled regex must be temporarily terminated with
2106    OP_END.
2107    
2108    This function has been extended with the possibility of forward references for
2109    recursions and subroutine calls. It must also check the list of such references
2110    for the group we are dealing with. If it finds that one of the recursions in
2111    the current group is on this list, it adjusts the offset in the list, not the
2112    value in the reference (which is a group number).
2113    
2114  Arguments:  Arguments:
2115    group      points to the start of the group    group      points to the start of the group
2116    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
2117    utf8       TRUE in UTF-8 mode    utf8       TRUE in UTF-8 mode
2118    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
2119      save_hwm   the hwm forward reference pointer at the start of the group
2120    
2121  Returns:     nothing  Returns:     nothing
2122  */  */
2123    
2124  static void  static void
2125  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2126      uschar *save_hwm)
2127  {  {
2128  uschar *ptr = group;  uschar *ptr = group;
2129    
2130  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2131    {    {
2132    int offset = GET(ptr, 1);    int offset;
2133    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);    uschar *hc;
2134    
2135      /* See if this recursion is on the forward reference list. If so, adjust the
2136      reference. */
2137    
2138      for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2139        {
2140        offset = GET(hc, 0);
2141        if (cd->start_code + offset == ptr + 1)
2142          {
2143          PUT(hc, 0, offset + adjust);
2144          break;
2145          }
2146        }
2147    
2148      /* Otherwise, adjust the recursion offset if it's after the start of this
2149      group. */
2150    
2151      if (hc >= cd->hwm)
2152        {
2153        offset = GET(ptr, 1);
2154        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2155        }
2156    
2157    ptr += 1 + LINK_SIZE;    ptr += 1 + LINK_SIZE;
2158    }    }
2159  }  }
# Line 1508  Yield:        TRUE when range returned; Line 2232  Yield:        TRUE when range returned;
2232  */  */
2233    
2234  static BOOL  static BOOL
2235  get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2236      unsigned int *odptr)
2237  {  {
2238  int c, othercase, next;  unsigned int c, othercase, next;
2239    
2240  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
2241    { if ((othercase = _pcre_ucp_othercase(c)) >= 0) break; }    { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2242    
2243  if (c > d) return FALSE;  if (c > d) return FALSE;
2244    
# Line 1522  next = othercase + 1; Line 2247  next = othercase + 1;
2247    
2248  for (++c; c <= d; c++)  for (++c; c <= d; c++)
2249    {    {
2250    if (_pcre_ucp_othercase(c) != next) break;    if (UCD_OTHERCASE(c) != next) break;
2251    next++;    next++;
2252    }    }
2253    
# Line 1534  return TRUE; Line 2259  return TRUE;
2259  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2260    
2261    
2262    
2263    /*************************************************
2264    *     Check if auto-possessifying is possible    *
2265    *************************************************/
2266    
2267    /* This function is called for unlimited repeats of certain items, to see
2268    whether the next thing could possibly match the repeated item. If not, it makes
2269    sense to automatically possessify the repeated item.
2270    
2271    Arguments:
2272      op_code       the repeated op code
2273      item          data for this item, depends on the opcode
2274      utf8          TRUE in UTF-8 mode
2275      utf8_char     used for utf8 character bytes, NULL if not relevant
2276      ptr           next character in pattern
2277      options       options bits
2278      cd            contains pointers to tables etc.
2279    
2280    Returns:        TRUE if possessifying is wanted
2281    */
2282    
2283    static BOOL
2284    check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2285      const uschar *ptr, int options, compile_data *cd)
2286    {
2287    int next;
2288    
2289    /* Skip whitespace and comments in extended mode */
2290    
2291    if ((options & PCRE_EXTENDED) != 0)
2292      {
2293      for (;;)
2294        {
2295        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2296        if (*ptr == CHAR_NUMBER_SIGN)
2297          {
2298          while (*(++ptr) != 0)
2299            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2300          }
2301        else break;
2302        }
2303      }
2304    
2305    /* If the next item is one that we can handle, get its value. A non-negative
2306    value is a character, a negative value is an escape value. */
2307    
2308    if (*ptr == CHAR_BACKSLASH)
2309      {
2310      int temperrorcode = 0;
2311      next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2312      if (temperrorcode != 0) return FALSE;
2313      ptr++;    /* Point after the escape sequence */
2314      }
2315    
2316    else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2317      {
2318    #ifdef SUPPORT_UTF8
2319      if (utf8) { GETCHARINC(next, ptr); } else
2320    #endif
2321      next = *ptr++;
2322      }
2323    
2324    else return FALSE;
2325    
2326    /* Skip whitespace and comments in extended mode */
2327    
2328    if ((options & PCRE_EXTENDED) != 0)
2329      {
2330      for (;;)
2331        {
2332        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2333        if (*ptr == CHAR_NUMBER_SIGN)
2334          {
2335          while (*(++ptr) != 0)
2336            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2337          }
2338        else break;
2339        }
2340      }
2341    
2342    /* If the next thing is itself optional, we have to give up. */
2343    
2344    if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2345      strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2346        return FALSE;
2347    
2348    /* Now compare the next item with the previous opcode. If the previous is a
2349    positive single character match, "item" either contains the character or, if
2350    "item" is greater than 127 in utf8 mode, the character's bytes are in
2351    utf8_char. */
2352    
2353    
2354    /* Handle cases when the next item is a character. */
2355    
2356    if (next >= 0) switch(op_code)
2357      {
2358      case OP_CHAR:
2359    #ifdef SUPPORT_UTF8
2360      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2361    #else
2362      (void)(utf8_char);  /* Keep compiler happy by referencing function argument */
2363    #endif
2364      return item != next;
2365    
2366      /* For CHARNC (caseless character) we must check the other case. If we have
2367      Unicode property support, we can use it to test the other case of
2368      high-valued characters. */
2369    
2370      case OP_CHARNC:
2371    #ifdef SUPPORT_UTF8
2372      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2373    #endif
2374      if (item == next) return FALSE;
2375    #ifdef SUPPORT_UTF8
2376      if (utf8)
2377        {
2378        unsigned int othercase;
2379        if (next < 128) othercase = cd->fcc[next]; else
2380    #ifdef SUPPORT_UCP
2381        othercase = UCD_OTHERCASE((unsigned int)next);
2382    #else
2383        othercase = NOTACHAR;
2384    #endif
2385        return (unsigned int)item != othercase;
2386        }
2387      else
2388    #endif  /* SUPPORT_UTF8 */
2389      return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
2390    
2391      /* For OP_NOT, "item" must be a single-byte character. */
2392    
2393      case OP_NOT:
2394      if (item == next) return TRUE;
2395      if ((options & PCRE_CASELESS) == 0) return FALSE;
2396    #ifdef SUPPORT_UTF8
2397      if (utf8)
2398        {
2399        unsigned int othercase;
2400        if (next < 128) othercase = cd->fcc[next]; else
2401    #ifdef SUPPORT_UCP
2402        othercase = UCD_OTHERCASE(next);
2403    #else
2404        othercase = NOTACHAR;
2405    #endif
2406        return (unsigned int)item == othercase;
2407        }
2408      else
2409    #endif  /* SUPPORT_UTF8 */
2410      return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
2411    
2412      case OP_DIGIT:
2413      return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2414    
2415      case OP_NOT_DIGIT:
2416      return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2417    
2418      case OP_WHITESPACE:
2419      return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2420    
2421      case OP_NOT_WHITESPACE:
2422      return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2423    
2424      case OP_WORDCHAR:
2425      return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2426    
2427      case OP_NOT_WORDCHAR:
2428      return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2429    
2430      case OP_HSPACE:
2431      case OP_NOT_HSPACE:
2432      switch(next)
2433        {
2434        case 0x09:
2435        case 0x20:
2436        case 0xa0:
2437        case 0x1680:
2438        case 0x180e:
2439        case 0x2000:
2440        case 0x2001:
2441        case 0x2002:
2442        case 0x2003:
2443        case 0x2004:
2444        case 0x2005:
2445        case 0x2006:
2446        case 0x2007:
2447        case 0x2008:
2448        case 0x2009:
2449        case 0x200A:
2450        case 0x202f:
2451        case 0x205f:
2452        case 0x3000:
2453        return op_code != OP_HSPACE;
2454        default:
2455        return op_code == OP_HSPACE;
2456        }
2457    
2458      case OP_VSPACE:
2459      case OP_NOT_VSPACE:
2460      switch(next)
2461        {
2462        case 0x0a:
2463        case 0x0b:
2464        case 0x0c:
2465        case 0x0d:
2466        case 0x85:
2467        case 0x2028:
2468        case 0x2029:
2469        return op_code != OP_VSPACE;
2470        default:
2471        return op_code == OP_VSPACE;
2472        }
2473    
2474      default:
2475      return FALSE;
2476      }
2477    
2478    
2479    /* Handle the case when the next item is \d, \s, etc. */
2480    
2481    switch(op_code)
2482      {
2483      case OP_CHAR:
2484      case OP_CHARNC:
2485    #ifdef SUPPORT_UTF8
2486      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2487    #endif
2488      switch(-next)
2489        {
2490        case ESC_d:
2491        return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2492    
2493        case ESC_D:
2494        return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2495    
2496        case ESC_s:
2497        return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2498    
2499        case ESC_S:
2500        return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2501    
2502        case ESC_w:
2503        return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2504    
2505        case ESC_W:
2506        return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2507    
2508        case ESC_h:
2509        case ESC_H:
2510        switch(item)
2511          {
2512          case 0x09:
2513          case 0x20:
2514          case 0xa0:
2515          case 0x1680:
2516          case 0x180e:
2517          case 0x2000:
2518          case 0x2001:
2519          case 0x2002:
2520          case 0x2003:
2521          case 0x2004:
2522          case 0x2005:
2523          case 0x2006:
2524          case 0x2007:
2525          case 0x2008:
2526          case 0x2009:
2527          case 0x200A:
2528          case 0x202f:
2529          case 0x205f:
2530          case 0x3000:
2531          return -next != ESC_h;
2532          default:
2533          return -next == ESC_h;
2534          }
2535    
2536        case ESC_v:
2537        case ESC_V:
2538        switch(item)
2539          {
2540          case 0x0a:
2541          case 0x0b:
2542          case 0x0c:
2543          case 0x0d:
2544          case 0x85:
2545          case 0x2028:
2546          case 0x2029:
2547          return -next != ESC_v;
2548          default:
2549          return -next == ESC_v;
2550          }
2551    
2552        default:
2553        return FALSE;
2554        }
2555    
2556      case OP_DIGIT:
2557      return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2558             next == -ESC_h || next == -ESC_v;
2559    
2560      case OP_NOT_DIGIT:
2561      return next == -ESC_d;
2562    
2563      case OP_WHITESPACE:
2564      return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2565    
2566      case OP_NOT_WHITESPACE:
2567      return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2568    
2569      case OP_HSPACE:
2570      return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2571    
2572      case OP_NOT_HSPACE:
2573      return next == -ESC_h;
2574    
2575      /* Can't have \S in here because VT matches \S (Perl anomaly) */
2576      case OP_VSPACE:
2577      return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2578    
2579      case OP_NOT_VSPACE:
2580      return next == -ESC_v;
2581    
2582      case OP_WORDCHAR:
2583      return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2584    
2585      case OP_NOT_WORDCHAR:
2586      return next == -ESC_w || next == -ESC_d;
2587    
2588      default:
2589      return FALSE;
2590      }
2591    
2592    /* Control does not reach here */
2593    }
2594    
2595    
2596    
2597  /*************************************************  /*************************************************
2598  *           Compile one branch                   *  *           Compile one branch                   *
2599  *************************************************/  *************************************************/
2600    
2601  /* Scan the pattern, compiling it into the code vector. If the options are  /* Scan the pattern, compiling it into the code vector. If the options are
2602  changed during the branch, the pointer is used to change the external options  changed during the branch, the pointer is used to change the external options
2603  bits.  bits. This function is used during the pre-compile phase when we are trying
2604    to find out the amount of memory needed, as well as during the real compile
2605    phase. The value of lengthptr distinguishes the two phases.
2606    
2607  Arguments:  Arguments:
2608    optionsptr     pointer to the option bits    optionsptr     pointer to the option bits
   brackets       points to number of extracting brackets used  
2609    codeptr        points to the pointer to the current code point    codeptr        points to the pointer to the current code point
2610    ptrptr         points to the current pattern pointer    ptrptr         points to the current pattern pointer
2611    errorcodeptr   points to error code variable    errorcodeptr   points to error code variable
# Line 1552  Arguments: Line 2613  Arguments:
2613    reqbyteptr     set to the last literal character required, else < 0    reqbyteptr     set to the last literal character required, else < 0
2614    bcptr          points to current branch chain    bcptr          points to current branch chain
2615    cd             contains pointers to tables etc.    cd             contains pointers to tables etc.
2616      lengthptr      NULL during the real compile phase
2617                     points to length accumulator during pre-compile phase
2618    
2619  Returns:         TRUE on success  Returns:         TRUE on success
2620                   FALSE, with *errorcodeptr set non-zero on error                   FALSE, with *errorcodeptr set non-zero on error
2621  */  */
2622    
2623  static BOOL  static BOOL
2624  compile_branch(int *optionsptr, int *brackets, uschar **codeptr,  compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2625    const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,    int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2626    int *reqbyteptr, branch_chain *bcptr, compile_data *cd)    compile_data *cd, int *lengthptr)
2627  {  {
2628  int repeat_type, op_type;  int repeat_type, op_type;
2629  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
# Line 1569  int greedy_default, greedy_non_default; Line 2632  int greedy_default, greedy_non_default;
2632  int firstbyte, reqbyte;  int firstbyte, reqbyte;
2633  int zeroreqbyte, zerofirstbyte;  int zeroreqbyte, zerofirstbyte;
2634  int req_caseopt, reqvary, tempreqvary;  int req_caseopt, reqvary, tempreqvary;
 int condcount = 0;  
2635  int options = *optionsptr;  int options = *optionsptr;
2636  int after_manual_callout = 0;  int after_manual_callout = 0;
2637    int length_prevgroup = 0;
2638  register int c;  register int c;
2639  register uschar *code = *codeptr;  register uschar *code = *codeptr;
2640    uschar *last_code = code;
2641    uschar *orig_code = code;
2642  uschar *tempcode;  uschar *tempcode;
2643  BOOL inescq = FALSE;  BOOL inescq = FALSE;
2644  BOOL groupsetfirstbyte = FALSE;  BOOL groupsetfirstbyte = FALSE;
# Line 1581  const uschar *ptr = *ptrptr; Line 2646  const uschar *ptr = *ptrptr;
2646  const uschar *tempptr;  const uschar *tempptr;
2647  uschar *previous = NULL;  uschar *previous = NULL;
2648  uschar *previous_callout = NULL;  uschar *previous_callout = NULL;
2649    uschar *save_hwm = NULL;
2650  uschar classbits[32];  uschar classbits[32];
2651    
2652  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2653  BOOL class_utf8;  BOOL class_utf8;
2654  BOOL utf8 = (options & PCRE_UTF8) != 0;  BOOL utf8 = (options & PCRE_UTF8) != 0;
2655  uschar *class_utf8data;  uschar *class_utf8data;
2656    uschar *class_utf8data_base;
2657  uschar utf8_char[6];  uschar utf8_char[6];
2658  #else  #else
2659  BOOL utf8 = FALSE;  BOOL utf8 = FALSE;
2660    uschar *utf8_char = NULL;
2661    #endif
2662    
2663    #ifdef PCRE_DEBUG
2664    if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2665  #endif  #endif
2666    
2667  /* Set up the default and non-default settings for greediness */  /* Set up the default and non-default settings for greediness */
# Line 1621  req_caseopt = ((options & PCRE_CASELESS) Line 2693  req_caseopt = ((options & PCRE_CASELESS)
2693  for (;; ptr++)  for (;; ptr++)
2694    {    {
2695    BOOL negate_class;    BOOL negate_class;
2696      BOOL should_flip_negation;
2697    BOOL possessive_quantifier;    BOOL possessive_quantifier;
2698    BOOL is_quantifier;    BOOL is_quantifier;
2699      BOOL is_recurse;
2700      BOOL reset_bracount;
2701    int class_charcount;    int class_charcount;
2702    int class_lastchar;    int class_lastchar;
2703    int newoptions;    int newoptions;
2704    int recno;    int recno;
2705      int refsign;
2706    int skipbytes;    int skipbytes;
2707    int subreqbyte;    int subreqbyte;
2708    int subfirstbyte;    int subfirstbyte;
2709      int terminator;
2710    int mclength;    int mclength;
2711    uschar mcbuffer[8];    uschar mcbuffer[8];
2712    
2713    /* Next byte in the pattern */    /* Get next byte in the pattern */
2714    
2715    c = *ptr;    c = *ptr;
2716    
2717      /* If we are in the pre-compile phase, accumulate the length used for the
2718      previous cycle of this loop. */
2719    
2720      if (lengthptr != NULL)
2721        {
2722    #ifdef PCRE_DEBUG
2723        if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2724    #endif
2725        if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2726          {
2727          *errorcodeptr = ERR52;
2728          goto FAILED;
2729          }
2730    
2731        /* There is at least one situation where code goes backwards: this is the
2732        case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2733        the class is simply eliminated. However, it is created first, so we have to
2734        allow memory for it. Therefore, don't ever reduce the length at this point.
2735        */
2736    
2737        if (code < last_code) code = last_code;
2738    
2739        /* Paranoid check for integer overflow */
2740    
2741        if (OFLOW_MAX - *lengthptr < code - last_code)
2742          {
2743          *errorcodeptr = ERR20;
2744          goto FAILED;
2745          }
2746    
2747        *lengthptr += code - last_code;
2748        DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2749    
2750        /* If "previous" is set and it is not at the start of the work space, move
2751        it back to there, in order to avoid filling up the work space. Otherwise,
2752        if "previous" is NULL, reset the current code pointer to the start. */
2753    
2754        if (previous != NULL)
2755          {
2756          if (previous > orig_code)
2757            {
2758            memmove(orig_code, previous, code - previous);
2759            code -= previous - orig_code;
2760            previous = orig_code;
2761            }
2762          }
2763        else code = orig_code;
2764    
2765        /* Remember where this code item starts so we can pick up the length
2766        next time round. */
2767    
2768        last_code = code;
2769        }
2770    
2771      /* In the real compile phase, just check the workspace used by the forward
2772      reference list. */
2773    
2774      else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2775        {
2776        *errorcodeptr = ERR52;
2777        goto FAILED;
2778        }
2779    
2780    /* If in \Q...\E, check for the end; if not, we have a literal */    /* If in \Q...\E, check for the end; if not, we have a literal */
2781    
2782    if (inescq && c != 0)    if (inescq && c != 0)
2783      {      {
2784      if (c == '\\' && ptr[1] == 'E')      if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
2785        {        {
2786        inescq = FALSE;        inescq = FALSE;
2787        ptr++;        ptr++;
# Line 1651  for (;; ptr++) Line 2791  for (;; ptr++)
2791        {        {
2792        if (previous_callout != NULL)        if (previous_callout != NULL)
2793          {          {
2794          complete_callout(previous_callout, ptr, cd);          if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2795              complete_callout(previous_callout, ptr, cd);
2796          previous_callout = NULL;          previous_callout = NULL;
2797          }          }
2798        if ((options & PCRE_AUTO_CALLOUT) != 0)        if ((options & PCRE_AUTO_CALLOUT) != 0)
# Line 1666  for (;; ptr++) Line 2807  for (;; ptr++)
2807    /* Fill in length of a previous callout, except when the next thing is    /* Fill in length of a previous callout, except when the next thing is
2808    a quantifier. */    a quantifier. */
2809    
2810    is_quantifier = c == '*' || c == '+' || c == '?' ||    is_quantifier =
2811      (c == '{' && is_counted_repeat(ptr+1));      c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2812        (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2813    
2814    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2815         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
2816      {      {
2817      complete_callout(previous_callout, ptr, cd);      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2818          complete_callout(previous_callout, ptr, cd);
2819      previous_callout = NULL;      previous_callout = NULL;
2820      }      }
2821    
# Line 1681  for (;; ptr++) Line 2824  for (;; ptr++)
2824    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
2825      {      {
2826      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2827      if (c == '#')      if (c == CHAR_NUMBER_SIGN)
2828        {        {
2829        /* The space before the ; is to avoid a warning on a silly compiler        while (*(++ptr) != 0)
2830        on the Macintosh. */          {
2831        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2832        if (c != 0) continue;   /* Else fall through to handle end of string */          }
2833          if (*ptr != 0) continue;
2834    
2835          /* Else fall through to handle end of string */
2836          c = 0;
2837        }        }
2838      }      }
2839    
# Line 1700  for (;; ptr++) Line 2847  for (;; ptr++)
2847    
2848    switch(c)    switch(c)
2849      {      {
2850      /* The branch terminates at end of string, |, or ). */      /* ===================================================================*/
2851        case 0:                        /* The branch terminates at string end */
2852      case 0:      case CHAR_VERTICAL_LINE:       /* or | or ) */
2853      case '|':      case CHAR_RIGHT_PARENTHESIS:
     case ')':  
2854      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2855      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2856      *codeptr = code;      *codeptr = code;
2857      *ptrptr = ptr;      *ptrptr = ptr;
2858        if (lengthptr != NULL)
2859          {
2860          if (OFLOW_MAX - *lengthptr < code - last_code)
2861            {
2862            *errorcodeptr = ERR20;
2863            goto FAILED;
2864            }
2865          *lengthptr += code - last_code;   /* To include callout length */
2866          DPRINTF((">> end branch\n"));
2867          }
2868      return TRUE;      return TRUE;
2869    
2870    
2871        /* ===================================================================*/
2872      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2873      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2874    
2875      case '^':      case CHAR_CIRCUMFLEX_ACCENT:
2876      if ((options & PCRE_MULTILINE) != 0)      if ((options & PCRE_MULTILINE) != 0)
2877        {        {
2878        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
# Line 1723  for (;; ptr++) Line 2881  for (;; ptr++)
2881      *code++ = OP_CIRC;      *code++ = OP_CIRC;
2882      break;      break;
2883    
2884      case '$':      case CHAR_DOLLAR_SIGN:
2885      previous = NULL;      previous = NULL;
2886      *code++ = OP_DOLL;      *code++ = OP_DOLL;
2887      break;      break;
# Line 1731  for (;; ptr++) Line 2889  for (;; ptr++)
2889      /* There can never be a first char if '.' is first, whatever happens about      /* There can never be a first char if '.' is first, whatever happens about
2890      repeats. The value of reqbyte doesn't change either. */      repeats. The value of reqbyte doesn't change either. */
2891    
2892      case '.':      case CHAR_DOT:
2893      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2894      zerofirstbyte = firstbyte;      zerofirstbyte = firstbyte;
2895      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
2896      previous = code;      previous = code;
2897      *code++ = OP_ANY;      *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2898      break;      break;
2899    
2900    
2901        /* ===================================================================*/
2902      /* Character classes. If the included characters are all < 256, we build a      /* Character classes. If the included characters are all < 256, we build a
2903      32-byte bitmap of the permitted characters, except in the special case      32-byte bitmap of the permitted characters, except in the special case
2904      where there is only one such character. For negated classes, we build the      where there is only one such character. For negated classes, we build the
# Line 1749  for (;; ptr++) Line 2909  for (;; ptr++)
2909      opcode is compiled. It may optionally have a bit map for characters < 256,      opcode is compiled. It may optionally have a bit map for characters < 256,
2910      but those above are are explicitly listed afterwards. A flag byte tells      but those above are explicitly listed afterwards. A flag byte tells
2911      whether the bitmap is present, and whether this is a negated class or not.      whether the bitmap is present, and whether this is a negated class or not.
     */  
2912    
2913      case '[':      In JavaScript compatibility mode, an isolated ']' causes an error. In
2914        default (Perl) mode, it is treated as a data character. */
2915    
2916        case CHAR_RIGHT_SQUARE_BRACKET:
2917        if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2918          {
2919          *errorcodeptr = ERR64;
2920          goto FAILED;
2921          }
2922        goto NORMAL_CHAR;
2923    
2924        case CHAR_LEFT_SQUARE_BRACKET:
2925      previous = code;      previous = code;
2926    
2927      /* PCRE supports POSIX class stuff inside a class. Perl gives an error if      /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2928      they are encountered at the top level, so we'll do that too. */      they are encountered at the top level, so we'll do that too. */
2929    
2930      if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&      if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2931          check_posix_syntax(ptr, &tempptr, cd))           ptr[1] == CHAR_EQUALS_SIGN) &&
2932            check_posix_syntax(ptr, &tempptr))
2933        {        {
2934        *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;        *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
2935        goto FAILED;        goto FAILED;
2936        }        }
2937    
2938      /* If the first character is '^', set the negation flag and skip it. */      /* If the first character is '^', set the negation flag and skip it. Also,
2939        if the first few characters (either before or after ^) are \Q\E or \E we
2940        skip them too. This makes for compatibility with Perl. */
2941    
2942      if ((c = *(++ptr)) == '^')      negate_class = FALSE;
2943        for (;;)
2944        {        {
       negate_class = TRUE;  
2945        c = *(++ptr);        c = *(++ptr);
2946          if (c == CHAR_BACKSLASH)
2947            {
2948            if (ptr[1] == CHAR_E)
2949              ptr++;
2950            else if (strncmp((const char *)ptr+1,
2951                              STR_Q STR_BACKSLASH STR_E, 3) == 0)
2952              ptr += 3;
2953            else
2954              break;
2955            }
2956          else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
2957            negate_class = TRUE;
2958          else break;
2959        }        }
2960      else  
2961        /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2962        an initial ']' is taken as a data character -- the code below handles
2963        that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2964        [^] must match any character, so generate OP_ALLANY. */
2965    
2966        if (c == CHAR_RIGHT_SQUARE_BRACKET &&
2967            (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2968        {        {
2969        negate_class = FALSE;        *code++ = negate_class? OP_ALLANY : OP_FAIL;
2970          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2971          zerofirstbyte = firstbyte;
2972          break;
2973        }        }
2974    
2975        /* If a class contains a negative special such as \S, we need to flip the
2976        negation flag at the end, so that support for characters > 255 works
2977        correctly (they are all included in the class). */
2978    
2979        should_flip_negation = FALSE;
2980    
2981      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
2982      of just a single character (as long as it's < 256). For higher valued UTF-8  of just a single character (as long as it's < 256). However, for higher
2983      characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
2984    
2985      class_charcount = 0;      class_charcount = 0;
2986      class_lastchar = -1;      class_lastchar = -1;
2987    
2988        /* Initialize the 32-char bit map to all zeros. We build the map in a
2989        temporary bit of memory, in case the class contains only 1 character (less
2990        than 256), because in that case the compiled code doesn't use the bit map.
2991        */
2992    
2993        memset(classbits, 0, 32 * sizeof(uschar));
2994    
2995  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2996      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2997      class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2998        class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
2999  #endif  #endif
3000    
     /* Initialize the 32-char bit map to all zeros. We have to build the  
     map in a temporary bit of store, in case the class contains only 1  
     character (< 256), because in that case the compiled code doesn't use the  
     bit map. */  
   
     memset(classbits, 0, 32 * sizeof(uschar));  
   
3001      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
3002      means that an initial ] is taken as a data character. The first pass      means that an initial ] is taken as a data character. At the start of the
3003      through the regex checked the overall syntax, so we don't need to be very      loop, c contains the first byte of the character. */
     strict here. At the start of the loop, c contains the first byte of the  
     character. */  
3004    
3005      do      if (c != 0) do
3006        {        {
3007          const uschar *oldptr;
3008    
3009  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3010        if (utf8 && c > 127)        if (utf8 && c > 127)
3011          {                           /* Braces are required because the */          {                           /* Braces are required because the */
3012          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */          GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
3013          }          }
3014    
3015          /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3016          data and reset the pointer. This is so that very large classes that
3017          contain a zillion UTF-8 characters no longer overwrite the work space
3018          (which is on the stack). */
3019    
3020          if (lengthptr != NULL)
3021            {
3022            *lengthptr += class_utf8data - class_utf8data_base;
3023            class_utf8data = class_utf8data_base;
3024            }
3025    
3026  #endif  #endif
3027    
3028        /* Inside \Q...\E everything is literal except \E */        /* Inside \Q...\E everything is literal except \E */
3029    
3030        if (inescq)        if (inescq)
3031          {          {
3032          if (c == '\\' && ptr[1] == 'E')          if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
3033            {            {
3034            inescq = FALSE;            inescq = FALSE;                   /* Reset literal state */
3035            ptr++;            ptr++;                            /* Skip the 'E' */
3036            continue;            continue;                         /* Carry on with next */
3037            }            }
3038          else goto LONE_SINGLE_CHARACTER;          goto CHECK_RANGE;                   /* Could be range if \E follows */
3039          }          }
3040    
3041        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
# Line 1829  for (;; ptr++) Line 3044  for (;; ptr++)
3044        [.ch.] and [=ch=] ("collating elements") and fault them, as Perl        [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3045        5.6 and 5.8 do. */        5.6 and 5.8 do. */
3046    
3047        if (c == '[' &&        if (c == CHAR_LEFT_SQUARE_BRACKET &&
3048            (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&            (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3049            check_posix_syntax(ptr, &tempptr, cd))             ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3050          {          {
3051          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
3052          int posix_class, taboffset, tabopt;          int posix_class, taboffset, tabopt;
3053          register const uschar *cbits = cd->cbits;          register const uschar *cbits = cd->cbits;
3054          uschar pbits[32];          uschar pbits[32];
3055    
3056          if (ptr[1] != ':')          if (ptr[1] != CHAR_COLON)
3057            {            {
3058            *errorcodeptr = ERR31;            *errorcodeptr = ERR31;
3059            goto FAILED;            goto FAILED;
3060            }            }
3061    
3062          ptr += 2;          ptr += 2;
3063          if (*ptr == '^')          if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3064            {            {
3065            local_negate = TRUE;            local_negate = TRUE;
3066              should_flip_negation = TRUE;  /* Note negative special */
3067            ptr++;            ptr++;
3068            }            }
3069    
# Line 1911  for (;; ptr++) Line 3127  for (;; ptr++)
3127          }          }
3128    
3129        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
3130        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. The sequence \b is a special
3131        validity in the pre-compiling pass. The sequence \b is a special case.        case. Inside a class (and only there) it is treated as backspace.
3132        Inside a class (and only there) it is treated as backspace. Elsewhere        Elsewhere it marks a word boundary. Other escapes have preset maps ready
3133        it marks a word boundary. Other escapes have preset maps ready to        to 'or' into the one we are building. We assume they have more than one
       or into the one we are building. We assume they have more than one  
3134        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
3135    
3136        if (c == '\\')        if (c == CHAR_BACKSLASH)
3137          {          {
3138          c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3139            if (*errorcodeptr != 0) goto FAILED;
3140    
3141          if (-c == ESC_b) c = '\b';       /* \b is backslash in a class */          if (-c == ESC_b) c = CHAR_BS;       /* \b is backspace in a class */
3142          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = CHAR_X;   /* \X is literal X in a class */
3143            else if (-c == ESC_R) c = CHAR_R;   /* \R is literal R in a class */
3144          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
3145            {            {
3146            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3147              {              {
3148              ptr += 2; /* avoid empty string */              ptr += 2; /* avoid empty string */
3149              }              }
3150            else inescq = TRUE;            else inescq = TRUE;
3151            continue;            continue;
3152            }            }
3153            else if (-c == ESC_E) continue;  /* Ignore orphan \E */
3154    
3155          if (c < 0)          if (c < 0)
3156            {            {
3157            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
3158            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
3159            switch (-c)  
3160              /* Save time by not doing this in the pre-compile phase. */
3161    
3162              if (lengthptr == NULL) switch (-c)
3163              {              {
3164              case ESC_d:              case ESC_d:
3165              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3166              continue;              continue;
3167    
3168              case ESC_D:              case ESC_D:
3169                should_flip_negation = TRUE;
3170              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3171              continue;              continue;
3172    
# Line 1953  for (;; ptr++) Line 3175  for (;; ptr++)
3175              continue;              continue;
3176    
3177              case ESC_W:              case ESC_W:
3178                should_flip_negation = TRUE;
3179              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3180              continue;              continue;
3181    
# Line 1962  for (;; ptr++) Line 3185  for (;; ptr++)
3185              continue;              continue;
3186    
3187              case ESC_S:              case ESC_S:
3188                should_flip_negation = TRUE;
3189              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3190              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
3191              continue;              continue;
3192    
3193  #ifdef SUPPORT_UCP              default:    /* Not recognized; fall through */
3194              case ESC_p:              break;      /* Need "default" setting to stop compiler warning. */
3195              case ESC_P:              }
3196    
3197              /* In the pre-compile phase, just do the recognition. */
3198    
3199              else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
3200                       c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
3201    
3202              /* We need to deal with \H, \h, \V, and \v in both phases because
3203              they use extra memory. */
3204    
3205              if (-c == ESC_h)
3206                {
3207                SETBIT(classbits, 0x09); /* HT */
3208                SETBIT(classbits, 0x20); /* SPACE */
3209                SETBIT(classbits, 0xa0); /* NBSP */
3210    #ifdef SUPPORT_UTF8
3211                if (utf8)
3212                  {
3213                  class_utf8 = TRUE;
3214                  *class_utf8data++ = XCL_SINGLE;
3215                  class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3216                  *class_utf8data++ = XCL_SINGLE;
3217                  class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3218                  *class_utf8data++ = XCL_RANGE;
3219                  class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3220                  class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3221                  *class_utf8data++ = XCL_SINGLE;
3222                  class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3223                  *class_utf8data++ = XCL_SINGLE;
3224                  class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3225                  *class_utf8data++ = XCL_SINGLE;
3226                  class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3227                  }
3228    #endif
3229                continue;
3230                }
3231    
3232              if (-c == ESC_H)
3233                {
3234                for (c = 0; c < 32; c++)
3235                  {
3236                  int x = 0xff;
3237                  switch (c)
3238                    {
3239                    case 0x09/8: x ^= 1 << (0x09%8); break;
3240                    case 0x20/8: x ^= 1 << (0x20%8); break;
3241                    case 0xa0/8: x ^= 1 << (0xa0%8); break;
3242                    default: break;
3243                    }
3244                  classbits[c] |= x;
3245                  }
3246    
3247    #ifdef SUPPORT_UTF8
3248                if (utf8)
3249                {                {
               BOOL negated;  
               int pdata;  
               int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);  
               if (ptype < 0) goto FAILED;  
3250                class_utf8 = TRUE;                class_utf8 = TRUE;
3251                *class_utf8data++ = ((-c == ESC_p) != negated)?                *class_utf8data++ = XCL_RANGE;
3252                  XCL_PROP : XCL_NOTPROP;                class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3253                *class_utf8data++ = ptype;                class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3254                *class_utf8data++ = pdata;                *class_utf8data++ = XCL_RANGE;
3255                class_charcount -= 2;   /* Not a < 256 character */                class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3256                  class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3257                  *class_utf8data++ = XCL_RANGE;
3258                  class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3259                  class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3260                  *class_utf8data++ = XCL_RANGE;
3261                  class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3262                  class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3263                  *class_utf8data++ = XCL_RANGE;
3264                  class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3265                  class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3266                  *class_utf8data++ = XCL_RANGE;
3267                  class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3268                  class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3269                  *class_utf8data++ = XCL_RANGE;
3270                  class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3271                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3272                }                }
3273    #endif
3274              continue;              continue;
3275                }
3276    
3277              if (-c == ESC_v)
3278                {
3279                SETBIT(classbits, 0x0a); /* LF */
3280                SETBIT(classbits, 0x0b); /* VT */
3281                SETBIT(classbits, 0x0c); /* FF */
3282                SETBIT(classbits, 0x0d); /* CR */
3283                SETBIT(classbits, 0x85); /* NEL */
3284    #ifdef SUPPORT_UTF8
3285                if (utf8)
3286                  {
3287                  class_utf8 = TRUE;
3288                  *class_utf8data++ = XCL_RANGE;
3289                  class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3290                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3291                  }
3292  #endif  #endif
3293                continue;
3294                }
3295    
3296              /* Unrecognized escapes are faulted if PCRE is running in its            if (-c == ESC_V)
3297              strict mode. By default, for compatibility with Perl, they are              {
3298              treated as literals. */              for (c = 0; c < 32; c++)
3299                  {
3300                  int x = 0xff;
3301                  switch (c)
3302                    {
3303                    case 0x0a/8: x ^= 1 << (0x0a%8);
3304                                 x ^= 1 << (0x0b%8);
3305                                 x ^= 1 << (0x0c%8);
3306                                 x ^= 1 << (0x0d%8);
3307                                 break;
3308                    case 0x85/8: x ^= 1 << (0x85%8); break;
3309                    default: break;
3310                    }
3311                  classbits[c] |= x;
3312                  }
3313    
3314              default:  #ifdef SUPPORT_UTF8
3315              if ((options & PCRE_EXTRA) != 0)              if (utf8)
3316                {                {
3317                *errorcodeptr = ERR7;                class_utf8 = TRUE;
3318                goto FAILED;                *class_utf8data++ = XCL_RANGE;
3319                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3320                  class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3321                  *class_utf8data++ = XCL_RANGE;
3322                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3323                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3324                }                }
3325              c = *ptr;              /* The final character */  #endif
3326              class_charcount -= 2;  /* Undo the default count from above */              continue;
3327                }
3328    
3329              /* We need to deal with \P and \p in both phases. */
3330    
3331    #ifdef SUPPORT_UCP
3332              if (-c == ESC_p || -c == ESC_P)
3333                {
3334                BOOL negated;
3335                int pdata;
3336                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3337                if (ptype < 0) goto FAILED;
3338                class_utf8 = TRUE;
3339                *class_utf8data++ = ((-c == ESC_p) != negated)?
3340                  XCL_PROP : XCL_NOTPROP;
3341                *class_utf8data++ = ptype;
3342                *class_utf8data++ = pdata;
3343                class_charcount -= 2;   /* Not a < 256 character */
3344                continue;
3345              }              }
3346    #endif
3347              /* Unrecognized escapes are faulted if PCRE is running in its
3348              strict mode. By default, for compatibility with Perl, they are
3349              treated as literals. */
3350    
3351              if ((options & PCRE_EXTRA) != 0)
3352                {
3353                *errorcodeptr = ERR7;
3354                goto FAILED;
3355                }
3356    
3357              class_charcount -= 2;  /* Undo the default count from above */
3358              c = *ptr;              /* Get the final character and fall through */
3359            }            }
3360    
3361          /* Fall through if we have a single character (c >= 0). This may be          /* Fall through if we have a single character (c >= 0). This may be
3362          > 256 in UTF-8 mode. */          greater than 256 in UTF-8 mode. */
3363    
3364          }   /* End of backslash handling */          }   /* End of backslash handling */
3365    
3366        /* A single character may be followed by '-' to form a range. However,        /* A single character may be followed by '-' to form a range. However,
3367        Perl does not permit ']' to be the end of the range. A '-' character        Perl does not permit ']' to be the end of the range. A '-' character
3368        here is treated as a literal. */        at the end is treated as a literal. Perl ignores orphaned \E sequences
3369          entirely. The code for handling \Q and \E is messy. */
3370    
3371          CHECK_RANGE:
3372          while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3373            {
3374            inescq = FALSE;
3375            ptr += 2;
3376            }
3377    
3378          oldptr = ptr;
3379    
3380          /* Remember \r or \n */
3381    
3382          if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3383    
3384          /* Check for range */
3385    
3386        if (ptr[1] == '-' && ptr[2] != ']')        if (!inescq && ptr[1] == CHAR_MINUS)
3387          {          {
3388          int d;          int d;
3389          ptr += 2;          ptr += 2;
3390            while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3391    
3392            /* If we hit \Q (not followed by \E) at this point, go into escaped
3393            mode. */
3394    
3395            while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3396              {
3397              ptr += 2;
3398              if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3399                { ptr += 2; continue; }
3400              inescq = TRUE;
3401              break;
3402              }
3403    
3404            if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3405              {
3406              ptr = oldptr;
3407              goto LONE_SINGLE_CHARACTER;
3408              }
3409    
3410  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3411          if (utf8)          if (utf8)
# Line 2026  for (;; ptr++) Line 3420  for (;; ptr++)
3420          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3421          in such circumstances. */          in such circumstances. */
3422    
3423          if (d == '\\')          if (!inescq && d == CHAR_BACKSLASH)
3424            {            {
3425            const uschar *oldptr = ptr;            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3426            d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);            if (*errorcodeptr != 0) goto FAILED;
3427    
3428            /* \b is backslash; \X is literal X; any other special means the '-'            /* \b is backspace; \X is literal X; \R is literal R; any other
3429            was literal */            special means the '-' was literal */
3430    
3431            if (d < 0)            if (d < 0)
3432              {              {
3433              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = CHAR_BS;
3434              else if (d == -ESC_X) d = 'X'; else              else if (d == -ESC_X) d = CHAR_X;
3435                else if (d == -ESC_R) d = CHAR_R; else
3436                {                {
3437                ptr = oldptr - 2;                ptr = oldptr;
3438                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3439                }                }
3440              }              }
3441            }            }
3442    
3443          /* The check that the two values are in the correct order happens in          /* Check that the two values are in the correct order. Optimize
3444          the pre-pass. Optimize one-character ranges */          one-character ranges */
3445    
3446            if (d < c)
3447              {
3448              *errorcodeptr = ERR8;
3449              goto FAILED;
3450              }
3451    
3452          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3453    
3454            /* Remember \r or \n */
3455    
3456            if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3457    
3458          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless          /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3459          matching, we have to use an XCLASS with extra data items. Caseless          matching, we have to use an XCLASS with extra data items. Caseless
3460          matching for characters > 127 is available only if UCP support is          matching for characters > 127 is available only if UCP support is
# Line 2067  for (;; ptr++) Line 3472  for (;; ptr++)
3472  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3473            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
3474              {              {
3475              int occ, ocd;              unsigned int occ, ocd;
3476              int cc = c;              unsigned int cc = c;
3477              int origd = d;              unsigned int origd = d;
3478              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
3479                {                {
3480                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= (unsigned int)c &&
3481                      ocd <= (unsigned int)d)
3482                    continue;                          /* Skip embedded ranges */
3483    
3484                if (occ < c  && ocd >= c - 1)        /* Extend the basic range */                if (occ < (unsigned int)c  &&
3485                      ocd >= (unsigned int)c - 1)      /* Extend the basic range */
3486                  {                                  /* if there is overlap,   */                  {                                  /* if there is overlap,   */
3487                  c = occ;                           /* noting that if occ < c */                  c = occ;                           /* noting that if occ < c */
3488                  continue;                          /* we can't have ocd > d  */                  continue;                          /* we can't have ocd > d  */
3489                  }                                  /* because a subrange is  */                  }                                  /* because a subrange is  */
3490                if (ocd > d && occ <= d + 1)         /* always shorter than    */                if (ocd > (unsigned int)d &&
3491                      occ <= (unsigned int)d + 1)      /* always shorter than    */
3492                  {                                  /* the basic range.       */                  {                                  /* the basic range.       */
3493                  d = ocd;                  d = ocd;
3494                  continue;                  continue;
# Line 2127  for (;; ptr++) Line 3536  for (;; ptr++)
3536          ranges that lie entirely within 0-127 when there is UCP support; else          ranges that lie entirely within 0-127 when there is UCP support; else
3537          for partial ranges without UCP support. */          for partial ranges without UCP support. */
3538    
3539          for (; c <= d; c++)          class_charcount += d - c + 1;
3540            class_lastchar = d;
3541    
3542            /* We can save a bit of time by skipping this in the pre-compile. */
3543    
3544            if (lengthptr == NULL) for (; c <= d; c++)
3545            {            {
3546            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
3547            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
# Line 2135  for (;; ptr++) Line 3549  for (;; ptr++)
3549              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
3550              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
3551              }              }
           class_charcount++;                /* in case a one-char range */  
           class_lastchar = c;  
3552            }            }
3553    
3554          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
# Line 2160  for (;; ptr++) Line 3572  for (;; ptr++)
3572  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3573          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
3574            {            {
3575            int othercase;            unsigned int othercase;
3576            if ((othercase = _pcre_ucp_othercase(c)) >= 0)            if ((othercase = UCD_OTHERCASE(c)) != c)
3577              {              {
3578              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
3579              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 2186  for (;; ptr++) Line 3598  for (;; ptr++)
3598          }          }
3599        }        }
3600    
3601      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached. This "while" is the end of the "do" above. */
3602      loop. This "while" is the end of the "do" above. */  
3603        while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3604    
3605        if (c == 0)                          /* Missing terminating ']' */
3606          {
3607          *errorcodeptr = ERR6;
3608          goto FAILED;
3609          }
3610    
3611    
3612    /* This code has been disabled because it would mean that \s counts as
3613    an explicit \r or \n reference, and that's not really what is wanted. Now
3614    we set the flag only if there is a literal "\r" or "\n" in the class. */
3615    
3616    #if 0
3617        /* Remember whether \r or \n are in this class */
3618    
3619        if (negate_class)
3620          {
3621          if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3622          }
3623        else
3624          {
3625          if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3626          }
3627    #endif
3628    
     while ((c = *(++ptr)) != ']' || inescq);  
3629    
3630      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
3631      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. As long as there were no characters >= 128 and there was no
3632      can optimize the negative case only if there were no characters >= 128      use of \p or \P, in other words, no use of any XCLASS features, we can
3633      because OP_NOT and the related opcodes like OP_NOTSTAR operate on      optimize.
3634      single-bytes only. This is an historical hangover. Maybe one day we can  
3635      tidy these opcodes to handle multi-byte characters.      In UTF-8 mode, we can optimize the negative case only if there were no
3636        characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3637        operate on single-bytes only. This is an historical hangover. Maybe one day
3638        we can tidy these opcodes to handle multi-byte characters.
3639    
3640      The optimization throws away the bit map. We turn the item into a      The optimization throws away the bit map. We turn the item into a
3641      1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note      1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
# Line 2206  for (;; ptr++) Line 3645  for (;; ptr++)
3645      reqbyte, save the previous value for reinstating. */      reqbyte, save the previous value for reinstating. */
3646    
3647  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3648      if (class_charcount == 1 &&      if (class_charcount == 1 && !class_utf8 &&
3649            (!utf8 ||        (!utf8 || !negate_class || class_lastchar < 128))
           (!class_utf8 && (!negate_class || class_lastchar < 128))))  
   
3650  #else  #else
3651      if (class_charcount == 1)      if (class_charcount == 1)
3652  #endif  #endif
# Line 2252  for (;; ptr++) Line 3689  for (;; ptr++)
3689      zeroreqbyte = reqbyte;      zeroreqbyte = reqbyte;
3690    
3691      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
3692      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode, unless there was a negated special
3693      we can omit the bitmap. */      such as \S in the class, because in that case all characters > 255 are in
3694        the class, so any that were explicitly given as well can be ignored. If
3695        (when there are explicit characters > 255 that must be listed) there are no
3696        characters < 256, we can omit the bitmap in the actual compiled code. */
3697    
3698  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3699      if (class_utf8)      if (class_utf8 && !should_flip_negation)
3700        {        {
3701        *class_utf8data++ = XCL_END;    /* Marks the end of extra data */        *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
3702        *code++ = OP_XCLASS;        *code++ = OP_XCLASS;
3703        code += LINK_SIZE;        code += LINK_SIZE;
3704        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT : 0;
3705    
3706        /* If the map is required, install it, and move on to the end of        /* If the map is required, move up the extra data to make room for it;
3707        the extra data */        otherwise just move the code pointer to the end of the extra data. */
3708    
3709        if (class_charcount > 0)        if (class_charcount > 0)
3710          {          {
3711          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
3712            memmove(code + 32, code, class_utf8data - code);
3713          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
3714          code = class_utf8data;          code = class_utf8data + 32;
         }  
   
       /* If the map is not required, slide down the extra data. */  
   
       else  
         {  
         int len = class_utf8data - (code + 33);  
         memmove(code + 1, code + 33, len);  
         code += len + 1;  
3715          }          }
3716          else code = class_utf8data;
3717    
3718        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
3719    
# Line 2289  for (;; ptr++) Line 3722  for (;; ptr++)
3722        }        }
3723  #endif  #endif
3724    
3725      /* If there are no characters > 255, negate the 32-byte map if necessary,      /* If there are no characters > 255, set the opcode to OP_CLASS or
3726      and copy it into the code vector. If this is the first thing in the branch,      OP_NCLASS, depending on whether the whole class was negated and whether
3727      there can be no first char setting, whatever the repeat count. Any reqbyte      there were negative specials such as \S in the class. Then copy the 32-byte
3728      setting must remain unchanged after any kind of repeat. */      map into the code vector, negating it if necessary. */
3729    
3730        *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3731      if (negate_class)      if (negate_class)
3732        {        {
3733        *code++ = OP_NCLASS;        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3734        for (c = 0; c < 32; c++) code[c] = ~classbits[c];          for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3735        }        }
3736      else      else
3737        {        {
       *code++ = OP_CLASS;  
3738        memcpy(code, classbits, 32);        memcpy(code, classbits, 32);
3739        }        }
3740      code += 32;      code += 32;
3741      break;      break;
3742    
3743    
3744        /* ===================================================================*/
3745      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3746      has been tested above. */      has been tested above. */
3747    
3748      case '{':      case CHAR_LEFT_CURLY_BRACKET:
3749      if (!is_quantifier) goto NORMAL_CHAR;      if (!is_quantifier) goto NORMAL_CHAR;
3750      ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);      ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3751      if (*errorcodeptr != 0) goto FAILED;      if (*errorcodeptr != 0) goto FAILED;
3752      goto REPEAT;      goto REPEAT;
3753    
3754      case '*':      case CHAR_ASTERISK:
3755      repeat_min = 0;      repeat_min = 0;
3756      repeat_max = -1;      repeat_max = -1;
3757      goto REPEAT;      goto REPEAT;
3758    
3759      case '+':      case CHAR_PLUS:
3760      repeat_min = 1;      repeat_min = 1;
3761      repeat_max = -1;      repeat_max = -1;
3762      goto REPEAT;      goto REPEAT;
3763    
3764      case '?':      case CHAR_QUESTION_MARK:
3765      repeat_min = 0;      repeat_min = 0;
3766      repeat_max = 1;      repeat_max = 1;
3767    
# Line 2361  for (;; ptr++) Line 3796  for (;; ptr++)
3796      but if PCRE_UNGREEDY is set, it works the other way round. We change the      but if PCRE_UNGREEDY is set, it works the other way round. We change the
3797      repeat type to the non-default. */      repeat type to the non-default. */
3798    
3799      if (ptr[1] == '+')      if (ptr[1] == CHAR_PLUS)
3800        {        {
3801        repeat_type = 0;                  /* Force greedy */        repeat_type = 0;                  /* Force greedy */
3802        possessive_quantifier = TRUE;        possessive_quantifier = TRUE;
3803        ptr++;        ptr++;
3804        }        }
3805      else if (ptr[1] == '?')      else if (ptr[1] == CHAR_QUESTION_MARK)
3806        {        {
3807        repeat_type = greedy_non_default;        repeat_type = greedy_non_default;
3808        ptr++;        ptr++;
3809        }        }
3810      else repeat_type = greedy_default;      else repeat_type = greedy_default;
3811    
     /* If previous was a recursion, we need to wrap it inside brackets so that  
     it can be replicated if necessary. */  
   
     if (*previous == OP_RECURSE)  
       {  
       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);  
       code += 1 + LINK_SIZE;  
       *previous = OP_BRA;  
       PUT(previous, 1, code - previous);  
       *code = OP_KET;  
       PUT(code, 1, code - previous);  
       code += 1 + LINK_SIZE;  
       }  
   
3812      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
3813      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
3814      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
# Line 2421  for (;; ptr++) Line 3842  for (;; ptr++)
3842          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3843          }          }
3844    
3845          /* If the repetition is unlimited, it pays to see if the next thing on
3846          the line is something that cannot possibly match this character. If so,
3847          automatically possessifying this item gains some performance in the case
3848          where the match fails. */
3849    
3850          if (!possessive_quantifier &&
3851              repeat_max < 0 &&
3852              check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3853                options, cd))
3854            {
3855            repeat_type = 0;    /* Force greedy */
3856            possessive_quantifier = TRUE;
3857            }
3858    
3859        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3860        }        }
3861    
3862      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
3863      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
3864      character repeats by setting opt_type to add a suitable offset into      character repeats by setting opt_type to add a suitable offset into
3865      repeat_type. OP_NOT is currently used only for single-byte chars. */      repeat_type. We can also test for auto-possessification. OP_NOT is
3866        currently used only for single-byte chars. */
3867    
3868      else if (*previous == OP_NOT)      else if (*previous == OP_NOT)
3869        {        {
3870        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3871        c = previous[1];        c = previous[1];
3872          if (!possessive_quantifier &&
3873              repeat_max < 0 &&
3874              check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3875            {
3876            repeat_type = 0;    /* Force greedy */
3877            possessive_quantifier = TRUE;
3878            }
3879        goto OUTPUT_SINGLE_REPEAT;        goto OUTPUT_SINGLE_REPEAT;
3880        }        }
3881    
# Line 2450  for (;; ptr++) Line 3893  for (;; ptr++)
3893        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3894        c = *previous;        c = *previous;
3895    
3896          if (!possessive_quantifier &&
3897              repeat_max < 0 &&
3898              check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3899            {
3900            repeat_type = 0;    /* Force greedy */
3901            possessive_quantifier = TRUE;
3902            }
3903    
3904        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
3905        if (*previous == OP_PROP || *previous == OP_NOTPROP)        if (*previous == OP_PROP || *previous == OP_NOTPROP)
3906          {          {
# Line 2466  for (;; ptr++) Line 3917  for (;; ptr++)
3917    
3918        if (repeat_max == 0) goto END_REPEAT;        if (repeat_max == 0) goto END_REPEAT;
3919    
3920          /*--------------------------------------------------------------------*/
3921          /* This code is obsolete from release 8.00; the restriction was finally
3922          removed: */
3923    
3924        /* All real repeats make it impossible to handle partial matching (maybe        /* All real repeats make it impossible to handle partial matching (maybe
3925        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
3926    
3927        if (repeat_max != 1) cd->nopartial = TRUE;        /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
3928          /*--------------------------------------------------------------------*/
3929    
3930        /* Combine the op_type with the repeat_type */        /* Combine the op_type with the repeat_type */
3931    
# Line 2490  for (;; ptr++) Line 3946  for (;; ptr++)
3946          }          }
3947    
3948        /* A repeat minimum of 1 is optimized into some special cases. If the        /* A repeat minimum of 1 is optimized into some special cases. If the
3949        maximum is unlimited, we use OP_PLUS. Otherwise, the original item it        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3950        left in place and, if the maximum is greater than 1, we use OP_UPTO with        left in place and, if the maximum is greater than 1, we use OP_UPTO with
3951        one less than the maximum. */        one less than the maximum. */
3952    
# Line 2543  for (;; ptr++) Line 3999  for (;; ptr++)
3999            }            }
4000    
4001          /* Else insert an UPTO if the max is greater than the min, again          /* Else insert an UPTO if the max is greater than the min, again
4002          preceded by the character, for the previously inserted code. */          preceded by the character, for the previously inserted code. If the
4003            UPTO is just for 1 instance, we can use QUERY instead. */
4004    
4005          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
4006            {            {
# Line 2562  for (;; ptr++) Line 4019  for (;; ptr++)
4019              *code++ = prop_value;              *code++ = prop_value;
4020              }              }
4021            repeat_max -= repeat_min;            repeat_max -= repeat_min;
4022            *code++ = OP_UPTO + repeat_type;  
4023            PUT2INC(code, 0, repeat_max);            if (repeat_max == 1)
4024                {
4025                *code++ = OP_QUERY + repeat_type;
4026                }
4027              else
4028                {
4029                *code++ = OP_UPTO + repeat_type;
4030                PUT2INC(code, 0, repeat_max);
4031                }
4032            }            }
4033          }          }
4034    
# Line 2607  for (;; ptr++) Line 4072  for (;; ptr++)
4072          goto END_REPEAT;          goto END_REPEAT;
4073          }          }
4074    
4075          /*--------------------------------------------------------------------*/
4076          /* This code is obsolete from release 8.00; the restriction was finally
4077          removed: */
4078    
4079        /* All real repeats make it impossible to handle partial matching (maybe        /* All real repeats make it impossible to handle partial matching (maybe
4080        one day we will be able to remove this restriction). */        one day we will be able to remove this restriction). */
4081    
4082        if (repeat_max != 1) cd->nopartial = TRUE;        /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4083          /*--------------------------------------------------------------------*/
4084    
4085        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
4086          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
# Line 2630  for (;; ptr++) Line 4100  for (;; ptr++)
4100      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
4101      cases. */      cases. */
4102    
4103      else if (*previous >= OP_BRA || *previous == OP_ONCE ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
4104               *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
4105        {        {
4106        register int i;        register int i;
4107        int ketoffset = 0;        int ketoffset = 0;
4108        int len = code - previous;        int len = code - previous;
4109        uschar *bralink = NULL;        uschar *bralink = NULL;
4110    
4111          /* Repeating a DEFINE group is pointless */
4112    
4113          if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
4114            {
4115            *errorcodeptr = ERR55;
4116            goto FAILED;
4117            }
4118    
4119        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
4120        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
4121        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 2660  for (;; ptr++) Line 4138  for (;; ptr++)
4138    
4139        if (repeat_min == 0)        if (repeat_min == 0)
4140          {          {
4141          /* If the maximum is also zero, we just omit the group from the output          /* If the maximum is also zero, we used to just omit the group from the
4142          altogether. */          output altogether, like this:
4143    
4144          if (repeat_max == 0)          ** if (repeat_max == 0)
4145            {          **   {
4146            code = previous;          **   code = previous;
4147            goto END_REPEAT;          **   goto END_REPEAT;
4148            }          **   }
4149    
4150            However, that fails when a group is referenced as a subroutine from
4151            elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
4152            so that it is skipped on execution. As we don't have a list of which
4153            groups are referenced, we cannot do this selectively.
4154    
4155            If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
4156            and do no more at this point. However, we do need to adjust any
4157            OP_RECURSE calls inside the group that refer to the group itself or any
4158            internal or forward referenced group, because the offset is from the
4159            start of the whole regex. Temporarily terminate the pattern while doing
4160            this. */
4161    
4162          /* If the maximum is 1 or unlimited, we just have to stick in the          if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
         BRAZERO and do no more at this point. However, we do need to adjust  
         any OP_RECURSE calls inside the group that refer to the group itself or  
         any internal group, because the offset is from the start of the whole  
         regex. Temporarily terminate the pattern while doing this. */  
   
         if (repeat_max <= 1)  
4163            {            {
4164            *code = OP_END;            *code = OP_END;
4165            adjust_recurse(previous, 1, utf8, cd);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
4166            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
4167            code++;            code++;
4168              if (repeat_max == 0)
4169                {
4170                *previous++ = OP_SKIPZERO;
4171                goto END_REPEAT;
4172                }
4173            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
4174            }            }
4175    
# Line 2696  for (;; ptr++) Line 4185  for (;; ptr++)
4185            {            {
4186            int offset;            int offset;
4187            *code = OP_END;            *code = OP_END;
4188            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
4189            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
4190            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
4191            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2716  for (;; ptr++) Line 4205  for (;; ptr++)
4205        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
4206        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
4207        copies that we need. If we set a first char from the group, and didn't        copies that we need. If we set a first char from the group, and didn't
4208        set a required char, copy the latter from the former. */        set a required char, copy the latter from the former. If there are any
4209          forward reference subroutine calls in the group, there will be entries on
4210          the workspace list; replicate these with an appropriate increment. */
4211    
4212        else        else
4213          {          {
4214          if (repeat_min > 1)          if (repeat_min > 1)
4215            {            {
4216            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;            /* In the pre-compile phase, we don't actually do the replication. We
4217            for (i = 1; i < repeat_min; i++)            just adjust the length as if we had. Do some paranoid checks for
4218              potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
4219              integer type when available, otherwise double. */
4220    
4221              if (lengthptr != NULL)
4222                {
4223                int delta = (repeat_min - 1)*length_prevgroup;
4224                if ((INT64_OR_DOUBLE)(repeat_min - 1)*
4225                      (INT64_OR_DOUBLE)length_prevgroup >
4226                        (INT64_OR_DOUBLE)INT_MAX ||
4227                    OFLOW_MAX - *lengthptr < delta)
4228                  {
4229                  *errorcodeptr = ERR20;
4230                  goto FAILED;
4231                  }
4232                *lengthptr += delta;
4233                }
4234    
4235              /* This is compiling for real */
4236    
4237              else
4238              {              {
4239              memcpy(code, previous, len);              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
4240              code += len;              for (i = 1; i < repeat_min; i++)
4241                  {
4242                  uschar *hc;
4243                  uschar *this_hwm = cd->hwm;
4244                  memcpy(code, previous, len);
4245                  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4246                    {
4247                    PUT(cd->hwm, 0, GET(hc, 0) + len);
4248                    cd->hwm += LINK_SIZE;
4249                    }
4250                  save_hwm = this_hwm;
4251                  code += len;
4252                  }
4253              }              }
4254            }            }
4255    
4256          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
4257          }          }
4258    
# Line 2736  for (;; ptr++) Line 4260  for (;; ptr++)
4260        the maximum is limited, it replicates the group in a nested fashion,        the maximum is limited, it replicates the group in a nested fashion,
4261        remembering the bracket starts on a stack. In the case of a zero minimum,        remembering the bracket starts on a stack. In the case of a zero minimum,
4262        the first one was set up above. In all cases the repeat_max now specifies        the first one was set up above. In all cases the repeat_max now specifies
4263        the number of additional copies needed. */        the number of additional copies needed. Again, we must remember to
4264          replicate entries on the forward reference list. */
4265    
4266        if (repeat_max >= 0)        if (repeat_max >= 0)
4267          {          {
4268          for (i = repeat_max - 1; i >= 0; i--)          /* In the pre-compile phase, we don't actually do the replication. We
4269            just adjust the length as if we had. For each repetition we must add 1
4270            to the length for BRAZERO and for all but the last repetition we must
4271            add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
4272            paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
4273            a 64-bit integer type when available, otherwise double. */
4274    
4275            if (lengthptr != NULL && repeat_max > 0)
4276              {
4277              int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4278                          2 - 2*LINK_SIZE;   /* Last one doesn't nest */
4279              if ((INT64_OR_DOUBLE)repeat_max *
4280                    (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4281                      > (INT64_OR_DOUBLE)INT_MAX ||
4282                  OFLOW_MAX - *lengthptr < delta)
4283                {
4284                *errorcodeptr = ERR20;
4285                goto FAILED;
4286                }
4287              *lengthptr += delta;
4288              }
4289    
4290            /* This is compiling for real */
4291    
4292            else for (i = repeat_max - 1; i >= 0; i--)
4293            {            {
4294              uschar *hc;
4295              uschar *this_hwm = cd->hwm;
4296    
4297            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
4298    
4299            /* All but the final copy start a new nesting, maintaining the            /* All but the final copy start a new nesting, maintaining the
# Line 2757  for (;; ptr++) Line 4309  for (;; ptr++)
4309              }              }
4310    
4311            memcpy(code, previous, len);            memcpy(code, previous, len);
4312              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4313                {
4314                PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4315                cd->hwm += LINK_SIZE;
4316                }
4317              save_hwm = this_hwm;
4318            code += len;            code += len;
4319            }            }
4320    
# Line 2779  for (;; ptr++) Line 4337  for (;; ptr++)
4337        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
4338        can't just offset backwards from the current code point, because we        can't just offset backwards from the current code point, because we
4339        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
4340        correct offset was computed above. */        correct offset was computed above.
4341    
4342        else code[-ketoffset] = OP_KETRMAX + repeat_type;        Then, when we are doing the actual compile phase, check to see whether
4343          this group is a non-atomic one that could match an empty string. If so,
4344          convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4345          that runtime checking can be done. [This check is also applied to
4346          atomic groups at runtime, but in a different way.] */
4347    
4348          else
4349            {
4350            uschar *ketcode = code - ketoffset;
4351            uschar *bracode = ketcode - GET(ketcode, 1);
4352            *ketcode = OP_KETRMAX + repeat_type;
4353            if (lengthptr == NULL && *bracode != OP_ONCE)
4354              {
4355              uschar *scode = bracode;
4356              do
4357                {
4358                if (could_be_empty_branch(scode, ketcode, utf8))
4359                  {
4360                  *bracode += OP_SBRA - OP_BRA;
4361                  break;
4362                  }
4363                scode += GET(scode, 1);
4364                }
4365              while (*scode == OP_ALT);
4366              }
4367            }
4368        }        }
4369    
4370        /* If previous is OP_FAIL, it was generated by an empty class [] in
4371        JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4372        by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4373        error above. We can just ignore the repeat in JS case. */
4374    
4375        else if (*previous == OP_FAIL) goto END_REPEAT;
4376    
4377      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
4378    
4379      else      else
# Line 2792  for (;; ptr++) Line 4382  for (;; ptr++)
4382        goto FAILED;        goto FAILED;
4383        }        }
4384    
4385      /* If the character following a repeat is '+', we wrap the entire repeated      /* If the character following a repeat is '+', or if certain optimization
4386      item inside OP_ONCE brackets. This is just syntactic sugar, taken from      tests above succeeded, possessive_quantifier is TRUE. For some of the
4387      Sun's Java package. The repeated item starts at tempcode, not at previous,      simpler opcodes, there is a special alternative opcode for this. For
4388      which might be the first part of a string whose (former) last char we      anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4389      repeated. However, we don't support '+' after a greediness '?'. */      The '+' notation is just syntactic sugar, taken from Sun's Java package,
4390        but the special opcodes can optimize it a bit. The repeated item starts at
4391        tempcode, not at previous, which might be the first part of a string whose
4392        (former) last char we repeated.
4393    
4394        Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4395        an 'upto' may follow. We skip over an 'exact' item, and then test the
4396        length of what remains before proceeding. */
4397    
4398      if (possessive_quantifier)      if (possessive_quantifier)
4399        {        {
4400        int len = code - tempcode;        int len;
4401        memmove(tempcode + 1+LINK_SIZE, tempcode, len);  
4402        code += 1 + LINK_SIZE;        if (*tempcode == OP_TYPEEXACT)
4403        len += 1 + LINK_SIZE;          tempcode += _pcre_OP_lengths[*tempcode] +
4404        tempcode[0] = OP_ONCE;            ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);
4405        *code++ = OP_KET;  
4406        PUTINC(code, 0, len);        else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
4407        PUT(tempcode, 1, len);          {
4408            tempcode += _pcre_OP_lengths[*tempcode];
4409    #ifdef SUPPORT_UTF8
4410            if (utf8 && tempcode[-1] >= 0xc0)
4411              tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];
4412    #endif
4413            }
4414    
4415          len = code - tempcode;
4416          if (len > 0) switch (*tempcode)
4417            {
4418            case OP_STAR:  *tempcode = OP_POSSTAR; break;
4419            case OP_PLUS:  *tempcode = OP_POSPLUS; break;
4420            case OP_QUERY: *tempcode = OP_POSQUERY; break;
4421            case OP_UPTO:  *tempcode = OP_POSUPTO; break;
4422    
4423            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
4424            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
4425            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4426            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
4427    
4428            case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
4429            case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
4430            case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4431            case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
4432    
4433            default:
4434            memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4435            code += 1 + LINK_SIZE;
4436            len += 1 + LINK_SIZE;
4437            tempcode[0] = OP_ONCE;
4438            *code++ = OP_KET;
4439            PUTINC(code, 0, len);
4440            PUT(tempcode, 1, len);
4441            break;
4442            }
4443        }        }
4444    
4445      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 2820  for (;; ptr++) Line 4452  for (;; ptr++)
4452      break;      break;
4453    
4454    
4455      /* Start of nested bracket sub-expression, or comment or lookahead or      /* ===================================================================*/
4456      lookbehind or option setting or condition. First deal with special things      /* Start of nested parenthesized sub-expression, or comment or lookahead or
4457      that can come after a bracket; all are introduced by ?, and the appearance      lookbehind or option setting or condition or all the other extended
4458      of any of them means that this is not a referencing group. They were      parenthesis forms.  */
     checked for validity in the first pass over the string, so we don't have to  
     check for syntax errors here.  */  
4459    
4460      case '(':      case CHAR_LEFT_PARENTHESIS:
4461      newoptions = options;      newoptions = options;
4462      skipbytes = 0;      skipbytes = 0;
4463        bravalue = OP_CBRA;
4464        save_hwm = cd->hwm;
4465        reset_bracount = FALSE;
4466    
4467        /* First deal with various "verbs" that can be introduced by '*'. */
4468    
4469        if (*(++ptr) == CHAR_ASTERISK && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4470          {
4471          int i, namelen;
4472          const char *vn = verbnames;
4473          const uschar *name = ++ptr;
4474          previous = NULL;
4475          while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
4476          if (*ptr == CHAR_COLON)
4477            {
4478            *errorcodeptr = ERR59;   /* Not supported */
4479            goto FAILED;
4480            }
4481          if (*ptr != CHAR_RIGHT_PARENTHESIS)
4482            {
4483            *errorcodeptr = ERR60;
4484            goto FAILED;
4485            }
4486          namelen = ptr - name;
4487          for (i = 0; i < verbcount; i++)
4488            {
4489            if (namelen == verbs[i].len &&
4490                strncmp((char *)name, vn, namelen) == 0)
4491              {
4492              /* Check for open captures before ACCEPT */
4493    
4494              if (verbs[i].op == OP_ACCEPT)
4495                {
4496                open_capitem *oc;
4497                cd->had_accept = TRUE;
4498                for (oc = cd->open_caps; oc != NULL; oc = oc->next)
4499                  {
4500                  *code++ = OP_CLOSE;
4501                  PUT2INC(code, 0, oc->number);
4502                  }
4503                }
4504              *code++ = verbs[i].op;
4505              break;
4506              }
4507            vn += verbs[i].len + 1;
4508            }
4509          if (i < verbcount) continue;
4510          *errorcodeptr = ERR60;
4511          goto FAILED;
4512          }
4513    
4514      if (*(++ptr) == '?')      /* Deal with the extended parentheses; all are introduced by '?', and the
4515        appearance of any of them means that this is not a capturing group. */
4516    
4517        else if (*ptr == CHAR_QUESTION_MARK)
4518        {        {
4519        int set, unset;        int i, set, unset, namelen;
4520        int *optset;        int *optset;
4521          const uschar *name;
4522          uschar *slot;
4523    
4524        switch (*(++ptr))        switch (*(++ptr))
4525          {          {
4526          case '#':                 /* Comment; skip to ket */          case CHAR_NUMBER_SIGN:                 /* Comment; skip to ket */
4527          ptr++;          ptr++;
4528          while (*ptr != ')') ptr++;          while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4529            if (*ptr == 0)
4530              {
4531              *errorcodeptr = ERR18;
4532              goto FAILED;
4533              }
4534          continue;          continue;
4535    
4536          case ':':                 /* Non-extracting bracket */  
4537            /* ------------------------------------------------------------ */
4538            case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
4539            reset_bracount = TRUE;
4540            /* Fall through */
4541    
4542            /* ------------------------------------------------------------ */
4543            case CHAR_COLON:          /* Non-capturing bracket */
4544          bravalue = OP_BRA;          bravalue = OP_BRA;
4545          ptr++;          ptr++;
4546          break;          break;
4547    
4548          case '(':  
4549            /* ------------------------------------------------------------ */
4550            case CHAR_LEFT_PARENTHESIS:
4551          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
4552    
4553          /* Condition to test for recursion */          /* A condition can be an assertion, a number (referring to a numbered
4554            group), a name (referring to a named group), or 'R', referring to
4555            recursion. R<digits> and R&name are also permitted for recursion tests.
4556    
4557            There are several syntaxes for testing a named group: (?(name)) is used
4558            by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4559    
4560            There are two unfortunate ambiguities, caused by history. (a) 'R' can
4561            be the recursive thing or the name 'R' (and similarly for 'R' followed
4562            by digits), and (b) a number could be a name that consists of digits.
4563            In both cases, we look for a name first; if not found, we try the other
4564            cases. */
4565    
4566            /* For conditions that are assertions, check the syntax, and then exit
4567            the switch. This will take control down to where bracketed groups,
4568            including assertions, are processed. */
4569    
4570          if (ptr[1] == 'R')          if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
4571            {              ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
45