/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

Old (left): revision 79 by nigel, Sat Feb 24 21:40:52 2007 UTC | New (right): revision 216 by ph10, Wed Aug 15 14:35:57 2007 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2005 University of Cambridge             Copyright (c) 1997-2007 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45    #ifdef HAVE_CONFIG_H
46    #include <config.h>
47    #endif
48    
49    #define NLBLOCK cd             /* Block containing newline information */
50    #define PSSTART start_pattern  /* Field containing processed string start */
51    #define PSEND   end_pattern    /* Field containing processed string end */
52    
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
56    /* When DEBUG is defined, we need the pcre_printint() function, which is also
57    used by pcretest. DEBUG is not defined when building a production library. */
58    
59    #ifdef DEBUG
60    #include "pcre_printint.src"
61    #endif
62    
63    
64    /* Macro for setting individual bits in class bitmaps. */
65    
66    #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
67    
68    /* Maximum length value to check against when making sure that the integer that
69    holds the compiled pattern length does not overflow. We make it a bit less than
70    INT_MAX to allow for adding in group terminating bytes, so that we don't have
71    to check them every time. */
72    
73    #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76  /*************************************************  /*************************************************
77  *      Code parameters and static tables         *  *      Code parameters and static tables         *
78  *************************************************/  *************************************************/
79    
80  /* Maximum number of items on the nested bracket stacks at compile time. This  /* This value specifies the size of stack workspace that is used during the
81  applies to the nesting of all kinds of parentheses. It does not limit  first pre-compile phase that determines how much memory is required. The regex
82  un-nested, non-capturing parentheses. This number can be made bigger if  is partly compiled into this space, but the compiled parts are discarded as
83  necessary - it is used to dimension one int and one unsigned char vector at  soon as they can be, so that hopefully there will never be an overrun. The code
84  compile time. */  does, however, check for an overrun. The largest amount I've seen used is 218,
85    so this number is very generous.
86    
87    The same workspace is used during the second, actual compile phase for
88    remembering forward references to groups so that they can be filled in at the
89    end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90    is 4 there is plenty of room. */
91    
92  #define BRASTACK_SIZE 200  #define COMPILE_WORK_SIZE (4096)
93    
94    
95  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 63  are simple data values; negative values Line 97  are simple data values; negative values
97  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
98  is invalid. */  is invalid. */
99    
100  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
101  static const short int escapes[] = {  static const short int escapes[] = {
102       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
103       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
105       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */  -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
106  -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
107  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
108     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
109       0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */  -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
110  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
111       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
112  };  };
113    
114  #else         /* This is the "abnormal" table for EBCDIC systems */  #else           /* This is the "abnormal" table for EBCDIC systems */
115  static const short int escapes[] = {  static const short int escapes[] = {
116  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
117  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 87  static const short int escapes[] = { Line 121  static const short int escapes[] = {
121  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
122  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
123  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
124  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
125  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
126  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
127  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
128  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
129  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
130  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
131  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
132  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
133  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
134  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
135  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
136  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
137  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
138  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
# Line 106  static const short int escapes[] = { Line 140  static const short int escapes[] = {
140  #endif  #endif
141    
142    
143    /* Table of special "verbs" like (*PRUNE) */
144    
145    typedef struct verbitem {
146      const char *name;
147      int   len;
148      int   op;
149    } verbitem;
150    
151    static verbitem verbs[] = {
152      { "ACCEPT", 6, OP_ACCEPT },
153      { "COMMIT", 6, OP_COMMIT },
154      { "F",      1, OP_FAIL },
155      { "FAIL",   4, OP_FAIL },
156      { "PRUNE",  5, OP_PRUNE },
157      { "SKIP",   4, OP_SKIP  },
158      { "THEN",   4, OP_THEN  }
159    };
160    
161    static int verbcount = sizeof(verbs)/sizeof(verbitem);
162    
163    
164  /* Tables of names of POSIX character classes and their lengths. The list is  /* Tables of names of POSIX character classes and their lengths. The list is
165  terminated by a zero length entry. The first three must be alpha, upper, lower,  terminated by a zero length entry. The first three must be alpha, lower, upper,
166  as this is assumed for handling case independence. */  as this is assumed for handling case independence. */
167    
168  static const char *const posix_names[] = {  static const char *const posix_names[] = {
# Line 118  static const char *const posix_names[] = Line 173  static const char *const posix_names[] =
173  static const uschar posix_name_lengths[] = {  static const uschar posix_name_lengths[] = {
174    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
175    
176  /* Table of class bit maps for each POSIX class; up to three may be combined  /* Table of class bit maps for each POSIX class. Each class is formed from a
177  to form the class. The table for [:blank:] is dynamically modified to remove  base map, with an optional addition or removal of another map. Then, for some
178  the vertical space characters. */  classes, there is some additional tweaking: for [:blank:] the vertical space
179    characters are removed, and for [:alpha:] and [:alnum:] the underscore
180    character is removed. The triples in the table consist of the base map offset,
181    second map offset or -1 if no second map, and a non-negative value for map
182    addition or a negative value for map subtraction (if there are two maps). The
183    absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
184    remove vertical space characters, 2 => remove underscore. */
185    
186  static const int posix_class_maps[] = {  static const int posix_class_maps[] = {
187    cbit_lower, cbit_upper, -1,             /* alpha */    cbit_word,  cbit_digit, -2,             /* alpha */
188    cbit_lower, -1,         -1,             /* lower */    cbit_lower, -1,          0,             /* lower */
189    cbit_upper, -1,         -1,             /* upper */    cbit_upper, -1,          0,             /* upper */
190    cbit_digit, cbit_lower, cbit_upper,     /* alnum */    cbit_word,  -1,          2,             /* alnum - word without underscore */
191    cbit_print, cbit_cntrl, -1,             /* ascii */    cbit_print, cbit_cntrl,  0,             /* ascii */
192    cbit_space, -1,         -1,             /* blank - a GNU extension */    cbit_space, -1,          1,             /* blank - a GNU extension */
193    cbit_cntrl, -1,         -1,             /* cntrl */    cbit_cntrl, -1,          0,             /* cntrl */
194    cbit_digit, -1,         -1,             /* digit */    cbit_digit, -1,          0,             /* digit */
195    cbit_graph, -1,         -1,             /* graph */    cbit_graph, -1,          0,             /* graph */
196    cbit_print, -1,         -1,             /* print */    cbit_print, -1,          0,             /* print */
197    cbit_punct, -1,         -1,             /* punct */    cbit_punct, -1,          0,             /* punct */
198    cbit_space, -1,         -1,             /* space */    cbit_space, -1,          0,             /* space */
199    cbit_word,  -1,         -1,             /* word - a Perl extension */    cbit_word,  -1,          0,             /* word - a Perl extension */
200    cbit_xdigit,-1,         -1              /* xdigit */    cbit_xdigit,-1,          0              /* xdigit */
201  };  };
202    
203    
204    #define STRING(a)  # a
205    #define XSTRING(s) STRING(s)
206    
207  /* The texts of compile-time error messages. These are "char *" because they  /* The texts of compile-time error messages. These are "char *" because they
208  are passed to the outside world. */  are passed to the outside world. Do not ever re-use any error number, because
209    they are documented. Always add a new error instead. Messages marked DEAD below
210    are no longer used. */
211    
212  static const char *error_texts[] = {  static const char *error_texts[] = {
213    "no error",    "no error",
# Line 156  static const char *error_texts[] = { Line 222  static const char *error_texts[] = {
222    "range out of order in character class",    "range out of order in character class",
223    "nothing to repeat",    "nothing to repeat",
224    /* 10 */    /* 10 */
225    "operand of unlimited repeat could match the empty string",    "operand of unlimited repeat could match the empty string",  /** DEAD **/
226    "internal error: unexpected repeat",    "internal error: unexpected repeat",
227    "unrecognized character after (?",    "unrecognized character after (?",
228    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class",
# Line 166  static const char *error_texts[] = { Line 232  static const char *error_texts[] = {
232    "erroffset passed as NULL",    "erroffset passed as NULL",
233    "unknown option bit(s) set",    "unknown option bit(s) set",
234    "missing ) after comment",    "missing ) after comment",
235    "parentheses nested too deeply",    "parentheses nested too deeply",  /** DEAD **/
236    /* 20 */    /* 20 */
237    "regular expression too large",    "regular expression is too large",
238    "failed to get memory",    "failed to get memory",
239    "unmatched parentheses",    "unmatched parentheses",
240    "internal error: code overflow",    "internal error: code overflow",
241    "unrecognized character after (?<",    "unrecognized character after (?<",
242    /* 25 */    /* 25 */
243    "lookbehind assertion is not fixed length",    "lookbehind assertion is not fixed length",
244    "malformed number after (?(",    "malformed number or name after (?(",
245    "conditional group contains more than two branches",    "conditional group contains more than two branches",
246    "assertion expected after (?(",    "assertion expected after (?(",
247    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )",
248    /* 30 */    /* 30 */
249    "unknown POSIX class name",    "unknown POSIX class name",
250    "POSIX collating elements are not supported",    "POSIX collating elements are not supported",
251    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support",
252    "spare error",    "spare error",  /** DEAD **/
253    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large",
254    /* 35 */    /* 35 */
255    "invalid condition (?(0)",    "invalid condition (?(0)",
# Line 194  static const char *error_texts[] = { Line 260  static const char *error_texts[] = {
260    /* 40 */    /* 40 */
261    "recursive call could loop indefinitely",    "recursive call could loop indefinitely",
262    "unrecognized character after (?P",    "unrecognized character after (?P",
263    "syntax error after (?P",    "syntax error in subpattern name (missing terminator)",
264    "two named groups have the same name",    "two named subpatterns have the same name",
265    "invalid UTF-8 string",    "invalid UTF-8 string",
266    /* 45 */    /* 45 */
267    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled",
268    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence",
269    "unknown property name after \\P or \\p"    "unknown property name after \\P or \\p",
270      "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
271      "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
272      /* 50 */
273      "repeated subpattern is too long",    /** DEAD **/
274      "octal value is greater than \\377 (not in UTF-8 mode)",
275      "internal error: overran compiling workspace",
276      "internal error: previously-checked referenced subpattern not found",
277      "DEFINE group contains more than one branch",
278      /* 55 */
279      "repeating a DEFINE group is not allowed",
280      "inconsistent NEWLINE options",
281      "\\g is not followed by a braced name or an optionally braced non-zero number",
282      "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number",
283      "(*VERB) with an argument is not supported",
284      /* 60 */
285      "(*VERB) not recognized",
286      "number is too big"
287  };  };
288    
289    
# Line 220  For convenience, we use the same bit def Line 303  For convenience, we use the same bit def
303    
304  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
305    
306  #if !EBCDIC    /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
307  static const unsigned char digitab[] =  static const unsigned char digitab[] =
308    {    {
309    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 256  static const unsigned char digitab[] = Line 339  static const unsigned char digitab[] =
339    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
340    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
341    
342  #else          /* This is the "abnormal" case, for EBCDIC systems */  #else           /* This is the "abnormal" case, for EBCDIC systems */
343  static const unsigned char digitab[] =  static const unsigned char digitab[] =
344    {    {
345    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 270  static const unsigned char digitab[] = Line 353  static const unsigned char digitab[] =
353    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
354    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
355    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
356    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88-     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
357    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
358    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
359    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
# Line 304  static const unsigned char ebcdic_charta Line 387  static const unsigned char ebcdic_charta
387    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
388    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
389    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
390    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88-  */    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
391    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
392    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
393    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
# Line 331  static const unsigned char ebcdic_charta Line 414  static const unsigned char ebcdic_charta
414  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
415    
416  static BOOL  static BOOL
417    compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
418      int *, int *, branch_chain *, compile_data *);      int *, int *, branch_chain *, compile_data *, int *);
419    
420    
421    
# Line 342  static BOOL Line 425  static BOOL
425    
426  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
427  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
428  encodes one of the more complicated things such as \d. When UTF-8 is enabled,  encodes one of the more complicated things such as \d. A backreference to group
429  a positive value greater than 255 may be returned. On entry, ptr is pointing at  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
430  the \. On exit, it is on the final character of the escape sequence.  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
431    ptr is pointing at the \. On exit, it is on the final character of the escape
432    sequence.
433    
434  Arguments:  Arguments:
435    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
# Line 355  Arguments: Line 440  Arguments:
440    
441  Returns:         zero or positive => a data character  Returns:         zero or positive => a data character
442                   negative => a special escape sequence                   negative => a special escape sequence
443                   on error, errorptr is set                   on error, errorcodeptr is set
444  */  */
445    
446  static int  static int
447  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
448    int options, BOOL isclass)    int options, BOOL isclass)
449  {  {
450  const uschar *ptr = *ptrptr;  BOOL utf8 = (options & PCRE_UTF8) != 0;
451    const uschar *ptr = *ptrptr + 1;
452  int c, i;  int c, i;
453    
454    GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
455    ptr--;                            /* Set pointer back to the last byte */
456    
457  /* If backslash is at the end of the pattern, it's an error. */  /* If backslash is at the end of the pattern, it's an error. */
458    
 c = *(++ptr);  
459  if (c == 0) *errorcodeptr = ERR1;  if (c == 0) *errorcodeptr = ERR1;
460    
461  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
462  a table. A non-zero result is something that can be returned immediately.  a table. A non-zero result is something that can be returned immediately.
463  Otherwise further processing may be required. */  Otherwise further processing may be required. */
464    
465  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
466  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < '0' || c > 'z') {}                           /* Not alphameric */
467  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - '0']) != 0) c = i;
468    
469  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
470  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
471  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
472  #endif  #endif
# Line 388  else if ((i = escapes[c - 0x48]) != 0) Line 476  else if ((i = escapes[c - 0x48]) != 0)
476  else  else
477    {    {
478    const uschar *oldptr;    const uschar *oldptr;
479      BOOL braced, negated;
480    
481    switch (c)    switch (c)
482      {      {
483      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
# Line 401  else Line 491  else
491      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
492      break;      break;
493    
494        /* \g must be followed by a number, either plain or braced. If positive, it
495        is an absolute backreference. If negative, it is a relative backreference.
496        This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
497        reference to a named group. This is part of Perl's movement towards a
498        unified syntax for back references. As this is synonymous with \k{name}, we
499        fudge it up by pretending it really was \k. */
500    
501        case 'g':
502        if (ptr[1] == '{')
503          {
504          const uschar *p;
505          for (p = ptr+2; *p != 0 && *p != '}'; p++)
506            if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
507          if (*p != 0 && *p != '}')
508            {
509            c = -ESC_k;
510            break;
511            }
512          braced = TRUE;
513          ptr++;
514          }
515        else braced = FALSE;
516    
517        if (ptr[1] == '-')
518          {
519          negated = TRUE;
520          ptr++;
521          }
522        else negated = FALSE;
523    
524        c = 0;
525        while ((digitab[ptr[1]] & ctype_digit) != 0)
526          c = c * 10 + *(++ptr) - '0';
527    
528        if (c < 0)
529          {
530          *errorcodeptr = ERR61;
531          break;
532          }
533    
534        if (c == 0 || (braced && *(++ptr) != '}'))
535          {
536          *errorcodeptr = ERR57;
537          break;
538          }
539    
540        if (negated)
541          {
542          if (c > bracount)
543            {
544            *errorcodeptr = ERR15;
545            break;
546            }
547          c = bracount - (c - 1);
548          }
549    
550        c = -(ESC_REF + c);
551        break;
552    
553      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
554      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
555      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 422  else Line 571  else
571        c -= '0';        c -= '0';
572        while ((digitab[ptr[1]] & ctype_digit) != 0)        while ((digitab[ptr[1]] & ctype_digit) != 0)
573          c = c * 10 + *(++ptr) - '0';          c = c * 10 + *(++ptr) - '0';
574          if (c < 0)
575            {
576            *errorcodeptr = ERR61;
577            break;
578            }
579        if (c < 10 || c <= bracount)        if (c < 10 || c <= bracount)
580          {          {
581          c = -(ESC_REF + c);          c = -(ESC_REF + c);
# Line 442  else Line 596  else
596        }        }
597    
598      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
599      larger first octal digit. */      larger first octal digit. The original code used just to take the least
600        significant 8 bits of octal numbers (I think this is what early Perls used
601        to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
602        than 3 octal digits. */
603    
604      case '0':      case '0':
605      c -= '0';      c -= '0';
606      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
607          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
608      c &= 255;     /* Take least significant 8 bits */      if (!utf8 && c > 255) *errorcodeptr = ERR51;
609      break;      break;
610    
611      /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number      /* \x is complicated. \x{ddd} is a character number which can be greater
612      which can be greater than 0xff, but only if the ddd are hex digits. */      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
613        treated as a data character. */
614    
615      case 'x':      case 'x':
616  #ifdef SUPPORT_UTF8      if (ptr[1] == '{')
     if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)  
617        {        {
618        const uschar *pt = ptr + 2;        const uschar *pt = ptr + 2;
619        register int count = 0;        int count = 0;
620    
621        c = 0;        c = 0;
622        while ((digitab[*pt] & ctype_xdigit) != 0)        while ((digitab[*pt] & ctype_xdigit) != 0)
623          {          {
624          int cc = *pt++;          register int cc = *pt++;
625            if (c == 0 && cc == '0') continue;     /* Leading zeroes */
626          count++;          count++;
627  #if !EBCDIC    /* ASCII coding */  
628    #ifndef EBCDIC  /* ASCII coding */
629          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= 'a') cc -= 32;               /* Convert to upper case */
630          c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
631  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
632          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
633          c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
634  #endif  #endif
635          }          }
636    
637        if (*pt == '}')        if (*pt == '}')
638          {          {
639          if (c < 0 || count > 8) *errorcodeptr = ERR34;          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
640          ptr = pt;          ptr = pt;
641          break;          break;
642          }          }
643    
644        /* If the sequence of hex digits does not end with '}', then we don't        /* If the sequence of hex digits does not end with '}', then we don't
645        recognize this construct; fall through to the normal \x handling. */        recognize this construct; fall through to the normal \x handling. */
646        }        }
 #endif  
647    
648      /* Read just a single hex char */      /* Read just a single-byte hex-defined char */
649    
650      c = 0;      c = 0;
651      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
652        {        {
653        int cc;                               /* Some compilers don't like ++ */        int cc;                               /* Some compilers don't like ++ */
654        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                        /* in initializers */
655  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
656        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= 'a') cc -= 32;              /* Convert to upper case */
657        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
658  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
659        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= 'z') cc += 64;              /* Convert to upper case */
660        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
661  #endif  #endif
662        }        }
663      break;      break;
664    
665      /* Other special escapes not starting with a digit are straightforward */      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
666        This coding is ASCII-specific, but then the whole concept of \cx is
667        ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
668    
669      case 'c':      case 'c':
670      c = *(++ptr);      c = *(++ptr);
671      if (c == 0)      if (c == 0)
672        {        {
673        *errorcodeptr = ERR2;        *errorcodeptr = ERR2;
674        return 0;        break;
675        }        }
676    
677      /* A letter is upper-cased; then the 0x40 bit is flipped. This coding  #ifndef EBCDIC  /* ASCII coding */
     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.  
     (However, an EBCDIC equivalent has now been added.) */  
   
 #if !EBCDIC    /* ASCII coding */  
678      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= 'a' && c <= 'z') c -= 32;
679      c ^= 0x40;      c ^= 0x40;
680  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
681      if (c >= 'a' && c <= 'z') c += 64;      if (c >= 'a' && c <= 'z') c += 64;
682      c ^= 0xC0;      c ^= 0xC0;
683  #endif  #endif
# Line 560  escape sequence. Line 719  escape sequence.
719  Argument:  Argument:
720    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
721    negptr         points to a boolean that is set TRUE for negation else FALSE    negptr         points to a boolean that is set TRUE for negation else FALSE
722      dptr           points to an int that is set to the detailed property value
723    errorcodeptr   points to the error code variable    errorcodeptr   points to the error code variable
724    
725  Returns:     value from ucp_type_table, or -1 for an invalid type  Returns:         type value from ucp_type_table, or -1 for an invalid type
726  */  */
727    
728  static int  static int
729  get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)  get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
730  {  {
731  int c, i, bot, top;  int c, i, bot, top;
732  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
733  char name[4];  char name[32];
734    
735  c = *(++ptr);  c = *(++ptr);
736  if (c == 0) goto ERROR_RETURN;  if (c == 0) goto ERROR_RETURN;
737    
738  *negptr = FALSE;  *negptr = FALSE;
739    
740  /* \P or \p can be followed by a one- or two-character name in {}, optionally  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
741  preceded by ^ for negation. */  negation. */
742    
743  if (c == '{')  if (c == '{')
744    {    {
# Line 587  if (c == '{') Line 747  if (c == '{')
747      *negptr = TRUE;      *negptr = TRUE;
748      ptr++;      ptr++;
749      }      }
750    for (i = 0; i <= 2; i++)    for (i = 0; i < (int)sizeof(name) - 1; i++)
751      {      {
752      c = *(++ptr);      c = *(++ptr);
753      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
754      if (c == '}') break;      if (c == '}') break;
755      name[i] = c;      name[i] = c;
756      }      }
757    if (c !='}')   /* Try to distinguish error cases */    if (c !='}') goto ERROR_RETURN;
     {  
     while (*(++ptr) != 0 && *ptr != '}');  
     if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;  
     }  
758    name[i] = 0;    name[i] = 0;
759    }    }
760    
# Line 619  top = _pcre_utt_size; Line 775  top = _pcre_utt_size;
775    
776  while (bot < top)  while (bot < top)
777    {    {
778    i = (bot + top)/2;    i = (bot + top) >> 1;
779    c = strcmp(name, _pcre_utt[i].name);    c = strcmp(name, _pcre_utt[i].name);
780    if (c == 0) return _pcre_utt[i].value;    if (c == 0)
781        {
782        *dptr = _pcre_utt[i].value;
783        return _pcre_utt[i].type;
784        }
785    if (c > 0) bot = i + 1; else top = i;    if (c > 0) bot = i + 1; else top = i;
786    }    }
787    
 UNKNOWN_RETURN:  
788  *errorcodeptr = ERR47;  *errorcodeptr = ERR47;
789  *ptrptr = ptr;  *ptrptr = ptr;
790  return -1;  return -1;
# Line 698  read_repeat_counts(const uschar *p, int Line 857  read_repeat_counts(const uschar *p, int
857  int min = 0;  int min = 0;
858  int max = -1;  int max = -1;
859    
860    /* Read the minimum value and do a paranoid check: a negative value indicates
861    an integer overflow. */
862    
863  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
864    if (min < 0 || min > 65535)
865      {
866      *errorcodeptr = ERR5;
867      return p;
868      }
869    
870    /* Read the maximum value if there is one, and again do a paranoid check on its size.
871    Also, max must not be less than min. */
872    
873  if (*p == '}') max = min; else  if (*p == '}') max = min; else
874    {    {
# Line 706  if (*p == '}') max = min; else Line 876  if (*p == '}') max = min; else
876      {      {
877      max = 0;      max = 0;
878      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
879        if (max < 0 || max > 65535)
880          {
881          *errorcodeptr = ERR5;
882          return p;
883          }
884      if (max < min)      if (max < min)
885        {        {
886        *errorcodeptr = ERR4;        *errorcodeptr = ERR4;
# Line 714  if (*p == '}') max = min; else Line 889  if (*p == '}') max = min; else
889      }      }
890    }    }
891    
892  /* Do paranoid checks, then fill in the required variables, and pass back the  /* Fill in the required variables, and pass back the pointer to the terminating
893  pointer to the terminating '}'. */  '}'. */
894    
895  if (min > 65535 || max > 65535)  *minp = min;
896    *errorcodeptr = ERR5;  *maxp = max;
897  else  return p;
898    }
899    
900    
901    
902    /*************************************************
903    *       Find forward referenced subpattern       *
904    *************************************************/
905    
906    /* This function scans along a pattern's text looking for capturing
907    subpatterns, and counting them. If it finds a named pattern that matches the
908    name it is given, it returns its number. Alternatively, if the name is NULL, it
909    returns when it reaches a given numbered subpattern. This is used for forward
910    references to subpatterns. We know that if (?P< is encountered, the name will
911    be terminated by '>' because that is checked in the first pass.
912    
913    Arguments:
914      ptr          current position in the pattern
915      count        current count of capturing parens so far encountered
916      name         name to seek, or NULL if seeking a numbered subpattern
917      lorn         name length, or subpattern number if name is NULL
918      xmode        TRUE if we are in /x mode
919    
920    Returns:       the number of the matching (named or numbered) subpattern, or -1 if not found
921    */
922    
923    static int
924    find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
925      BOOL xmode)
926    {
927    const uschar *thisname;
928    
929    for (; *ptr != 0; ptr++)
930    {    {
931    *minp = min;    int term;
932    *maxp = max;  
933      /* Skip over backslashed characters and also entire \Q...\E */
934    
935      if (*ptr == '\\')
936        {
937        if (*(++ptr) == 0) return -1;
938        if (*ptr == 'Q') for (;;)
939          {
940          while (*(++ptr) != 0 && *ptr != '\\');
941          if (*ptr == 0) return -1;
942          if (*(++ptr) == 'E') break;
943          }
944        continue;
945        }
946    
947      /* Skip over character classes */
948    
949      if (*ptr == '[')
950        {
951        while (*(++ptr) != ']')
952          {
953          if (*ptr == 0) return -1;
954          if (*ptr == '\\')
955            {
956            if (*(++ptr) == 0) return -1;
957            if (*ptr == 'Q') for (;;)
958              {
959              while (*(++ptr) != 0 && *ptr != '\\');
960              if (*ptr == 0) return -1;
961              if (*(++ptr) == 'E') break;
962              }
963            continue;
964            }
965          }
966        continue;
967        }
968    
969      /* Skip comments in /x mode */
970    
971      if (xmode && *ptr == '#')
972        {
973        while (*(++ptr) != 0 && *ptr != '\n');
974        if (*ptr == 0) return -1;
975        continue;
976        }
977    
978      /* An opening parens must now be a real metacharacter */
979    
980      if (*ptr != '(') continue;
981      if (ptr[1] != '?' && ptr[1] != '*')
982        {
983        count++;
984        if (name == NULL && count == lorn) return count;
985        continue;
986        }
987    
988      ptr += 2;
989      if (*ptr == 'P') ptr++;                      /* Allow optional P */
990    
991      /* We have to disambiguate (?<! and (?<= from (?<name> */
992    
993      if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
994           *ptr != '\'')
995        continue;
996    
997      count++;
998    
999      if (name == NULL && count == lorn) return count;
1000      term = *ptr++;
1001      if (term == '<') term = '>';
1002      thisname = ptr;
1003      while (*ptr != term) ptr++;
1004      if (name != NULL && lorn == ptr - thisname &&
1005          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1006        return count;
1007    }    }
1008  return p;  
1009    return -1;
1010  }  }
1011    
1012    
# Line 778  for (;;) Line 1060  for (;;)
1060    
1061      case OP_CALLOUT:      case OP_CALLOUT:
1062      case OP_CREF:      case OP_CREF:
1063      case OP_BRANUMBER:      case OP_RREF:
1064        case OP_DEF:
1065      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1066      break;      break;
1067    
# Line 823  for (;;) Line 1106  for (;;)
1106    {    {
1107    int d;    int d;
1108    register int op = *cc;    register int op = *cc;
   if (op >= OP_BRA) op = OP_BRA;  
1109    
1110    switch (op)    switch (op)
1111      {      {
1112        case OP_CBRA:
1113      case OP_BRA:      case OP_BRA:
1114      case OP_ONCE:      case OP_ONCE:
1115      case OP_COND:      case OP_COND:
1116      d = find_fixedlength(cc, options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1117      if (d < 0) return d;      if (d < 0) return d;
1118      branchlength += d;      branchlength += d;
1119      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 865  for (;;) Line 1148  for (;;)
1148      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1149    
1150      case OP_REVERSE:      case OP_REVERSE:
     case OP_BRANUMBER:  
1151      case OP_CREF:      case OP_CREF:
1152        case OP_RREF:
1153        case OP_DEF:
1154      case OP_OPT:      case OP_OPT:
1155      case OP_CALLOUT:      case OP_CALLOUT:
1156      case OP_SOD:      case OP_SOD:
# Line 884  for (;;) Line 1168  for (;;)
1168    
1169      case OP_CHAR:      case OP_CHAR:
1170      case OP_CHARNC:      case OP_CHARNC:
1171        case OP_NOT:
1172      branchlength++;      branchlength++;
1173      cc += 2;      cc += 2;
1174  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 917  for (;;) Line 1202  for (;;)
1202    
1203      case OP_PROP:      case OP_PROP:
1204      case OP_NOTPROP:      case OP_NOTPROP:
1205      cc++;      cc += 2;
1206      /* Fall through */      /* Fall through */
1207    
1208      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
# Line 998  Returns:      pointer to the opcode for Line 1283  Returns:      pointer to the opcode for
1283  static const uschar *  static const uschar *
1284  find_bracket(const uschar *code, BOOL utf8, int number)  find_bracket(const uschar *code, BOOL utf8, int number)
1285  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1286  for (;;)  for (;;)
1287    {    {
1288    register int c = *code;    register int c = *code;
1289    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1290    else if (c > OP_BRA)  
1291      /* XCLASS is used for classes that cannot be represented just by a bit
1292      map. This includes negated single high-valued characters. The length in
1293      the table is zero; the actual length is stored in the compiled code. */
1294    
1295      if (c == OP_XCLASS) code += GET(code, 1);
1296    
1297      /* Handle capturing bracket */
1298    
1299      else if (c == OP_CBRA)
1300      {      {
1301      int n = c - OP_BRA;      int n = GET2(code, 1+LINK_SIZE);
     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);  
1302      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1303      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[c];
1304      }      }
1305    
1306      /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1307      a multi-byte character. The length in the table is a minimum, so we have to
1308      arrange to skip the extra bytes. */
1309    
1310    else    else
1311      {      {
1312      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
1313  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1314      if (utf8) switch(c)      if (utf8) switch(c)
1315        {        {
1316        case OP_CHAR:        case OP_CHAR:
# Line 1031  for (;;) Line 1318  for (;;)
1318        case OP_EXACT:        case OP_EXACT:
1319        case OP_UPTO:        case OP_UPTO:
1320        case OP_MINUPTO:        case OP_MINUPTO:
1321          case OP_POSUPTO:
1322        case OP_STAR:        case OP_STAR:
1323        case OP_MINSTAR:        case OP_MINSTAR:
1324          case OP_POSSTAR:
1325        case OP_PLUS:        case OP_PLUS:
1326        case OP_MINPLUS:        case OP_MINPLUS:
1327          case OP_POSPLUS:
1328        case OP_QUERY:        case OP_QUERY:
1329        case OP_MINQUERY:        case OP_MINQUERY:
1330        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1331        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1332        break;        break;
1333        }        }
1334  #endif  #endif
# Line 1072  Returns:      pointer to the opcode for Line 1355  Returns:      pointer to the opcode for
1355  static const uschar *  static const uschar *
1356  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const uschar *code, BOOL utf8)
1357  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1358  for (;;)  for (;;)
1359    {    {
1360    register int c = *code;    register int c = *code;
1361    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1362    else if (c == OP_RECURSE) return code;    if (c == OP_RECURSE) return code;
1363    else if (c > OP_BRA)  
1364      {    /* XCLASS is used for classes that cannot be represented just by a bit
1365      code += _pcre_OP_lengths[OP_BRA];    map. This includes negated single high-valued characters. The length in
1366      }    the table is zero; the actual length is stored in the compiled code. */
1367    
1368      if (c == OP_XCLASS) code += GET(code, 1);
1369    
1370      /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1371      that are followed by a character may be followed by a multi-byte character.
1372      The length in the table is a minimum, so we have to arrange to skip the extra
1373      bytes. */
1374    
1375    else    else
1376      {      {
1377      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
1378  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1379      if (utf8) switch(c)      if (utf8) switch(c)
1380        {        {
1381        case OP_CHAR:        case OP_CHAR:
# Line 1103  for (;;) Line 1383  for (;;)
1383        case OP_EXACT:        case OP_EXACT:
1384        case OP_UPTO:        case OP_UPTO:
1385        case OP_MINUPTO:        case OP_MINUPTO:
1386          case OP_POSUPTO:
1387        case OP_STAR:        case OP_STAR:
1388        case OP_MINSTAR:        case OP_MINSTAR:
1389          case OP_POSSTAR:
1390        case OP_PLUS:        case OP_PLUS:
1391        case OP_MINPLUS:        case OP_MINPLUS:
1392          case OP_POSPLUS:
1393        case OP_QUERY:        case OP_QUERY:
1394        case OP_MINQUERY:        case OP_MINQUERY:
1395        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1396        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1397        break;        break;
1398        }        }
1399  #endif  #endif
# Line 1132  for (;;) Line 1408  for (;;)
1408  *************************************************/  *************************************************/
1409    
1410  /* This function scans through a branch of a compiled pattern to see whether it  /* This function scans through a branch of a compiled pattern to see whether it
1411  can match the empty string or not. It is called only from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1412  below. Note that first_significant_code() skips over assertions. If we hit an  below and from compile_branch() when checking for an unlimited repeat of a
1413  unclosed bracket, we return "empty" - this means we've struck an inner bracket  group that can match nothing. Note that first_significant_code() skips over
1414  whose current branch will already have been scanned.  assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1415    struck an inner bracket whose current branch will already have been scanned.
1416    
1417  Arguments:  Arguments:
1418    code        points to start of search    code        points to start of search
# Line 1149  static BOOL Line 1426  static BOOL
1426  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1427  {  {
1428  register int c;  register int c;
1429  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1430       code < endcode;       code < endcode;
1431       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1432    {    {
# Line 1157  for (code = first_significant_code(code Line 1434  for (code = first_significant_code(code
1434    
1435    c = *code;    c = *code;
1436    
1437    if (c >= OP_BRA)    /* Groups with zero repeats can of course be empty; skip them. */
1438    
1439      if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1440        {
1441        code += _pcre_OP_lengths[c];
1442        do code += GET(code, 1); while (*code == OP_ALT);
1443        c = *code;
1444        continue;
1445        }
1446    
1447      /* For other groups, scan the branches. */
1448    
1449      if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1450      {      {
1451      BOOL empty_branch;      BOOL empty_branch;
1452      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1173  for (code = first_significant_code(code Line 1462  for (code = first_significant_code(code
1462        }        }
1463      while (*code == OP_ALT);      while (*code == OP_ALT);
1464      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
     code += 1 + LINK_SIZE;  
1465      c = *code;      c = *code;
1466        continue;
1467      }      }
1468    
1469    else switch (c)    /* Handle the other opcodes */
1470    
1471      switch (c)
1472      {      {
1473      /* Check for quantifiers after a class */      /* Check for quantifiers after a class. XCLASS is used for classes that
1474        cannot be represented just by a bit map. This includes negated single
1475        high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1476        actual length is stored in the compiled code, so we must update "code"
1477        here. */
1478    
1479  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1480      case OP_XCLASS:      case OP_XCLASS:
1481      ccode = code + GET(code, 1);      ccode = code += GET(code, 1);
1482      goto CHECK_CLASS_REPEAT;      goto CHECK_CLASS_REPEAT;
1483  #endif  #endif
1484    
# Line 1233  for (code = first_significant_code(code Line 1528  for (code = first_significant_code(code
1528      case OP_NOT:      case OP_NOT:
1529      case OP_PLUS:      case OP_PLUS:
1530      case OP_MINPLUS:      case OP_MINPLUS:
1531        case OP_POSPLUS:
1532      case OP_EXACT:      case OP_EXACT:
1533      case OP_NOTPLUS:      case OP_NOTPLUS:
1534      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
1535        case OP_NOTPOSPLUS:
1536      case OP_NOTEXACT:      case OP_NOTEXACT:
1537      case OP_TYPEPLUS:      case OP_TYPEPLUS:
1538      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
1539        case OP_TYPEPOSPLUS:
1540      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1541      return FALSE;      return FALSE;
1542    
# Line 1250  for (code = first_significant_code(code Line 1548  for (code = first_significant_code(code
1548      case OP_ALT:      case OP_ALT:
1549      return TRUE;      return TRUE;
1550    
1551      /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1552      followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
1553    
1554  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1555      case OP_STAR:      case OP_STAR:
1556      case OP_MINSTAR:      case OP_MINSTAR:
1557        case OP_POSSTAR:
1558      case OP_QUERY:      case OP_QUERY:
1559      case OP_MINQUERY:      case OP_MINQUERY:
1560        case OP_POSQUERY:
1561      case OP_UPTO:      case OP_UPTO:
1562      case OP_MINUPTO:      case OP_MINUPTO:
1563        case OP_POSUPTO:
1564      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1565      break;      break;
1566  #endif  #endif
# Line 1377  earlier groups that are outside the curr Line 1678  earlier groups that are outside the curr
1678  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1679  it, after it has been compiled. This means that any OP_RECURSE items within it  it, after it has been compiled. This means that any OP_RECURSE items within it
1680  that refer to the group itself or any contained groups have to have their  that refer to the group itself or any contained groups have to have their
1681  offsets adjusted. That is the job of this function. Before it is called, the  offsets adjusted. That is one of the jobs of this function. Before it is called,
1682  partially compiled regex must be temporarily terminated with OP_END.  the partially compiled regex must be temporarily terminated with OP_END.
1683    
1684    This function has been extended with the possibility of forward references for
1685    recursions and subroutine calls. It must also check the list of such references
1686    for the group we are dealing with. If it finds that one of the recursions in
1687    the current group is on this list, it adjusts the offset in the list, not the
1688    value in the reference (which is a group number).
1689    
1690  Arguments:  Arguments:
1691    group      points to the start of the group    group      points to the start of the group
1692    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
1693    utf8       TRUE in UTF-8 mode    utf8       TRUE in UTF-8 mode
1694    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
1695      save_hwm   the hwm forward reference pointer at the start of the group
1696    
1697  Returns:     nothing  Returns:     nothing
1698  */  */
1699    
1700  static void  static void
1701  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1702      uschar *save_hwm)
1703  {  {
1704  uschar *ptr = group;  uschar *ptr = group;
1705  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1706    {    {
1707    int offset = GET(ptr, 1);    int offset;
1708    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);    uschar *hc;
1709    
1710      /* See if this recursion is on the forward reference list. If so, adjust the
1711      reference. */
1712    
1713      for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1714        {
1715        offset = GET(hc, 0);
1716        if (cd->start_code + offset == ptr + 1)
1717          {
1718          PUT(hc, 0, offset + adjust);
1719          break;
1720          }
1721        }
1722    
1723      /* Otherwise, adjust the recursion offset if it's after the start of this
1724      group. */
1725    
1726      if (hc >= cd->hwm)
1727        {
1728        offset = GET(ptr, 1);
1729        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1730        }
1731    
1732    ptr += 1 + LINK_SIZE;    ptr += 1 + LINK_SIZE;
1733    }    }
1734  }  }
# Line 1475  Yield:        TRUE when range returned; Line 1807  Yield:        TRUE when range returned;
1807  */  */
1808    
1809  static BOOL  static BOOL
1810  get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1811      unsigned int *odptr)
1812  {  {
1813  int c, chartype, othercase, next;  unsigned int c, othercase, next;
1814    
1815  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
1816    {    { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
   if (_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0)  
     break;  
   }  
1817    
1818  if (c > d) return FALSE;  if (c > d) return FALSE;
1819    
# Line 1492  next = othercase + 1; Line 1822  next = othercase + 1;
1822    
1823  for (++c; c <= d; c++)  for (++c; c <= d; c++)
1824    {    {
1825    if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_L ||    if (_pcre_ucp_othercase(c) != next) break;
         othercase != next)  
     break;  
1826    next++;    next++;
1827    }    }
1828    
# Line 1506  return TRUE; Line 1834  return TRUE;
1834  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1835    
1836    
1837    
1838  /*************************************************  /*************************************************
1839  *           Compile one branch                   *  *     Check if auto-possessifying is possible    *
1840  *************************************************/  *************************************************/
1841    
1842  /* Scan the pattern, compiling it into the code vector. If the options are  /* This function is called for unlimited repeats of certain items, to see
1843  changed during the branch, the pointer is used to change the external options  whether the next thing could possibly match the repeated item. If not, it makes
1844  bits.  sense to automatically possessify the repeated item.
1845    
1846  Arguments:  Arguments:
1847    optionsptr     pointer to the option bits    op_code       the repeated op code
1848    brackets       points to number of extracting brackets used    item          data for this item, depends on the opcode
1849    codeptr        points to the pointer to the current code point    utf8          TRUE in UTF-8 mode
1850    ptrptr         points to the current pattern pointer    utf8_char     used for utf8 character bytes, NULL if not relevant
1851    errorcodeptr   points to error code variable    ptr           next character in pattern
1852    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    options       options bits
1853    reqbyteptr     set to the last literal character required, else < 0    cd            contains pointers to tables etc.
   bcptr          points to current branch chain  
   cd             contains pointers to tables etc.  
1854    
1855  Returns:         TRUE on success  Returns:        TRUE if possessifying is wanted
                  FALSE, with *errorcodeptr set non-zero on error  
1856  */  */
1857    
1858  static BOOL  static BOOL
1859  compile_branch(int *optionsptr, int *brackets, uschar **codeptr,  check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1860    const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,    const uschar *ptr, int options, compile_data *cd)
   int *reqbyteptr, branch_chain *bcptr, compile_data *cd)  
1861  {  {
1862  int repeat_type, op_type;  int next;
 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  
 int bravalue = 0;  
 int greedy_default, greedy_non_default;  
 int firstbyte, reqbyte;  
 int zeroreqbyte, zerofirstbyte;  
 int req_caseopt, reqvary, tempreqvary;  
 int condcount = 0;  
 int options = *optionsptr;  
 int after_manual_callout = 0;  
 register int c;  
 register uschar *code = *codeptr;  
 uschar *tempcode;  
 BOOL inescq = FALSE;  
 BOOL groupsetfirstbyte = FALSE;  
 const uschar *ptr = *ptrptr;  
 const uschar *tempptr;  
 uschar *previous = NULL;  
 uschar *previous_callout = NULL;  
 uschar classbits[32];  
1863    
1864  #ifdef SUPPORT_UTF8  /* Skip whitespace and comments in extended mode */
 BOOL class_utf8;  
 BOOL utf8 = (options & PCRE_UTF8) != 0;  
 uschar *class_utf8data;  
 uschar utf8_char[6];  
 #else  
 BOOL utf8 = FALSE;  
 #endif  
1865    
1866  /* Set up the default and non-default settings for greediness */  if ((options & PCRE_EXTENDED) != 0)
1867      {
1868      for (;;)
1869        {
1870        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1871        if (*ptr == '#')
1872          {
1873          while (*(++ptr) != 0)
1874            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1875          }
1876        else break;
1877        }
1878      }
1879    
1880  greedy_default = ((options & PCRE_UNGREEDY) != 0);  /* If the next item is one that we can handle, get its value. A non-negative
1881  greedy_non_default = greedy_default ^ 1;  value is a character, a negative value is an escape value. */
1882    
1883  /* Initialize no first byte, no required byte. REQ_UNSET means "no char  if (*ptr == '\\')
1884  matching encountered yet". It gets changed to REQ_NONE if we hit something that    {
1885  matches a non-fixed char first char; reqbyte just remains unset if we never    int temperrorcode = 0;
1886  find one.    next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1887      if (temperrorcode != 0) return FALSE;
1888      ptr++;    /* Point after the escape sequence */
1889      }
1890    
1891  When we hit a repeat whose minimum is zero, we may have to adjust these values  else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1892  to take the zero repeat into account. This is implemented by setting them to    {
1893  zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual  #ifdef SUPPORT_UTF8
1894  item types that can be repeated set these backoff variables appropriately. */    if (utf8) { GETCHARINC(next, ptr); } else
1895    #endif
1896      next = *ptr++;
1897      }
1898    
1899  firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;  else return FALSE;
1900    
1901  /* The variable req_caseopt contains either the REQ_CASELESS value or zero,  /* Skip whitespace and comments in extended mode */
1902  according to the current setting of the caseless flag. REQ_CASELESS is a bit  
1903  value > 255. It is added into the firstbyte or reqbyte variables to record the  if ((options & PCRE_EXTENDED) != 0)
1904      {
1905      for (;;)
1906        {
1907        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1908        if (*ptr == '#')
1909          {
1910          while (*(++ptr) != 0)
1911            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1912          }
1913        else break;
1914        }
1915      }
1916    
1917    /* If the next thing is itself optional, we have to give up. */
1918    
1919    if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1920      return FALSE;
1921    
1922    /* Now compare the next item with the previous opcode. If the previous is a
1923    positive single character match, "item" either contains the character or, if
1924    "item" is greater than 127 in utf8 mode, the character's bytes are in
1925    utf8_char. */
1926    
1927    
1928    /* Handle cases when the next item is a character. */
1929    
1930    if (next >= 0) switch(op_code)
1931      {
1932      case OP_CHAR:
1933    #ifdef SUPPORT_UTF8
1934      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1935    #endif
1936      return item != next;
1937    
1938      /* For CHARNC (caseless character) we must check the other case. If we have
1939      Unicode property support, we can use it to test the other case of
1940      high-valued characters. */
1941    
1942      case OP_CHARNC:
1943    #ifdef SUPPORT_UTF8
1944      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1945    #endif
1946      if (item == next) return FALSE;
1947    #ifdef SUPPORT_UTF8
1948      if (utf8)
1949        {
1950        unsigned int othercase;
1951        if (next < 128) othercase = cd->fcc[next]; else
1952    #ifdef SUPPORT_UCP
1953        othercase = _pcre_ucp_othercase((unsigned int)next);
1954    #else
1955        othercase = NOTACHAR;
1956    #endif
1957        return (unsigned int)item != othercase;
1958        }
1959      else
1960    #endif  /* SUPPORT_UTF8 */
1961      return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
1962    
1963      /* For OP_NOT, "item" must be a single-byte character. */
1964    
1965      case OP_NOT:
1966      if (next < 0) return FALSE;  /* Not a character */
1967      if (item == next) return TRUE;
1968      if ((options & PCRE_CASELESS) == 0) return FALSE;
1969    #ifdef SUPPORT_UTF8
1970      if (utf8)
1971        {
1972        unsigned int othercase;
1973        if (next < 128) othercase = cd->fcc[next]; else
1974    #ifdef SUPPORT_UCP
1975        othercase = _pcre_ucp_othercase(next);
1976    #else
1977        othercase = NOTACHAR;
1978    #endif
1979        return (unsigned int)item == othercase;
1980        }
1981      else
1982    #endif  /* SUPPORT_UTF8 */
1983      return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
1984    
1985      case OP_DIGIT:
1986      return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1987    
1988      case OP_NOT_DIGIT:
1989      return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1990    
1991      case OP_WHITESPACE:
1992      return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1993    
1994      case OP_NOT_WHITESPACE:
1995      return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1996    
1997      case OP_WORDCHAR:
1998      return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1999    
2000      case OP_NOT_WORDCHAR:
2001      return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2002    
2003      case OP_HSPACE:
2004      case OP_NOT_HSPACE:
2005      switch(next)
2006        {
2007        case 0x09:
2008        case 0x20:
2009        case 0xa0:
2010        case 0x1680:
2011        case 0x180e:
2012        case 0x2000:
2013        case 0x2001:
2014        case 0x2002:
2015        case 0x2003:
2016        case 0x2004:
2017        case 0x2005:
2018        case 0x2006:
2019        case 0x2007:
2020        case 0x2008:
2021        case 0x2009:
2022        case 0x200A:
2023        case 0x202f:
2024        case 0x205f:
2025        case 0x3000:
2026        return op_code != OP_HSPACE;
2027        default:
2028        return op_code == OP_HSPACE;
2029        }
2030    
2031      case OP_VSPACE:
2032      case OP_NOT_VSPACE:
2033      switch(next)
2034        {
2035        case 0x0a:
2036        case 0x0b:
2037        case 0x0c:
2038        case 0x0d:
2039        case 0x85:
2040        case 0x2028:
2041        case 0x2029:
2042        return op_code != OP_VSPACE;
2043        default:
2044        return op_code == OP_VSPACE;
2045        }
2046    
2047      default:
2048      return FALSE;
2049      }
2050    
2051    
2052    /* Handle the case when the next item is \d, \s, etc. */
2053    
2054    switch(op_code)
2055      {
2056      case OP_CHAR:
2057      case OP_CHARNC:
2058    #ifdef SUPPORT_UTF8
2059      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2060    #endif
2061      switch(-next)
2062        {
2063        case ESC_d:
2064        return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2065    
2066        case ESC_D:
2067        return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2068    
2069        case ESC_s:
2070        return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2071    
2072        case ESC_S:
2073        return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2074    
2075        case ESC_w:
2076        return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2077    
2078        case ESC_W:
2079        return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2080    
2081        case ESC_h:
2082        case ESC_H:
2083        switch(item)
2084          {
2085          case 0x09:
2086          case 0x20:
2087          case 0xa0:
2088          case 0x1680:
2089          case 0x180e:
2090          case 0x2000:
2091          case 0x2001:
2092          case 0x2002:
2093          case 0x2003:
2094          case 0x2004:
2095          case 0x2005:
2096          case 0x2006:
2097          case 0x2007:
2098          case 0x2008:
2099          case 0x2009:
2100          case 0x200A:
2101          case 0x202f:
2102          case 0x205f:
2103          case 0x3000:
2104          return -next != ESC_h;
2105          default:
2106          return -next == ESC_h;
2107          }
2108    
2109        case ESC_v:
2110        case ESC_V:
2111        switch(item)
2112          {
2113          case 0x0a:
2114          case 0x0b:
2115          case 0x0c:
2116          case 0x0d:
2117          case 0x85:
2118          case 0x2028:
2119          case 0x2029:
2120          return -next != ESC_v;
2121          default:
2122          return -next == ESC_v;
2123          }
2124    
2125        default:
2126        return FALSE;
2127        }
2128    
2129      case OP_DIGIT:
2130      return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2131             next == -ESC_h || next == -ESC_v;
2132    
2133      case OP_NOT_DIGIT:
2134      return next == -ESC_d;
2135    
2136      case OP_WHITESPACE:
2137      return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2138    
2139      case OP_NOT_WHITESPACE:
2140      return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2141    
2142      case OP_HSPACE:
2143      return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2144    
2145      case OP_NOT_HSPACE:
2146      return next == -ESC_h;
2147    
2148      /* Can't have \S in here because VT matches \S (Perl anomaly) */
2149      case OP_VSPACE:
2150      return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2151    
2152      case OP_NOT_VSPACE:
2153      return next == -ESC_v;
2154    
2155      case OP_WORDCHAR:
2156      return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2157    
2158      case OP_NOT_WORDCHAR:
2159      return next == -ESC_w || next == -ESC_d;
2160    
2161      default:
2162      return FALSE;
2163      }
2164    
2165    /* Control does not reach here */
2166    }
2167    
2168    
2169    
2170    /*************************************************
2171    *           Compile one branch                   *
2172    *************************************************/
2173    
2174    /* Scan the pattern, compiling it into the code vector. If the options are
2175    changed during the branch, the pointer is used to change the external options
2176    bits. This function is used during the pre-compile phase when we are trying
2177    to find out the amount of memory needed, as well as during the real compile
2178    phase. The value of lengthptr distinguishes the two phases.
2179    
2180    Arguments:
2181      optionsptr     pointer to the option bits
2182      codeptr        points to the pointer to the current code point
2183      ptrptr         points to the current pattern pointer
2184      errorcodeptr   points to error code variable
2185      firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2186      reqbyteptr     set to the last literal character required, else < 0
2187      bcptr          points to current branch chain
2188      cd             contains pointers to tables etc.
2189      lengthptr      NULL during the real compile phase
2190                     points to length accumulator during pre-compile phase
2191    
2192    Returns:         TRUE on success
2193                     FALSE, with *errorcodeptr set non-zero on error
2194    */
2195    
2196    static BOOL
2197    compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2198      int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2199      compile_data *cd, int *lengthptr)
2200    {
2201    int repeat_type, op_type;
2202    int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2203    int bravalue = 0;
2204    int greedy_default, greedy_non_default;
2205    int firstbyte, reqbyte;
2206    int zeroreqbyte, zerofirstbyte;
2207    int req_caseopt, reqvary, tempreqvary;
2208    int options = *optionsptr;
2209    int after_manual_callout = 0;
2210    int length_prevgroup = 0;
2211    register int c;
2212    register uschar *code = *codeptr;
2213    uschar *last_code = code;
2214    uschar *orig_code = code;
2215    uschar *tempcode;
2216    BOOL inescq = FALSE;
2217    BOOL groupsetfirstbyte = FALSE;
2218    const uschar *ptr = *ptrptr;
2219    const uschar *tempptr;
2220    uschar *previous = NULL;
2221    uschar *previous_callout = NULL;
2222    uschar *save_hwm = NULL;
2223    uschar classbits[32];
2224    
2225    #ifdef SUPPORT_UTF8
2226    BOOL class_utf8;
2227    BOOL utf8 = (options & PCRE_UTF8) != 0;
2228    uschar *class_utf8data;
2229    uschar utf8_char[6];
2230    #else
2231    BOOL utf8 = FALSE;
2232    uschar *utf8_char = NULL;
2233    #endif
2234    
2235    #ifdef DEBUG
2236    if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2237    #endif
2238    
2239    /* Set up the default and non-default settings for greediness */
2240    
2241    greedy_default = ((options & PCRE_UNGREEDY) != 0);
2242    greedy_non_default = greedy_default ^ 1;
2243    
2244    /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2245    matching encountered yet". It gets changed to REQ_NONE if we hit something that
2246    matches a non-fixed char first char; reqbyte just remains unset if we never
2247    find one.
2248    
2249    When we hit a repeat whose minimum is zero, we may have to adjust these values
2250    to take the zero repeat into account. This is implemented by setting them to
2251    zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2252    item types that can be repeated set these backoff variables appropriately. */
2253    
2254    firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2255    
2256    /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2257    according to the current setting of the caseless flag. REQ_CASELESS is a bit
2258    value > 255. It is added into the firstbyte or reqbyte variables to record the
2259  case status of the value. This is used only for ASCII characters. */  case status of the value. This is used only for ASCII characters. */
2260    
2261  req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;  req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
# Line 1595  for (;; ptr++) Line 2267  for (;; ptr++)
2267    BOOL negate_class;    BOOL negate_class;
2268    BOOL possessive_quantifier;    BOOL possessive_quantifier;
2269    BOOL is_quantifier;    BOOL is_quantifier;
2270      BOOL is_recurse;
2271      BOOL reset_bracount;
2272    int class_charcount;    int class_charcount;
2273    int class_lastchar;    int class_lastchar;
2274    int newoptions;    int newoptions;
2275    int recno;    int recno;
2276      int refsign;
2277    int skipbytes;    int skipbytes;
2278    int subreqbyte;    int subreqbyte;
2279    int subfirstbyte;    int subfirstbyte;
2280      int terminator;
2281    int mclength;    int mclength;
2282    uschar mcbuffer[8];    uschar mcbuffer[8];
2283    
2284    /* Next byte in the pattern */    /* Get next byte in the pattern */
2285    
2286    c = *ptr;    c = *ptr;
2287    
2288      /* If we are in the pre-compile phase, accumulate the length used for the
2289      previous cycle of this loop. */
2290    
2291      if (lengthptr != NULL)
2292        {
2293    #ifdef DEBUG
2294        if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2295    #endif
2296        if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2297          {
2298          *errorcodeptr = ERR52;
2299          goto FAILED;
2300          }
2301    
2302        /* There is at least one situation where code goes backwards: this is the
2303        case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2304        the class is simply eliminated. However, it is created first, so we have to
2305        allow memory for it. Therefore, don't ever reduce the length at this point.
2306        */
2307    
2308        if (code < last_code) code = last_code;
2309    
2310        /* Paranoid check for integer overflow */
2311    
2312        if (OFLOW_MAX - *lengthptr < code - last_code)
2313          {
2314          *errorcodeptr = ERR20;
2315          goto FAILED;
2316          }
2317    
2318        *lengthptr += code - last_code;
2319        DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2320    
2321        /* If "previous" is set and it is not at the start of the work space, move
2322        it back to there, in order to avoid filling up the work space. Otherwise,
2323        if "previous" is NULL, reset the current code pointer to the start. */
2324    
2325        if (previous != NULL)
2326          {
2327          if (previous > orig_code)
2328            {
2329            memmove(orig_code, previous, code - previous);
2330            code -= previous - orig_code;
2331            previous = orig_code;
2332            }
2333          }
2334        else code = orig_code;
2335    
2336        /* Remember where this code item starts so we can pick up the length
2337        next time round. */
2338    
2339        last_code = code;
2340        }
2341    
2342      /* In the real compile phase, just check the workspace used by the forward
2343      reference list. */
2344    
2345      else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2346        {
2347        *errorcodeptr = ERR52;
2348        goto FAILED;
2349        }
2350    
2351    /* If in \Q...\E, check for the end; if not, we have a literal */    /* If in \Q...\E, check for the end; if not, we have a literal */
2352    
2353    if (inescq && c != 0)    if (inescq && c != 0)
# Line 1623  for (;; ptr++) Line 2362  for (;; ptr++)
2362        {        {
2363        if (previous_callout != NULL)        if (previous_callout != NULL)
2364          {          {
2365          complete_callout(previous_callout, ptr, cd);          if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2366              complete_callout(previous_callout, ptr, cd);
2367          previous_callout = NULL;          previous_callout = NULL;
2368          }          }
2369        if ((options & PCRE_AUTO_CALLOUT) != 0)        if ((options & PCRE_AUTO_CALLOUT) != 0)
# Line 1644  for (;; ptr++) Line 2384  for (;; ptr++)
2384    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2385         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
2386      {      {
2387      complete_callout(previous_callout, ptr, cd);      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2388          complete_callout(previous_callout, ptr, cd);
2389      previous_callout = NULL;      previous_callout = NULL;
2390      }      }
2391    
# Line 1655  for (;; ptr++) Line 2396  for (;; ptr++)
2396      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2397      if (c == '#')      if (c == '#')
2398        {        {
2399        /* The space before the ; is to avoid a warning on a silly compiler        while (*(++ptr) != 0)
2400        on the Macintosh. */          {
2401        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2402        if (c != 0) continue;   /* Else fall through to handle end of string */          }
2403          if (*ptr != 0) continue;
2404    
2405          /* Else fall through to handle end of string */
2406          c = 0;
2407        }        }
2408      }      }
2409    
# Line 1672  for (;; ptr++) Line 2417  for (;; ptr++)
2417    
2418    switch(c)    switch(c)
2419      {      {
2420      /* The branch terminates at end of string, |, or ). */      /* ===================================================================*/
2421        case 0:                        /* The branch terminates at string end */
2422      case 0:      case '|':                      /* or | or ) */
     case '|':  
2423      case ')':      case ')':
2424      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2425      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2426      *codeptr = code;      *codeptr = code;
2427      *ptrptr = ptr;      *ptrptr = ptr;
2428        if (lengthptr != NULL)
2429          {
2430          if (OFLOW_MAX - *lengthptr < code - last_code)
2431            {
2432            *errorcodeptr = ERR20;
2433            goto FAILED;
2434            }
2435          *lengthptr += code - last_code;   /* To include callout length */
2436          DPRINTF((">> end branch\n"));
2437          }
2438      return TRUE;      return TRUE;
2439    
2440    
2441        /* ===================================================================*/
2442      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2443      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2444    
# Line 1711  for (;; ptr++) Line 2467  for (;; ptr++)
2467      *code++ = OP_ANY;      *code++ = OP_ANY;
2468      break;      break;
2469    
2470      /* Character classes. If the included characters are all < 255 in value, we  
2471      build a 32-byte bitmap of the permitted characters, except in the special      /* ===================================================================*/
2472      case where there is only one such character. For negated classes, we build      /* Character classes. If the included characters are all < 256, we build a
2473      the map as usual, then invert it at the end. However, we use a different      32-byte bitmap of the permitted characters, except in the special case
2474      opcode so that data characters > 255 can be handled correctly.      where there is only one such character. For negated classes, we build the
2475        map as usual, then invert it at the end. However, we use a different opcode
2476        so that data characters > 255 can be handled correctly.
2477    
2478      If the class contains characters outside the 0-255 range, a different      If the class contains characters outside the 0-255 range, a different
2479      opcode is compiled. It may optionally have a bit map for characters < 256,      opcode is compiled. It may optionally have a bit map for characters < 256,
# Line 1736  for (;; ptr++) Line 2494  for (;; ptr++)
2494        goto FAILED;        goto FAILED;
2495        }        }
2496    
2497      /* If the first character is '^', set the negation flag and skip it. */      /* If the first character is '^', set the negation flag and skip it. Also,
2498        if the first few characters (either before or after ^) are \Q\E or \E we
2499        skip them too. This makes for compatibility with Perl. */
2500    
2501      if ((c = *(++ptr)) == '^')      negate_class = FALSE;
2502        for (;;)
2503        {        {
       negate_class = TRUE;  
2504        c = *(++ptr);        c = *(++ptr);
2505        }        if (c == '\\')
2506      else          {
2507        {          if (ptr[1] == 'E') ptr++;
2508        negate_class = FALSE;            else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2509                else break;
2510            }
2511          else if (!negate_class && c == '^')
2512            negate_class = TRUE;
2513          else break;
2514        }        }
2515    
2516      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
2517      of just a single character (as long as it's < 256). For higher valued UTF-8      of just a single character (as long as it's < 256). However, for higher
2518      characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
2519    
2520      class_charcount = 0;      class_charcount = 0;
2521      class_lastchar = -1;      class_lastchar = -1;
2522    
2523        /* Initialize the 32-char bit map to all zeros. We build the map in a
2524        temporary bit of memory, in case the class contains only 1 character (less
2525        than 256), because in that case the compiled code doesn't use the bit map.
2526        */
2527    
2528        memset(classbits, 0, 32 * sizeof(uschar));
2529    
2530  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2531      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2532      class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2533  #endif  #endif
2534    
     /* Initialize the 32-char bit map to all zeros. We have to build the  
     map in a temporary bit of store, in case the class contains only 1  
     character (< 256), because in that case the compiled code doesn't use the  
     bit map. */  
   
     memset(classbits, 0, 32 * sizeof(uschar));  
   
2535      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
2536      means that an initial ] is taken as a data character. The first pass      means that an initial ] is taken as a data character. At the start of the
2537      through the regex checked the overall syntax, so we don't need to be very      loop, c contains the first byte of the character. */
     strict here. At the start of the loop, c contains the first byte of the  
     character. */  
2538    
2539      do      if (c != 0) do
2540        {        {
2541          const uschar *oldptr;
2542    
2543  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2544        if (utf8 && c > 127)        if (utf8 && c > 127)
2545          {                           /* Braces are required because the */          {                           /* Braces are required because the */
# Line 1786  for (;; ptr++) Line 2551  for (;; ptr++)
2551    
2552        if (inescq)        if (inescq)
2553          {          {
2554          if (c == '\\' && ptr[1] == 'E')          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2555            {            {
2556            inescq = FALSE;            inescq = FALSE;                   /* Reset literal state */
2557            ptr++;            ptr++;                            /* Skip the 'E' */
2558            continue;            continue;                         /* Carry on with next */
2559            }            }
2560          else goto LONE_SINGLE_CHARACTER;          goto CHECK_RANGE;                   /* Could be range if \E follows */
2561          }          }
2562    
2563        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
# Line 1806  for (;; ptr++) Line 2571  for (;; ptr++)
2571            check_posix_syntax(ptr, &tempptr, cd))            check_posix_syntax(ptr, &tempptr, cd))
2572          {          {
2573          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
2574          int posix_class, i;          int posix_class, taboffset, tabopt;
2575          register const uschar *cbits = cd->cbits;          register const uschar *cbits = cd->cbits;
2576            uschar pbits[32];
2577    
2578          if (ptr[1] != ':')          if (ptr[1] != ':')
2579            {            {
# Line 1836  for (;; ptr++) Line 2602  for (;; ptr++)
2602          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2603            posix_class = 0;            posix_class = 0;
2604    
2605          /* Or into the map we are building up to 3 of the static class          /* We build the bit map for the POSIX class in a chunk of local store
2606          tables, or their negations. The [:blank:] class sets up the same          because we may be adding and subtracting from it, and we don't want to
2607          chars as the [:space:] class (all white space). We remove the vertical          subtract bits that may be in the main map already. At the end we or the
2608          white space chars afterwards. */          result into the bit map that is being built. */
2609    
2610          posix_class *= 3;          posix_class *= 3;
2611          for (i = 0; i < 3; i++)  
2612            /* Copy in the first table (always present) */
2613    
2614            memcpy(pbits, cbits + posix_class_maps[posix_class],
2615              32 * sizeof(uschar));
2616    
2617            /* If there is a second table, add or remove it as required. */
2618    
2619            taboffset = posix_class_maps[posix_class + 1];
2620            tabopt = posix_class_maps[posix_class + 2];
2621    
2622            if (taboffset >= 0)
2623            {            {
2624            BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;            if (tabopt >= 0)
2625            int taboffset = posix_class_maps[posix_class + i];              for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
           if (taboffset < 0) break;  
           if (local_negate)  
             {  
             if (i == 0)  
               for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];  
             else  
               for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];  
             if (blankclass) classbits[1] |= 0x3c;  
             }  
2626            else            else
2627              {              for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];  
             if (blankclass) classbits[1] &= ~0x3c;  
             }  
2628            }            }
2629    
2630            /* Now see if we need to remove any special characters. An option
2631            value of 1 removes vertical space and 2 removes underscore. */
2632    
2633            if (tabopt < 0) tabopt = -tabopt;
2634            if (tabopt == 1) pbits[1] &= ~0x3c;
2635              else if (tabopt == 2) pbits[11] &= 0x7f;
2636    
2637            /* Add the POSIX table or its complement into the main table that is
2638            being built and we are done. */
2639    
2640            if (local_negate)
2641              for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2642            else
2643              for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2644    
2645          ptr = tempptr + 1;          ptr = tempptr + 1;
2646          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
2647          continue;    /* End of POSIX syntax handling */          continue;    /* End of POSIX syntax handling */
2648          }          }
2649    
2650        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
2651        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. The sequence \b is a special
2652        validity in the pre-compiling pass. The sequence \b is a special case.        case. Inside a class (and only there) it is treated as backspace.
2653        Inside a class (and only there) it is treated as backspace. Elsewhere        Elsewhere it marks a word boundary. Other escapes have preset maps ready
2654        it marks a word boundary. Other escapes have preset maps ready to        to 'or' into the one we are building. We assume they have more than one
       or into the one we are building. We assume they have more than one  
2655        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
2656    
2657        if (c == '\\')        if (c == '\\')
2658          {          {
2659          c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2660            if (*errorcodeptr != 0) goto FAILED;
2661    
2662          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2663          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2664            else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2665          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
2666            {            {
2667            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == '\\' && ptr[2] == 'E')
# Line 1895  for (;; ptr++) Line 2676  for (;; ptr++)
2676            {            {
2677            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
2678            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
2679            switch (-c)  
2680              /* Save time by not doing this in the pre-compile phase. */
2681    
2682              if (lengthptr == NULL) switch (-c)
2683              {              {
2684              case ESC_d:              case ESC_d:
2685              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
# Line 1923  for (;; ptr++) Line 2707  for (;; ptr++)
2707              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2708              continue;              continue;
2709    
2710  #ifdef SUPPORT_UCP              case ESC_E: /* Perl ignores an orphan \E */
             case ESC_p:  
             case ESC_P:  
               {  
               BOOL negated;  
               int property = get_ucp(&ptr, &negated, errorcodeptr);  
               if (property < 0) goto FAILED;  
               class_utf8 = TRUE;  
               *class_utf8data++ = ((-c == ESC_p) != negated)?  
                 XCL_PROP : XCL_NOTPROP;  
               *class_utf8data++ = property;  
               class_charcount -= 2;   /* Not a < 256 character */  
               }  
2711              continue;              continue;
 #endif  
2712    
2713              /* Unrecognized escapes are faulted if PCRE is running in its              default:    /* Not recognized; fall through */
2714              strict mode. By default, for compatibility with Perl, they are              break;      /* Need "default" setting to stop compiler warning. */
             treated as literals. */  
   
             default:  
             if ((options & PCRE_EXTRA) != 0)  
               {  
               *errorcodeptr = ERR7;  
               goto FAILED;  
               }  
             c = *ptr;              /* The final character */  
             class_charcount -= 2;  /* Undo the default count from above */  
2715              }              }
           }  
   
         /* Fall through if we have a single character (c >= 0). This may be  
         > 256 in UTF-8 mode. */  
2716    
2717          }   /* End of backslash handling */            /* In the pre-compile phase, just do the recognition. */
2718    
2719        /* A single character may be followed by '-' to form a range. However,            else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2720        Perl does not permit ']' to be the end of the range. A '-' character                     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
       here is treated as a literal. */  
2721    
2722        if (ptr[1] == '-' && ptr[2] != ']')            /* We need to deal with \H, \h, \V, and \v in both phases because
2723          {            they use extra memory. */
2724          int d;  
2725          ptr += 2;            if (-c == ESC_h)
2726                {
2727                SETBIT(classbits, 0x09); /* HT */
2728                SETBIT(classbits, 0x20); /* SPACE */
2729                SETBIT(classbits, 0xa0); /* NBSP */
2730    #ifdef SUPPORT_UTF8
2731                if (utf8)
2732                  {
2733                  class_utf8 = TRUE;
2734                  *class_utf8data++ = XCL_SINGLE;
2735                  class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2736                  *class_utf8data++ = XCL_SINGLE;
2737                  class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2738                  *class_utf8data++ = XCL_RANGE;
2739                  class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2740                  class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2741                  *class_utf8data++ = XCL_SINGLE;
2742                  class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2743                  *class_utf8data++ = XCL_SINGLE;
2744                  class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2745                  *class_utf8data++ = XCL_SINGLE;
2746                  class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2747                  }
2748    #endif
2749                continue;
2750                }
2751    
2752              if (-c == ESC_H)
2753                {
2754                for (c = 0; c < 32; c++)
2755                  {
2756                  int x = 0xff;
2757                  switch (c)
2758                    {
2759                    case 0x09/8: x ^= 1 << (0x09%8); break;
2760                    case 0x20/8: x ^= 1 << (0x20%8); break;
2761                    case 0xa0/8: x ^= 1 << (0xa0%8); break;
2762                    default: break;
2763                    }
2764                  classbits[c] |= x;
2765                  }
2766    
2767    #ifdef SUPPORT_UTF8
2768                if (utf8)
2769                  {
2770                  class_utf8 = TRUE;
2771                  *class_utf8data++ = XCL_RANGE;
2772                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2773                  class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2774                  *class_utf8data++ = XCL_RANGE;
2775                  class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2776                  class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2777                  *class_utf8data++ = XCL_RANGE;
2778                  class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2779                  class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2780                  *class_utf8data++ = XCL_RANGE;
2781                  class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2782                  class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2783                  *class_utf8data++ = XCL_RANGE;
2784                  class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2785                  class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2786                  *class_utf8data++ = XCL_RANGE;
2787                  class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2788                  class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2789                  *class_utf8data++ = XCL_RANGE;
2790                  class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2791                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2792                  }
2793    #endif
2794                continue;
2795                }
2796    
2797              if (-c == ESC_v)
2798                {
2799                SETBIT(classbits, 0x0a); /* LF */
2800                SETBIT(classbits, 0x0b); /* VT */
2801                SETBIT(classbits, 0x0c); /* FF */
2802                SETBIT(classbits, 0x0d); /* CR */
2803                SETBIT(classbits, 0x85); /* NEL */
2804    #ifdef SUPPORT_UTF8
2805                if (utf8)
2806                  {
2807                  class_utf8 = TRUE;
2808                  *class_utf8data++ = XCL_RANGE;
2809                  class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2810                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2811                  }
2812    #endif
2813                continue;
2814                }
2815    
2816              if (-c == ESC_V)
2817                {
2818                for (c = 0; c < 32; c++)
2819                  {
2820                  int x = 0xff;
2821                  switch (c)
2822                    {
2823                    case 0x0a/8: x ^= 1 << (0x0a%8);
2824                                 x ^= 1 << (0x0b%8);
2825                                 x ^= 1 << (0x0c%8);
2826                                 x ^= 1 << (0x0d%8);
2827                                 break;
2828                    case 0x85/8: x ^= 1 << (0x85%8); break;
2829                    default: break;
2830                    }
2831                  classbits[c] |= x;
2832                  }
2833    
2834    #ifdef SUPPORT_UTF8
2835                if (utf8)
2836                  {
2837                  class_utf8 = TRUE;
2838                  *class_utf8data++ = XCL_RANGE;
2839                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2840                  class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2841                  *class_utf8data++ = XCL_RANGE;
2842                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2843                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2844                  }
2845    #endif
2846                continue;
2847                }
2848    
2849              /* We need to deal with \P and \p in both phases. */
2850    
2851    #ifdef SUPPORT_UCP
2852              if (-c == ESC_p || -c == ESC_P)
2853                {
2854                BOOL negated;
2855                int pdata;
2856                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2857                if (ptype < 0) goto FAILED;
2858                class_utf8 = TRUE;
2859                *class_utf8data++ = ((-c == ESC_p) != negated)?
2860                  XCL_PROP : XCL_NOTPROP;
2861                *class_utf8data++ = ptype;
2862                *class_utf8data++ = pdata;
2863                class_charcount -= 2;   /* Not a < 256 character */
2864                continue;
2865                }
2866    #endif
2867              /* Unrecognized escapes are faulted if PCRE is running in its
2868              strict mode. By default, for compatibility with Perl, they are
2869              treated as literals. */
2870    
2871              if ((options & PCRE_EXTRA) != 0)
2872                {
2873                *errorcodeptr = ERR7;
2874                goto FAILED;
2875                }
2876    
2877              class_charcount -= 2;  /* Undo the default count from above */
2878              c = *ptr;              /* Get the final character and fall through */
2879              }
2880    
2881            /* Fall through if we have a single character (c >= 0). This may be
2882            greater than 256 in UTF-8 mode. */
2883    
2884            }   /* End of backslash handling */
2885    
2886          /* A single character may be followed by '-' to form a range. However,
2887          Perl does not permit ']' to be the end of the range. A '-' character
2888          at the end is treated as a literal. Perl ignores orphaned \E sequences
2889          entirely. The code for handling \Q and \E is messy. */
2890    
2891          CHECK_RANGE:
2892          while (ptr[1] == '\\' && ptr[2] == 'E')
2893            {
2894            inescq = FALSE;
2895            ptr += 2;
2896            }
2897    
2898          oldptr = ptr;
2899    
2900          if (!inescq && ptr[1] == '-')
2901            {
2902            int d;
2903            ptr += 2;
2904            while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2905    
2906            /* If we hit \Q (not followed by \E) at this point, go into escaped
2907            mode. */
2908    
2909            while (*ptr == '\\' && ptr[1] == 'Q')
2910              {
2911              ptr += 2;
2912              if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2913              inescq = TRUE;
2914              break;
2915              }
2916    
2917            if (*ptr == 0 || (!inescq && *ptr == ']'))
2918              {
2919              ptr = oldptr;
2920              goto LONE_SINGLE_CHARACTER;
2921              }
2922    
2923  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2924          if (utf8)          if (utf8)
# Line 1981  for (;; ptr++) Line 2933  for (;; ptr++)
2933          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2934          in such circumstances. */          in such circumstances. */
2935    
2936          if (d == '\\')          if (!inescq && d == '\\')
2937            {            {
2938            const uschar *oldptr = ptr;            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2939            d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);            if (*errorcodeptr != 0) goto FAILED;
2940    
2941            /* \b is backspace; \X is literal X; any other special means the '-'            /* \b is backspace; \X is literal X; \R is literal R; any other
2942            was literal */            special means the '-' was literal */
2943    
2944            if (d < 0)            if (d < 0)
2945              {              {
2946              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = '\b';
2947              else if (d == -ESC_X) d = 'X'; else              else if (d == -ESC_X) d = 'X';
2948                else if (d == -ESC_R) d = 'R'; else
2949                {                {
2950                ptr = oldptr - 2;                ptr = oldptr;
2951                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2952                }                }
2953              }              }
2954            }            }
2955    
2956          /* The check that the two values are in the correct order happens in          /* Check that the two values are in the correct order. Optimize
2957          the pre-pass. Optimize one-character ranges */          one-character ranges */
2958    
2959            if (d < c)
2960              {
2961              *errorcodeptr = ERR8;
2962              goto FAILED;
2963              }
2964    
2965          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2966    
# Line 2022  for (;; ptr++) Line 2981  for (;; ptr++)
2981  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2982            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
2983              {              {
2984              int occ, ocd;              unsigned int occ, ocd;
2985              int cc = c;              unsigned int cc = c;
2986              int origd = d;              unsigned int origd = d;
2987              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
2988                {                {
2989                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= (unsigned int)c &&
2990                      ocd <= (unsigned int)d)
2991                    continue;                          /* Skip embedded ranges */
2992    
2993                if (occ < c  && ocd >= c - 1)        /* Extend the basic range */                if (occ < (unsigned int)c  &&
2994                      ocd >= (unsigned int)c - 1)      /* Extend the basic range */
2995                  {                                  /* if there is overlap,   */                  {                                  /* if there is overlap,   */
2996                  c = occ;                           /* noting that if occ < c */                  c = occ;                           /* noting that if occ < c */
2997                  continue;                          /* we can't have ocd > d  */                  continue;                          /* we can't have ocd > d  */
2998                  }                                  /* because a subrange is  */                  }                                  /* because a subrange is  */
2999                if (ocd > d && occ <= d + 1)         /* always shorter than    */                if (ocd > (unsigned int)d &&
3000                      occ <= (unsigned int)d + 1)      /* always shorter than    */
3001                  {                                  /* the basic range.       */                  {                                  /* the basic range.       */
3002                  d = ocd;                  d = ocd;
3003                  continue;                  continue;
# Line 2082  for (;; ptr++) Line 3045  for (;; ptr++)
3045          ranges that lie entirely within 0-127 when there is UCP support; else          ranges that lie entirely within 0-127 when there is UCP support; else
3046          for partial ranges without UCP support. */          for partial ranges without UCP support. */
3047    
3048          for (; c <= d; c++)          class_charcount += d - c + 1;
3049            class_lastchar = d;
3050    
3051            /* We can save a bit of time by skipping this in the pre-compile. */
3052    
3053            if (lengthptr == NULL) for (; c <= d; c++)
3054            {            {
3055            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
3056            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
# Line 2090  for (;; ptr++) Line 3058  for (;; ptr++)
3058              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
3059              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
3060              }              }
           class_charcount++;                /* in case a one-char range */  
           class_lastchar = c;  
3061            }            }
3062    
3063          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
# Line 2115  for (;; ptr++) Line 3081  for (;; ptr++)
3081  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3082          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
3083            {            {
3084            int chartype;            unsigned int othercase;
3085            int othercase;            if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
           if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 &&  
                othercase > 0)  
3086              {              {
3087              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
3088              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 2143  for (;; ptr++) Line 3107  for (;; ptr++)
3107          }          }
3108        }        }
3109    
3110      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached. This "while" is the end of the "do" above. */
     loop. This "while" is the end of the "do" above. */  
3111    
3112      while ((c = *(++ptr)) != ']' || inescq);      while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3113    
3114        if (c == 0)                          /* Missing terminating ']' */
3115          {
3116          *errorcodeptr = ERR6;
3117          goto FAILED;
3118          }
3119    
3120      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
3121      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
# Line 2210  for (;; ptr++) Line 3179  for (;; ptr++)
3179    
3180      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
3181      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode. If there are no characters < 256,
3182      we can omit the bitmap. */      we can omit the bitmap in the actual compiled code. */
3183    
3184  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3185      if (class_utf8)      if (class_utf8)
# Line 2220  for (;; ptr++) Line 3189  for (;; ptr++)
3189        code += LINK_SIZE;        code += LINK_SIZE;
3190        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT : 0;
3191    
3192        /* If the map is required, install it, and move on to the end of        /* If the map is required, move up the extra data to make room for it;
3193        the extra data */        otherwise just move the code pointer to the end of the extra data. */
3194    
3195        if (class_charcount > 0)        if (class_charcount > 0)
3196          {          {
3197          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
3198            memmove(code + 32, code, class_utf8data - code);
3199          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
3200          code = class_utf8data;          code = class_utf8data + 32;
         }  
   
       /* If the map is not required, slide down the extra data. */  
   
       else  
         {  
         int len = class_utf8data - (code + 33);  
         memmove(code + 1, code + 33, len);  
         code += len + 1;  
3201          }          }
3202          else code = class_utf8data;
3203    
3204        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
3205    
# Line 2254  for (;; ptr++) Line 3216  for (;; ptr++)
3216      if (negate_class)      if (negate_class)
3217        {        {
3218        *code++ = OP_NCLASS;        *code++ = OP_NCLASS;
3219        for (c = 0; c < 32; c++) code[c] = ~classbits[c];        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3220            for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3221        }        }
3222      else      else
3223        {        {
# Line 2264  for (;; ptr++) Line 3227  for (;; ptr++)
3227      code += 32;      code += 32;
3228      break;      break;
3229    
3230    
3231        /* ===================================================================*/
3232      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3233      has been tested above. */      has been tested above. */
3234    
# Line 2331  for (;; ptr++) Line 3296  for (;; ptr++)
3296        }        }
3297      else repeat_type = greedy_default;      else repeat_type = greedy_default;
3298    
     /* If previous was a recursion, we need to wrap it inside brackets so that  
     it can be replicated if necessary. */  
   
     if (*previous == OP_RECURSE)  
       {  
       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);  
       code += 1 + LINK_SIZE;  
       *previous = OP_BRA;  
       PUT(previous, 1, code - previous);  
       *code = OP_KET;  
       PUT(code, 1, code - previous);  
       code += 1 + LINK_SIZE;  
       }  
   
3299      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
3300      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
3301      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
# Line 2378  for (;; ptr++) Line 3329  for (;; ptr++)
3329          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3330          }          }
3331    
3332          /* If the repetition is unlimited, it pays to see if the next thing on
3333          the line is something that cannot possibly match this character. If so,
3334          automatically possessifying this item gains some performance in the case
3335          where the match fails. */
3336    
3337          if (!possessive_quantifier &&
3338              repeat_max < 0 &&
3339              check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3340                options, cd))
3341            {
3342            repeat_type = 0;    /* Force greedy */
3343            possessive_quantifier = TRUE;
3344            }
3345    
3346        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3347        }        }
3348    
3349      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
3350      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
3351      character repeats by setting opt_type to add a suitable offset into      character repeats by setting opt_type to add a suitable offset into
3352      repeat_type. OP_NOT is currently used only for single-byte chars. */      repeat_type. We can also test for auto-possessification. OP_NOT is
3353        currently used only for single-byte chars. */
3354    
3355      else if (*previous == OP_NOT)      else if (*previous == OP_NOT)
3356        {        {
3357        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3358        c = previous[1];        c = previous[1];
3359          if (!possessive_quantifier &&
3360              repeat_max < 0 &&
3361              check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3362            {
3363            repeat_type = 0;    /* Force greedy */
3364            possessive_quantifier = TRUE;
3365            }
3366        goto OUTPUT_SINGLE_REPEAT;        goto OUTPUT_SINGLE_REPEAT;
3367        }        }
3368    
# Line 2403  for (;; ptr++) Line 3376  for (;; ptr++)
3376      else if (*previous < OP_EODN)      else if (*previous < OP_EODN)
3377        {        {
3378        uschar *oldcode;        uschar *oldcode;
3379        int prop_type;        int prop_type, prop_value;
3380        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3381        c = *previous;        c = *previous;
3382    
3383          if (!possessive_quantifier &&
3384              repeat_max < 0 &&
3385              check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3386            {
3387            repeat_type = 0;    /* Force greedy */
3388            possessive_quantifier = TRUE;
3389            }
3390    
3391        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
3392        prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?        if (*previous == OP_PROP || *previous == OP_NOTPROP)
3393          previous[1] : -1;          {
3394            prop_type = previous[1];
3395            prop_value = previous[2];
3396            }
3397          else prop_type = prop_value = -1;
3398    
3399        oldcode = code;        oldcode = code;
3400        code = previous;                  /* Usually overwrite previous item */        code = previous;                  /* Usually overwrite previous item */
# Line 2443  for (;; ptr++) Line 3428  for (;; ptr++)
3428          }          }
3429    
3430        /* A repeat minimum of 1 is optimized into some special cases. If the        /* A repeat minimum of 1 is optimized into some special cases. If the
3431        maximum is unlimited, we use OP_PLUS. Otherwise, the original item it        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3432        left in place and, if the maximum is greater than 1, we use OP_UPTO with        left in place and, if the maximum is greater than 1, we use OP_UPTO with
3433        one less than the maximum. */        one less than the maximum. */
3434    
# Line 2470  for (;; ptr++) Line 3455  for (;; ptr++)
3455    
3456          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3457          we have to insert the character for the previous code. For a repeated          we have to insert the character for the previous code. For a repeated
3458          Unicode property match, there is an extra byte that defines the          Unicode property match, there are two extra bytes that define the
3459          required property. In UTF-8 mode, long characters have their length in          required property. In UTF-8 mode, long characters have their length in
3460          c, with the 0x80 bit as a flag. */          c, with the 0x80 bit as a flag. */
3461    
# Line 2486  for (;; ptr++) Line 3471  for (;; ptr++)
3471  #endif  #endif
3472              {              {
3473              *code++ = c;              *code++ = c;
3474              if (prop_type >= 0) *code++ = prop_type;              if (prop_type >= 0)
3475                  {
3476                  *code++ = prop_type;
3477                  *code++ = prop_value;
3478                  }
3479              }              }
3480            *code++ = OP_STAR + repeat_type;            *code++ = OP_STAR + repeat_type;
3481            }            }
3482    
3483          /* Else insert an UPTO if the max is greater than the min, again          /* Else insert an UPTO if the max is greater than the min, again
3484          preceded by the character, for the previously inserted code. */          preceded by the character, for the previously inserted code. If the
3485            UPTO is just for 1 instance, we can use QUERY instead. */
3486    
3487          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
3488            {            {
# Line 2505  for (;; ptr++) Line 3495  for (;; ptr++)
3495            else            else
3496  #endif  #endif
3497            *code++ = c;            *code++ = c;
3498            if (prop_type >= 0) *code++ = prop_type;            if (prop_type >= 0)
3499                {
3500                *code++ = prop_type;
3501                *code++ = prop_value;
3502                }
3503            repeat_max -= repeat_min;            repeat_max -= repeat_min;
3504            *code++ = OP_UPTO + repeat_type;  
3505            PUT2INC(code, 0, repeat_max);            if (repeat_max == 1)
3506                {
3507                *code++ = OP_QUERY + repeat_type;
3508                }
3509              else
3510                {
3511                *code++ = OP_UPTO + repeat_type;
3512                PUT2INC(code, 0, repeat_max);
3513                }
3514            }            }
3515          }          }
3516    
# Line 2524  for (;; ptr++) Line 3526  for (;; ptr++)
3526  #endif  #endif
3527        *code++ = c;        *code++ = c;
3528    
3529        /* For a repeated Unicode property match, there is an extra byte that        /* For a repeated Unicode property match, there are two extra bytes that
3530        defines the required property. */        define the required property. */
3531    
3532  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3533        if (prop_type >= 0) *code++ = prop_type;        if (prop_type >= 0)
3534            {
3535            *code++ = prop_type;
3536            *code++ = prop_value;
3537            }
3538  #endif  #endif
3539        }        }
3540    
# Line 2571  for (;; ptr++) Line 3577  for (;; ptr++)
3577      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
3578      cases. */      cases. */
3579    
3580      else if (*previous >= OP_BRA || *previous == OP_ONCE ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3581               *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
3582        {        {
3583        register int i;        register int i;
3584        int ketoffset = 0;        int ketoffset = 0;
3585        int len = code - previous;        int len = code - previous;
3586        uschar *bralink = NULL;        uschar *bralink = NULL;
3587    
3588          /* Repeating a DEFINE group is pointless */
3589    
3590          if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3591            {
3592            *errorcodeptr = ERR55;
3593            goto FAILED;
3594            }
3595    
3596        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
3597        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
3598        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 2613  for (;; ptr++) Line 3627  for (;; ptr++)
3627          /* If the maximum is 1 or unlimited, we just have to stick in the          /* If the maximum is 1 or unlimited, we just have to stick in the
3628          BRAZERO and do no more at this point. However, we do need to adjust          BRAZERO and do no more at this point. However, we do need to adjust
3629          any OP_RECURSE calls inside the group that refer to the group itself or          any OP_RECURSE calls inside the group that refer to the group itself or
3630          any internal group, because the offset is from the start of the whole          any internal or forward referenced group, because the offset is from
3631          regex. Temporarily terminate the pattern while doing this. */          the start of the whole regex. Temporarily terminate the pattern while
3632            doing this. */
3633    
3634          if (repeat_max <= 1)          if (repeat_max <= 1)
3635            {            {
3636            *code = OP_END;            *code = OP_END;
3637            adjust_recurse(previous, 1, utf8, cd);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3638            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3639            code++;            code++;
3640            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2637  for (;; ptr++) Line 3652  for (;; ptr++)
3652            {            {
3653            int offset;            int offset;
3654            *code = OP_END;            *code = OP_END;
3655            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3656            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
3657            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
3658            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2657  for (;; ptr++) Line 3672  for (;; ptr++)
3672        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
3673        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
3674        copies that we need. If we set a first char from the group, and didn't        copies that we need. If we set a first char from the group, and didn't
3675        set a required char, copy the latter from the former. */        set a required char, copy the latter from the former. If there are any
3676          forward reference subroutine calls in the group, there will be entries on
3677          the workspace list; replicate these with an appropriate increment. */
3678    
3679        else        else
3680          {          {
3681          if (repeat_min > 1)          if (repeat_min > 1)
3682            {            {
3683            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;            /* In the pre-compile phase, we don't actually do the replication. We
3684            for (i = 1; i < repeat_min; i++)            just adjust the length as if we had. Do some paranoid checks for
3685              potential integer overflow. */
3686    
3687              if (lengthptr != NULL)
3688                {
3689                int delta = (repeat_min - 1)*length_prevgroup;
3690                if ((double)(repeat_min - 1)*(double)length_prevgroup >
3691                                                                (double)INT_MAX ||
3692                    OFLOW_MAX - *lengthptr < delta)
3693                  {
3694                  *errorcodeptr = ERR20;
3695                  goto FAILED;
3696                  }
3697                *lengthptr += delta;
3698                }
3699    
3700              /* This is compiling for real */
3701    
3702              else
3703              {              {
3704              memcpy(code, previous, len);              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3705              code += len;              for (i = 1; i < repeat_min; i++)
3706                  {
3707                  uschar *hc;
3708                  uschar *this_hwm = cd->hwm;
3709                  memcpy(code, previous, len);
3710                  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3711                    {
3712                    PUT(cd->hwm, 0, GET(hc, 0) + len);
3713                    cd->hwm += LINK_SIZE;
3714                    }
3715                  save_hwm = this_hwm;
3716                  code += len;
3717                  }
3718              }              }
3719            }            }
3720    
3721          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
3722          }          }
3723    
# Line 2677  for (;; ptr++) Line 3725  for (;; ptr++)
3725        the maximum is limited, it replicates the group in a nested fashion,        the maximum is limited, it replicates the group in a nested fashion,
3726        remembering the bracket starts on a stack. In the case of a zero minimum,        remembering the bracket starts on a stack. In the case of a zero minimum,
3727        the first one was set up above. In all cases the repeat_max now specifies        the first one was set up above. In all cases the repeat_max now specifies
3728        the number of additional copies needed. */        the number of additional copies needed. Again, we must remember to
3729          replicate entries on the forward reference list. */
3730    
3731        if (repeat_max >= 0)        if (repeat_max >= 0)
3732          {          {
3733          for (i = repeat_max - 1; i >= 0; i--)          /* In the pre-compile phase, we don't actually do the replication. We
3734            just adjust the length as if we had. For each repetition we must add 1
3735            to the length for BRAZERO and for all but the last repetition we must
3736            add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3737            paranoid checks to avoid integer overflow. */
3738    
3739            if (lengthptr != NULL && repeat_max > 0)
3740              {
3741              int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3742                          2 - 2*LINK_SIZE;   /* Last one doesn't nest */
3743              if ((double)repeat_max *
3744                    (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3745                      > (double)INT_MAX ||
3746                  OFLOW_MAX - *lengthptr < delta)
3747                {
3748                *errorcodeptr = ERR20;
3749                goto FAILED;
3750                }
3751              *lengthptr += delta;
3752              }
3753    
3754            /* This is compiling for real */
3755    
3756            else for (i = repeat_max - 1; i >= 0; i--)
3757            {            {
3758              uschar *hc;
3759              uschar *this_hwm = cd->hwm;
3760    
3761            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
3762    
3763            /* All but the final copy start a new nesting, maintaining the            /* All but the final copy start a new nesting, maintaining the
# Line 2698  for (;; ptr++) Line 3773  for (;; ptr++)
3773              }              }
3774    
3775            memcpy(code, previous, len);            memcpy(code, previous, len);
3776              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3777                {
3778                PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3779                cd->hwm += LINK_SIZE;
3780                }
3781              save_hwm = this_hwm;
3782            code += len;            code += len;
3783            }            }
3784    
# Line 2720  for (;; ptr++) Line 3801  for (;; ptr++)
3801        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
3802        can't just offset backwards from the current code point, because we        can't just offset backwards from the current code point, because we
3803        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
3804        correct offset was computed above. */        correct offset was computed above.
3805    
3806          Then, when we are doing the actual compile phase, check to see whether
3807          this group is a non-atomic one that could match an empty string. If so,
3808          convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3809          that runtime checking can be done. [This check is also applied to
3810          atomic groups at runtime, but in a different way.] */
3811    
3812        else code[-ketoffset] = OP_KETRMAX + repeat_type;        else
3813            {
3814            uschar *ketcode = code - ketoffset;
3815            uschar *bracode = ketcode - GET(ketcode, 1);
3816            *ketcode = OP_KETRMAX + repeat_type;
3817            if (lengthptr == NULL && *bracode != OP_ONCE)
3818              {
3819              uschar *scode = bracode;
3820              do
3821                {
3822                if (could_be_empty_branch(scode, ketcode, utf8))
3823                  {
3824                  *bracode += OP_SBRA - OP_BRA;
3825                  break;
3826                  }
3827                scode += GET(scode, 1);
3828                }
3829              while (*scode == OP_ALT);
3830              }
3831            }
3832        }        }
3833    
3834      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 2733  for (;; ptr++) Line 3839  for (;; ptr++)
3839        goto FAILED;        goto FAILED;
3840        }        }
3841    
3842      /* If the character following a repeat is '+', we wrap the entire repeated      /* If the character following a repeat is '+', or if certain optimization
3843      item inside OP_ONCE brackets. This is just syntactic sugar, taken from      tests above succeeded, possessive_quantifier is TRUE. For some of the
3844      Sun's Java package. The repeated item starts at tempcode, not at previous,      simpler opcodes, there is a special alternative opcode for this. For
3845      which might be the first part of a string whose (former) last char we      anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3846      repeated. However, we don't support '+' after a greediness '?'. */      The '+' notation is just syntactic sugar, taken from Sun's Java package,
3847        but the special opcodes can optimize it a bit. The repeated item starts at
3848        tempcode, not at previous, which might be the first part of a string whose
3849        (former) last char we repeated.
3850    
3851        Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3852        an 'upto' may follow. We skip over an 'exact' item, and then test the
3853        length of what remains before proceeding. */
3854    
3855      if (possessive_quantifier)      if (possessive_quantifier)
3856        {        {
3857        int len = code - tempcode;        int len;
3858        memmove(tempcode + 1+LINK_SIZE, tempcode, len);        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3859        code += 1 + LINK_SIZE;            *tempcode == OP_NOTEXACT)
3860        len += 1 + LINK_SIZE;          tempcode += _pcre_OP_lengths[*tempcode];
3861        tempcode[0] = OP_ONCE;        len = code - tempcode;
3862        *code++ = OP_KET;        if (len > 0) switch (*tempcode)
3863        PUTINC(code, 0, len);          {
3864        PUT(tempcode, 1, len);          case OP_STAR:  *tempcode = OP_POSSTAR; break;
3865            case OP_PLUS:  *tempcode = OP_POSPLUS; break;
3866            case OP_QUERY: *tempcode = OP_POSQUERY; break;
3867            case OP_UPTO:  *tempcode = OP_POSUPTO; break;
3868    
3869            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
3870            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
3871            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3872            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
3873    
3874            case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
3875            case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
3876            case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3877            case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
3878    
3879            default:
3880            memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3881            code += 1 + LINK_SIZE;
3882            len += 1 + LINK_SIZE;
3883            tempcode[0] = OP_ONCE;
3884            *code++ = OP_KET;
3885            PUTINC(code, 0, len);
3886            PUT(tempcode, 1, len);
3887            break;
3888            }
3889        }        }
3890    
3891      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 2761  for (;; ptr++) Line 3898  for (;; ptr++)
3898      break;      break;
3899    
3900    
3901      /* Start of nested bracket sub-expression, or comment or lookahead or      /* ===================================================================*/
3902      lookbehind or option setting or condition. First deal with special things      /* Start of nested parenthesized sub-expression, or comment or lookahead or
3903      that can come after a bracket; all are introduced by ?, and the appearance      lookbehind or option setting or condition or all the other extended
3904      of any of them means that this is not a referencing group. They were      parenthesis forms.  */
     checked for validity in the first pass over the string, so we don't have to  
     check for syntax errors here.  */  
3905    
3906      case '(':      case '(':
3907      newoptions = options;      newoptions = options;
3908      skipbytes = 0;      skipbytes = 0;
3909        bravalue = OP_CBRA;
3910        save_hwm = cd->hwm;
3911        reset_bracount = FALSE;
3912    
3913        /* First deal with various "verbs" that can be introduced by '*'. */
3914    
3915        if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
3916          {
3917          int i, namelen;
3918          const uschar *name = ++ptr;
3919          previous = NULL;
3920          while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
3921          if (*ptr == ':')
3922            {
3923            *errorcodeptr = ERR59;   /* Not supported */
3924            goto FAILED;
3925            }
3926          if (*ptr != ')')
3927            {
3928            *errorcodeptr = ERR60;
3929            goto FAILED;
3930            }
3931          namelen = ptr - name;
3932          for (i = 0; i < verbcount; i++)
3933            {
3934            if (namelen == verbs[i].len &&
3935                strncmp((char *)name, verbs[i].name, namelen) == 0)
3936              {
3937              *code = verbs[i].op;
3938              if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
3939              break;
3940              }
3941            }
3942          if (i < verbcount) continue;
3943          *errorcodeptr = ERR60;
3944          goto FAILED;
3945          }
3946    
3947        /* Deal with the extended parentheses; all are introduced by '?', and the
3948        appearance of any of them means that this is not a capturing group. */
3949    
3950      if (*(++ptr) == '?')      else if (*ptr == '?')
3951        {        {
3952        int set, unset;        int i, set, unset, namelen;
3953        int *optset;        int *optset;
3954          const uschar *name;
3955          uschar *slot;
3956    
3957        switch (*(++ptr))        switch (*(++ptr))
3958          {          {
3959          case '#':                 /* Comment; skip to ket */          case '#':                 /* Comment; skip to ket */
3960          ptr++;          ptr++;
3961          while (*ptr != ')') ptr++;          while (*ptr != 0 && *ptr != ')') ptr++;
3962            if (*ptr == 0)
3963              {
3964              *errorcodeptr = ERR18;
3965              goto FAILED;
3966              }
3967          continue;          continue;
3968    
3969          case ':':                 /* Non-extracting bracket */  
3970            /* ------------------------------------------------------------ */
3971            case '|':                 /* Reset capture count for each branch */
3972            reset_bracount = TRUE;
3973            /* Fall through */
3974    
3975            /* ------------------------------------------------------------ */
3976            case ':':                 /* Non-capturing bracket */
3977          bravalue = OP_BRA;          bravalue = OP_BRA;
3978          ptr++;          ptr++;
3979          break;          break;
3980    
3981    
3982            /* ------------------------------------------------------------ */
3983          case '(':          case '(':
3984          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
3985    
3986          /* Condition to test for recursion */          /* A condition can be an assertion, a number (referring to a numbered
3987            group), a name (referring to a named group), or 'R', referring to
3988            recursion. R<digits> and R&name are also permitted for recursion tests.
3989    
3990            There are several syntaxes for testing a named group: (?(name)) is used
3991            by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3992    
3993            There are two unfortunate ambiguities, caused by history. (a) 'R' can
3994            be the recursive thing or the name 'R' (and similarly for 'R' followed
3995            by digits), and (b) a number could be a name that consists of digits.
3996            In both cases, we look for a name first; if not found, we try the other
3997            cases. */
3998    
3999            /* For conditions that are assertions, check the syntax, and then exit
4000            the switch. This will take control down to where bracketed groups,
4001            including assertions, are processed. */
4002    
4003            if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4004              break;
4005    
4006            /* Most other conditions use OP_CREF (a couple change to OP_RREF
4007            below), and all need to skip 3 bytes at the start of the group. */
4008    
4009          if (ptr[1] == 'R')          code[1+LINK_SIZE] = OP_CREF;
4010            skipbytes = 3;
4011            refsign = -1;
4012    
4013            /* Check for a test for recursion in a named group. */
4014    
4015            if (ptr[1] == 'R' && ptr[2] == '&')
4016            {            {
4017            code[1+LINK_SIZE] = OP_CREF;            terminator = -1;
4018            PUT2(code, 2+LINK_SIZE, CREF_RECURSE);            ptr += 2;
4019            skipbytes = 3;            code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
           ptr += 3;  
4020            }            }
4021    
4022          /* Condition to test for a numbered subpattern match. We know that          /* Check for a test for a named group's having been set, using the Perl
4023          if a digit follows ( then there will just be digits until ) because          syntax (?(<name>) or (?('name') */
         the syntax was checked in the first pass. */  
4024    
4025          else if ((digitab[ptr[1]] && ctype_digit) != 0)          else if (ptr[1] == '<')
4026            {            {
4027            int condref;                 /* Don't amalgamate; some compilers */            terminator = '>';
           condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */  
           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';  
           if (condref == 0)  
             {  
             *errorcodeptr = ERR35;  
             goto FAILED;  
             }  
4028            ptr++;            ptr++;
           code[1+LINK_SIZE] = OP_CREF;  
           PUT2(code, 2+LINK_SIZE, condref);  
           skipbytes = 3;  
4029            }            }
4030          /* For conditions that are assertions, we just fall through, having          else if (ptr[1] == '\'')
4031          set bravalue above. */            {
4032          break;            terminator = '\'';
4033              ptr++;
4034          case '=':                 /* Positive lookahead */            }
4035          bravalue = OP_ASSERT;          else
4036          ptr++;            {
4037          break;            terminator = 0;
4038              if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4039              }
4040    
4041          case '!':                 /* Negative lookahead */          /* We now expect to read a name; anything else is an error */
         bravalue = OP_ASSERT_NOT;  
         ptr++;  
         break;  
4042    
4043          case '<':                 /* Lookbehinds */          if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
         switch (*(++ptr))  
4044            {            {
4045            case '=':               /* Positive lookbehind */            ptr += 1;  /* To get the right offset */
4046            bravalue = OP_ASSERTBACK;            *errorcodeptr = ERR28;
4047            ptr++;            goto FAILED;
4048            break;            }
4049    
4050            case '!':               /* Negative lookbehind */          /* Read the name, but also get it as a number if it's all digits */
4051            bravalue = OP_ASSERTBACK_NOT;  
4052            recno = 0;
4053            name = ++ptr;
4054            while ((cd->ctypes[*ptr] & ctype_word) != 0)
4055              {
4056              if (recno >= 0)
4057                recno = ((digitab[*ptr] & ctype_digit) != 0)?
4058                  recno * 10 + *ptr - '0' : -1;
4059            ptr++;            ptr++;
           break;  
4060            }            }
4061          break;          namelen = ptr - name;
4062    
4063          case '>':                 /* One-time brackets */          if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4064          bravalue = OP_ONCE;            {
4065          ptr++;            ptr--;      /* Error offset */
4066          break;            *errorcodeptr = ERR26;
4067              goto FAILED;
4068              }
4069    
4070          case 'C':                 /* Callout - may be followed by digits; */          /* Do no further checking in the pre-compile phase. */
4071          previous_callout = code;  /* Save for later completion */  
4072          after_manual_callout = 1; /* Skip one item before completing */          if (lengthptr != NULL) break;
4073          *code++ = OP_CALLOUT;     /* Already checked that the terminating */  
4074            {                       /* closing parenthesis is present. */          /* In the real compile we do the work of looking for the actual
4075            int n = 0;          reference. If the string started with "+" or "-" we require the rest to
4076            while ((digitab[*(++ptr)] & ctype_digit) != 0)          be digits, in which case recno will be set. */
4077              n = n * 10 + *ptr - '0';  
4078            if (n > 255)          if (refsign > 0)
4079              {            {
4080              if (recno <= 0)
4081                {
4082                *errorcodeptr = ERR58;
4083                goto FAILED;
4084                }
4085              if (refsign == '-')
4086                {
4087                recno = cd->bracount - recno + 1;
4088                if (recno <= 0)
4089                  {
4090                  *errorcodeptr = ERR15;
4091                  goto FAILED;
4092                  }
4093                }
4094              else recno += cd->bracount;
4095              PUT2(code, 2+LINK_SIZE, recno);
4096              break;
4097              }
4098    
4099            /* Otherwise (did not start with "+" or "-"), start by looking for the
4100            name. */
4101    
4102            slot = cd->name_table;
4103            for (i = 0; i < cd->names_found; i++)
4104              {
4105              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4106              slot += cd->name_entry_size;
4107              }
4108    
4109            /* Found a previous named subpattern */
4110    
4111            if (i < cd->names_found)
4112              {
4113              recno = GET2(slot, 0);
4114              PUT2(code, 2+LINK_SIZE, recno);
4115              }
4116    
4117            /* Search the pattern for a forward reference */
4118    
4119            else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4120                            (options & PCRE_EXTENDED) != 0)) > 0)
4121              {
4122              PUT2(code, 2+LINK_SIZE, i);
4123              }
4124    
4125            /* If terminator == 0 it means that the name followed directly after
4126            the opening parenthesis [e.g. (?(abc)...] and in this case there are
4127            some further alternatives to try. For the cases where terminator != 0
4128            [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4129            now checked all the possibilities, so give an error. */
4130    
4131            else if (terminator != 0)
4132              {
4133              *errorcodeptr = ERR15;
4134              goto FAILED;
4135              }
4136    
4137            /* Check for (?(R) for recursion. Allow digits after R to specify a
4138            specific group number. */
4139    
4140            else if (*name == 'R')
4141              {
4142              recno = 0;
4143              for (i = 1; i < namelen; i++)
4144                {
4145                if ((digitab[name[i]] & ctype_digit) == 0)
4146                  {
4147                  *errorcodeptr = ERR15;
4148                  goto FAILED;
4149                  }
4150                recno = recno * 10 + name[i] - '0';
4151                }
4152              if (recno == 0) recno = RREF_ANY;
4153              code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
4154              PUT2(code, 2+LINK_SIZE, recno);
4155              }
4156    
4157            /* Similarly, check for the (?(DEFINE) "condition", which is always
4158            false. */
4159    
4160            else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4161              {
4162              code[1+LINK_SIZE] = OP_DEF;
4163              skipbytes = 1;
4164              }
4165    
4166            /* Check for the "name" actually being a subpattern number. */
4167    
4168            else if (recno > 0)
4169              {
4170              PUT2(code, 2+LINK_SIZE, recno);
4171              }
4172    
4173            /* Either an unidentified subpattern, or a reference to (?(0) */
4174    
4175            else
4176              {
4177              *errorcodeptr = (recno == 0)? ERR35: ERR15;
4178              goto FAILED;
4179              }
4180            break;
4181    
4182    
4183            /* ------------------------------------------------------------ */
4184            case '=':                 /* Positive lookahead */
4185            bravalue = OP_ASSERT;
4186            ptr++;
4187            break;
4188    
4189    
4190            /* ------------------------------------------------------------ */
4191            case '!':                 /* Negative lookahead */
4192            ptr++;
4193            if (*ptr == ')')          /* Optimize (?!) */
4194              {
4195              *code++ = OP_FAIL;
4196              previous = NULL;
4197              continue;
4198              }
4199            bravalue = OP_ASSERT_NOT;
4200            break;
4201    
4202    
4203            /* ------------------------------------------------------------ */
4204            case '<':                 /* Lookbehind or named define */
4205            switch (ptr[1])
4206              {
4207              case '=':               /* Positive lookbehind */
4208              bravalue = OP_ASSERTBACK;
4209              ptr += 2;
4210              break;
4211    
4212              case '!':               /* Negative lookbehind */
4213              bravalue = OP_ASSERTBACK_NOT;
4214              ptr += 2;
4215              break;
4216    
4217              default:                /* Could be name define, else bad */
4218              if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4219              ptr++;                  /* Correct offset for error */
4220              *errorcodeptr = ERR24;
4221              goto FAILED;
4222              }
4223            break;
4224    
4225    
4226            /* ------------------------------------------------------------ */
4227            case '>':                 /* One-time brackets */
4228            bravalue = OP_ONCE;
4229            ptr++;
4230            break;
4231    
4232    
4233            /* ------------------------------------------------------------ */
4234            case 'C':                 /* Callout - may be followed by digits; */
4235            previous_callout = code;  /* Save for later completion */
4236            after_manual_callout = 1; /* Skip one item before completing */
4237            *code++ = OP_CALLOUT;
4238              {
4239              int n = 0;
4240              while ((digitab[*(++ptr)] & ctype_digit) != 0)
4241                n = n * 10 + *ptr - '0';
4242              if (*ptr != ')')
4243                {
4244                *errorcodeptr = ERR39;
4245                goto FAILED;
4246                }
4247              if (n > 255)
4248                {
4249              *errorcodeptr = ERR38;              *errorcodeptr = ERR38;
4250              goto FAILED;              goto FAILED;
4251              }              }
# Line 2876  for (;; ptr++) Line 4257  for (;; ptr++)
4257          previous = NULL;          previous = NULL;
4258          continue;          continue;
4259    
4260          case 'P':                 /* Named subpattern handling */  
4261          if (*(++ptr) == '<')      /* Definition */          /* ------------------------------------------------------------ */
4262            case 'P':                 /* Python-style named subpattern handling */
4263            if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
4264              {
4265              is_recurse = *ptr == '>';
4266              terminator = ')';
4267              goto NAMED_REF_OR_RECURSE;
4268              }
4269            else if (*ptr != '<')    /* Test for Python-style definition */
4270              {
4271              *errorcodeptr = ERR41;
4272              goto FAILED;
4273              }
4274            /* Fall through to handle (?P< as (?< is handled */
4275    
4276    
4277            /* ------------------------------------------------------------ */
4278            DEFINE_NAME:    /* Come here from (?< handling */
4279            case '\'':
4280            {            {
4281            int i, namelen;            terminator = (*ptr == '<')? '>' : '\'';
4282            uschar *slot = cd->name_table;            name = ++ptr;
4283            const uschar *name;     /* Don't amalgamate; some compilers */  
4284            name = ++ptr;           /* grumble at autoincrement in declaration */            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4285              namelen = ptr - name;
4286    
4287            while (*ptr++ != '>');            /* In the pre-compile phase, just do a syntax check. */
           namelen = ptr - name - 1;  
4288    
4289            for (i = 0; i < cd->names_found; i++)            if (lengthptr != NULL)
4290              {              {
4291              int crc = memcmp(name, slot+2, namelen);              if (*ptr != terminator)
4292              if (crc == 0)                {
4293                  *errorcodeptr = ERR42;
4294                  goto FAILED;
4295                  }
4296                if (cd->names_found >= MAX_NAME_COUNT)
4297                  {
4298                  *errorcodeptr = ERR49;
4299                  goto FAILED;
4300                  }
4301                if (namelen + 3 > cd->name_entry_size)
4302                {                {
4303                if (slot[2+namelen] == 0)                cd->name_entry_size = namelen + 3;
4304                  if (namelen > MAX_NAME_SIZE)
4305                  {                  {
4306                  *errorcodeptr = ERR43;                  *errorcodeptr = ERR48;
4307                  goto FAILED;                  goto FAILED;
4308                  }                  }
               crc = -1;             /* Current name is substring */  
4309                }                }
4310              if (crc < 0)              }
4311    
4312              /* In the real compile, create the entry in the table */
4313    
4314              else
4315                {
4316                slot = cd->name_table;
4317                for (i = 0; i < cd->names_found; i++)
4318                {                {
4319                memmove(slot + cd->name_entry_size, slot,                int crc = memcmp(name, slot+2, namelen);
4320                  (cd->names_found - i) * cd->name_entry_size);                if (crc == 0)
4321                break;                  {
4322                    if (slot[2+namelen] == 0)
4323                      {
4324                      if ((options & PCRE_DUPNAMES) == 0)
4325                        {
4326                        *errorcodeptr = ERR43;
4327                        goto FAILED;
4328                        }
4329                      }
4330                    else crc = -1;      /* Current name is substring */
4331                    }
4332                  if (crc < 0)
4333                    {
4334                    memmove(slot + cd->name_entry_size, slot,
4335                      (cd->names_found - i) * cd->name_entry_size);
4336                    break;
4337                    }
4338                  slot += cd->name_entry_size;
4339                }                }
             slot += cd->name_entry_size;  
             }  
4340    
4341            PUT2(slot, 0, *brackets + 1);              PUT2(slot, 0, cd->bracount + 1);
4342            memcpy(slot + 2, name, namelen);              memcpy(slot + 2, name, namelen);
4343            slot[2+namelen] = 0;              slot[2+namelen] = 0;
4344            cd->names_found++;              }
           goto NUMBERED_GROUP;  
4345            }            }
4346    
4347          if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */          /* In both cases, count the number of names we've encountered. */
4348    
4349            ptr++;                    /* Move past > or ' */
4350            cd->names_found++;
4351            goto NUMBERED_GROUP;
4352    
4353    
4354            /* ------------------------------------------------------------ */
4355            case '&':                 /* Perl recursion/subroutine syntax */
4356            terminator = ')';
4357            is_recurse = TRUE;
4358            /* Fall through */
4359    
4360            /* We come here from the Python syntax above that handles both
4361            references (?P=name) and recursion (?P>name), as well as falling
4362            through from the Perl recursion syntax (?&name). */
4363    
4364            NAMED_REF_OR_RECURSE:
4365            name = ++ptr;
4366            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4367            namelen = ptr - name;
4368    
4369            /* In the pre-compile phase, do a syntax check and set a dummy
4370            reference number. */
4371    
4372            if (lengthptr != NULL)
4373            {            {
4374            int i, namelen;            if (*ptr != terminator)
4375            int type = *ptr++;              {
4376            const uschar *name = ptr;              *errorcodeptr = ERR42;
4377            uschar *slot = cd->name_table;              goto FAILED;
4378                }
4379              if (namelen > MAX_NAME_SIZE)
4380                {
4381                *errorcodeptr = ERR48;
4382                goto FAILED;
4383                }
4384              recno = 0;
4385              }
4386    
4387            while (*ptr != ')') ptr++;          /* In the real compile, seek the name in the table */
           namelen = ptr - name;  
4388    
4389            else
4390              {
4391              slot = cd->name_table;
4392            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
4393              {              {
4394              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4395              slot += cd->name_entry_size;              slot += cd->name_entry_size;
4396              }              }
4397            if (i >= cd->names_found)  
4398              if (i < cd->names_found)         /* Back reference */
4399                {
4400                recno = GET2(slot, 0);
4401                }
4402              else if ((recno =                /* Forward back reference */
4403                        find_parens(ptr, cd->bracount, name, namelen,
4404                          (options & PCRE_EXTENDED) != 0)) <= 0)
4405              {              {
4406              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
4407              goto FAILED;              goto FAILED;
4408              }              }
4409              }
4410    
4411            recno = GET2(slot, 0);          /* In both phases, we can now go to the code that handles numerical
4412            recursion or backreferences. */
           if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */  
   
           /* Back reference */  
4413    
4414            previous = code;          if (is_recurse) goto HANDLE_RECURSION;
4415            *code++ = OP_REF;            else goto HANDLE_REFERENCE;
           PUT2INC(code, 0, recno);  
           cd->backref_map |= (recno < 32)? (1 << recno) : 1;  
           if (recno > cd->top_backref) cd->top_backref = recno;  
           continue;  
           }  
4416    
         /* Should never happen */  
         break;  
4417    
4418          case 'R':                 /* Pattern recursion */          /* ------------------------------------------------------------ */
4419            case 'R':                 /* Recursion */
4420          ptr++;                    /* Same as (?0)      */          ptr++;                    /* Same as (?0)      */
4421          /* Fall through */          /* Fall through */
4422    
         /* Recursion or "subroutine" call */  
4423    
4424          case '0': case '1': case '2': case '3': case '4':          /* ------------------------------------------------------------ */
4425          case '5': case '6': case '7': case '8': case '9':          case '-': case '+':
4426            case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4427            case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4428            {            {
4429            const uschar *called;            const uschar *called;
4430    
4431              if ((refsign = *ptr) == '+') ptr++;
4432              else if (refsign == '-')
4433                {
4434                if ((digitab[ptr[1]] & ctype_digit) == 0)
4435                  goto OTHER_CHAR_AFTER_QUERY;
4436                ptr++;
4437                }
4438    
4439            recno = 0;            recno = 0;
4440            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4441              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4442    
4443              if (*ptr != ')')
4444                {
4445                *errorcodeptr = ERR29;
4446                goto FAILED;
4447                }
4448    
4449              if (refsign == '-')
4450                {
4451                if (recno == 0)
4452                  {
4453                  *errorcodeptr = ERR58;
4454                  goto FAILED;
4455                  }
4456                recno = cd->bracount - recno + 1;
4457                if (recno <= 0)
4458                  {
4459                  *errorcodeptr = ERR15;
4460                  goto FAILED;
4461                  }
4462                }
4463              else if (refsign == '+')
4464                {
4465                if (recno == 0)
4466                  {
4467                  *errorcodeptr = ERR58;
4468                  goto FAILED;
4469                  }
4470                recno += cd->bracount;
4471                }
4472    
4473            /* Come here from code above that handles a named recursion */            /* Come here from code above that handles a named recursion */
4474    
4475            HANDLE_RECURSION:            HANDLE_RECURSION:
4476    
4477            previous = code;            previous = code;
4478              called = cd->start_code;
4479    
4480            /* Find the bracket that is being referenced. Temporarily end the            /* When we are actually compiling, find the bracket that is being
4481            regex in case it doesn't exist. */            referenced. Temporarily end the regex in case it doesn't exist before
4482              this point. If we end up with a forward reference, first check that
4483              the bracket does occur later so we can give the error (and position)
4484              now. Then remember this forward reference in the workspace so it can
4485              be filled in at the end. */
4486    
4487            *code = OP_END;            if (lengthptr == NULL)
           called = (recno == 0)?  
             cd->start_code : find_bracket(cd->start_code, utf8, recno);  
   
           if (called == NULL)  
4488              {              {
4489              *errorcodeptr = ERR15;              *code = OP_END;
4490              goto FAILED;              if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
             }  
4491    
4492            /* If the subpattern is still open, this is a recursive call. We              /* Forward reference */
           check to see if this is a left recursion that could loop for ever,  
           and diagnose that case. */  
4493    
4494            if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))              if (called == NULL)
4495              {                {
4496              *errorcodeptr = ERR40;                if (find_parens(ptr, cd->bracount, NULL, recno,
4497              goto FAILED;                     (options & PCRE_EXTENDED) != 0) < 0)
4498                    {
4499                    *errorcodeptr = ERR15;
4500                    goto FAILED;
4501                    }
4502                  called = cd->start_code + recno;
4503                  PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4504                  }
4505    
4506                /* If not a forward reference, and the subpattern is still open,
4507                this is a recursive call. We check to see if this is a left
4508                recursion that could loop for ever, and diagnose that case. */
4509    
4510                else if (GET(called, 1) == 0 &&
4511                         could_be_empty(called, code, bcptr, utf8))
4512                  {
4513                  *errorcodeptr = ERR40;
4514                  goto FAILED;
4515                  }
4516              }              }
4517    
4518            /* Insert the recursion/subroutine item */            /* Insert the recursion/subroutine item, automatically wrapped inside
4519              "once" brackets. Set up a "previous group" length so that a
4520              subsequent quantifier will work. */
4521    
4522              *code = OP_ONCE;
4523              PUT(code, 1, 2 + 2*LINK_SIZE);
4524              code += 1 + LINK_SIZE;
4525    
4526            *code = OP_RECURSE;            *code = OP_RECURSE;
4527            PUT(code, 1, called - cd->start_code);            PUT(code, 1, called - cd->start_code);
4528            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
4529    
4530              *code = OP_KET;
4531              PUT(code, 1, 2 + 2*LINK_SIZE);
4532              code += 1 + LINK_SIZE;
4533    
4534              length_prevgroup = 3 + 3*LINK_SIZE;
4535            }            }
4536    
4537            /* Can't determine a first byte now */
4538    
4539            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4540          continue;          continue;
4541    
         /* Character after (? not specially recognized */  
4542    
4543          default:                  /* Option setting */          /* ------------------------------------------------------------ */
4544            default:              /* Other characters: check option setting */
4545            OTHER_CHAR_AFTER_QUERY:
4546          set = unset = 0;          set = unset = 0;
4547          optset = &set;          optset = &set;
4548    
# Line 3016  for (;; ptr++) Line 4552  for (;; ptr++)
4552              {              {
4553              case '-': optset = &unset; break;              case '-': optset = &unset; break;
4554    
4555                case 'J':    /* Record that it changed in the external options */
4556                *optset |= PCRE_DUPNAMES;
4557                cd->external_options |= PCRE_JCHANGED;
4558                break;
4559    
4560              case 'i': *optset |= PCRE_CASELESS; break;              case 'i': *optset |= PCRE_CASELESS; break;
4561              case 'm': *optset |= PCRE_MULTILINE; break;              case 'm': *optset |= PCRE_MULTILINE; break;
4562              case 's': *optset |= PCRE_DOTALL; break;              case 's': *optset |= PCRE_DOTALL; break;
4563              case 'x': *optset |= PCRE_EXTENDED; break;              case 'x': *optset |= PCRE_EXTENDED; break;
4564              case 'U': *optset |= PCRE_UNGREEDY; break;              case 'U': *optset |= PCRE_UNGREEDY; break;
4565              case 'X': *optset |= PCRE_EXTRA; break;              case 'X': *optset |= PCRE_EXTRA; break;
4566    
4567                default:  *errorcodeptr = ERR12;
4568                          ptr--;    /* Correct the offset */
4569                          goto FAILED;
4570              }              }
4571            }            }
4572    
# Line 3030  for (;; ptr++) Line 4575  for (;; ptr++)
4575          newoptions = (options | set) & (~unset);          newoptions = (options | set) & (~unset);
4576    
4577          /* If the options ended with ')' this is not the start of a nested          /* If the options ended with ')' this is not the start of a nested
4578          group with option changes, so the options change at this level. Compile          group with option changes, so the options change at this level. If this
4579          code to change the ims options if this setting actually changes any of          item is right at the start of the pattern, the options can be
4580          them. We also pass the new setting back so that it can be put at the          abstracted and made external in the pre-compile phase, and ignored in
4581          start of any following branches, and when this group ends (if we are in          the compile phase. This can be helpful when matching -- for instance in
4582          a group), a resetting item can be compiled.          caseless checking of required bytes.
4583    
4584          Note that if this item is right at the start of the pattern, the          If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4585          options will have been abstracted and made global, so there will be no          definitely *not* at the start of the pattern because something has been
4586          change to compile. */          compiled. In the pre-compile phase, however, the code pointer can have
4587            that value after the start, because it gets reset as code is discarded
4588            during the pre-compile. However, this can happen only at top level - if
4589            we are within parentheses, the starting BRA will still be present. At
4590            any parenthesis level, the length value can be used to test if anything
4591            has been compiled at that level. Thus, a test for both these conditions
4592            is necessary to ensure we correctly detect the start of the pattern in
4593            both phases.
4594    
4595            If we are not at the pattern start, compile code to change the ims
4596            options if this setting actually changes any of them. We also pass the
4597            new setting back so that it can be put at the start of any following
4598            branches, and when this group ends (if we are in a group), a resetting
4599            item can be compiled. */
4600    
4601          if (*ptr == ')')          if (*ptr == ')')
4602            {            {
4603            if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))            if (code == cd->start_code + 1 + LINK_SIZE &&
4604                   (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4605              {              {
4606              *code++ = OP_OPT;              cd->external_options = newoptions;
4607              *code++ = newoptions & PCRE_IMS;              options = newoptions;
4608              }              }
4609             else
4610                {
4611                if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4612                  {
4613                  *code++ = OP_OPT;
4614                  *code++ = newoptions & PCRE_IMS;
4615                  }
4616    
4617            /* Change options at this level, and pass them back for use              /* Change options at this level, and pass them back for use
4618            in subsequent branches. Reset the greedy defaults and the case              in subsequent branches. Reset the greedy defaults and the case
4619            value for firstbyte and reqbyte. */              value for firstbyte and reqbyte. */
4620    
4621            *optionsptr = options = newoptions;              *optionsptr = options = newoptions;
4622            greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4623            greedy_non_default = greedy_default ^ 1;              greedy_non_default = greedy_default ^ 1;
4624            req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;              req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4625                }
4626    
4627            previous = NULL;       /* This item can't be repeated */            previous = NULL;       /* This item can't be repeated */
4628            continue;              /* It is complete */            continue;              /* It is complete */
# Line 3068  for (;; ptr++) Line 4635  for (;; ptr++)
4635    
4636          bravalue = OP_BRA;          bravalue = OP_BRA;
4637          ptr++;          ptr++;
4638          }          }     /* End of switch for character following (? */
4639        }        }       /* End of (? handling */
4640