/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 79 by nigel, Sat Feb 24 21:40:52 2007 UTC | revision 213 by ph10, Wed Aug 15 11:34:14 2007 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2005 University of Cambridge             Copyright (c) 1997-2007 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45    #ifdef HAVE_CONFIG_H
46    #include <config.h>
47    #endif
48    
49    #define NLBLOCK cd             /* Block containing newline information */
50    #define PSSTART start_pattern  /* Field containing processed string start */
51    #define PSEND   end_pattern    /* Field containing processed string end */
52    
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
56    /* When DEBUG is defined, we need the pcre_printint() function, which is also
57    used by pcretest. DEBUG is not defined when building a production library. */
58    
59    #ifdef DEBUG
60    #include "pcre_printint.src"
61    #endif
62    
63    
64    /* Macro for setting individual bits in class bitmaps. */
65    
66    #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
67    
68    /* Maximum length value to check against when making sure that the integer that
69    holds the compiled pattern length does not overflow. We make it a bit less than
70    INT_MAX to allow for adding in group terminating bytes, so that we don't have
71    to check them every time. */
72    
73    #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76  /*************************************************  /*************************************************
77  *      Code parameters and static tables         *  *      Code parameters and static tables         *
78  *************************************************/  *************************************************/
79    
80  /* Maximum number of items on the nested bracket stacks at compile time. This  /* This value specifies the size of stack workspace that is used during the
81  applies to the nesting of all kinds of parentheses. It does not limit  first pre-compile phase that determines how much memory is required. The regex
82  un-nested, non-capturing parentheses. This number can be made bigger if  is partly compiled into this space, but the compiled parts are discarded as
83  necessary - it is used to dimension one int and one unsigned char vector at  soon as they can be, so that hopefully there will never be an overrun. The code
84  compile time. */  does, however, check for an overrun. The largest amount I've seen used is 218,
85    so this number is very generous.
86    
87    The same workspace is used during the second, actual compile phase for
88    remembering forward references to groups so that they can be filled in at the
89    end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90    is 4 there is plenty of room. */
91    
92  #define BRASTACK_SIZE 200  #define COMPILE_WORK_SIZE (4096)
93    
94    
95  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 63  are simple data values; negative values Line 97  are simple data values; negative values
97  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
98  is invalid. */  is invalid. */
99    
100  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
101  static const short int escapes[] = {  static const short int escapes[] = {
102       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
103       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
105       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */  -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
106  -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
107  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
108     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
109       0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */  -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
110  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
111       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
112  };  };
113    
114  #else         /* This is the "abnormal" table for EBCDIC systems */  #else           /* This is the "abnormal" table for EBCDIC systems */
115  static const short int escapes[] = {  static const short int escapes[] = {
116  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
117  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 87  static const short int escapes[] = { Line 121  static const short int escapes[] = {
121  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
122  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
123  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
124  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
125  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
126  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
127  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
128  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
129  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
130  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
131  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
132  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
133  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
134  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
135  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
136  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
137  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
138  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
# Line 106  static const short int escapes[] = { Line 140  static const short int escapes[] = {
140  #endif  #endif
141    
142    
143    /* Table of special "verbs" like (*PRUNE) */
144    
145    typedef struct verbitem {
146      const char *name;
147      int   len;
148      int   op;
149    } verbitem;
150    
151    static verbitem verbs[] = {
152      { "ACCEPT", 6, OP_ACCEPT },
153      { "COMMIT", 6, OP_COMMIT },
154      { "F",      1, OP_FAIL },
155      { "FAIL",   4, OP_FAIL },
156      { "PRUNE",  5, OP_PRUNE },
157      { "SKIP",   4, OP_SKIP  },
158      { "THEN",   4, OP_THEN  }
159    };
160    
161    static int verbcount = sizeof(verbs)/sizeof(verbitem);
162    
163    
164  /* Tables of names of POSIX character classes and their lengths. The list is  /* Tables of names of POSIX character classes and their lengths. The list is
165  terminated by a zero length entry. The first three must be alpha, upper, lower,  terminated by a zero length entry. The first three must be alpha, lower, upper,
166  as this is assumed for handling case independence. */  as this is assumed for handling case independence. */
167    
168  static const char *const posix_names[] = {  static const char *const posix_names[] = {
# Line 118  static const char *const posix_names[] = Line 173  static const char *const posix_names[] =
173  static const uschar posix_name_lengths[] = {  static const uschar posix_name_lengths[] = {
174    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
175    
176  /* Table of class bit maps for each POSIX class; up to three may be combined  /* Table of class bit maps for each POSIX class. Each class is formed from a
177  to form the class. The table for [:blank:] is dynamically modified to remove  base map, with an optional addition or removal of another map. Then, for some
178  the vertical space characters. */  classes, there is some additional tweaking: for [:blank:] the vertical space
179    characters are removed, and for [:alpha:] and [:alnum:] the underscore
180    character is removed. The triples in the table consist of the base map offset,
181    second map offset or -1 if no second map, and a non-negative value for map
182    addition or a negative value for map subtraction (if there are two maps). The
183    absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
184    remove vertical space characters, 2 => remove underscore. */
185    
186  static const int posix_class_maps[] = {  static const int posix_class_maps[] = {
187    cbit_lower, cbit_upper, -1,             /* alpha */    cbit_word,  cbit_digit, -2,             /* alpha */
188    cbit_lower, -1,         -1,             /* lower */    cbit_lower, -1,          0,             /* lower */
189    cbit_upper, -1,         -1,             /* upper */    cbit_upper, -1,          0,             /* upper */
190    cbit_digit, cbit_lower, cbit_upper,     /* alnum */    cbit_word,  -1,          2,             /* alnum - word without underscore */
191    cbit_print, cbit_cntrl, -1,             /* ascii */    cbit_print, cbit_cntrl,  0,             /* ascii */
192    cbit_space, -1,         -1,             /* blank - a GNU extension */    cbit_space, -1,          1,             /* blank - a GNU extension */
193    cbit_cntrl, -1,         -1,             /* cntrl */    cbit_cntrl, -1,          0,             /* cntrl */
194    cbit_digit, -1,         -1,             /* digit */    cbit_digit, -1,          0,             /* digit */
195    cbit_graph, -1,         -1,             /* graph */    cbit_graph, -1,          0,             /* graph */
196    cbit_print, -1,         -1,             /* print */    cbit_print, -1,          0,             /* print */
197    cbit_punct, -1,         -1,             /* punct */    cbit_punct, -1,          0,             /* punct */
198    cbit_space, -1,         -1,             /* space */    cbit_space, -1,          0,             /* space */
199    cbit_word,  -1,         -1,             /* word - a Perl extension */    cbit_word,  -1,          0,             /* word - a Perl extension */
200    cbit_xdigit,-1,         -1              /* xdigit */    cbit_xdigit,-1,          0              /* xdigit */
201  };  };
202    
203    
204    #define STRING(a)  # a
205    #define XSTRING(s) STRING(s)
206    
207  /* The texts of compile-time error messages. These are "char *" because they  /* The texts of compile-time error messages. These are "char *" because they
208  are passed to the outside world. */  are passed to the outside world. Do not ever re-use any error number, because
209    they are documented. Always add a new error instead. Messages marked DEAD below
210    are no longer used. */
211    
212  static const char *error_texts[] = {  static const char *error_texts[] = {
213    "no error",    "no error",
# Line 156  static const char *error_texts[] = { Line 222  static const char *error_texts[] = {
222    "range out of order in character class",    "range out of order in character class",
223    "nothing to repeat",    "nothing to repeat",
224    /* 10 */    /* 10 */
225    "operand of unlimited repeat could match the empty string",    "operand of unlimited repeat could match the empty string",  /** DEAD **/
226    "internal error: unexpected repeat",    "internal error: unexpected repeat",
227    "unrecognized character after (?",    "unrecognized character after (?",
228    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class",
# Line 166  static const char *error_texts[] = { Line 232  static const char *error_texts[] = {
232    "erroffset passed as NULL",    "erroffset passed as NULL",
233    "unknown option bit(s) set",    "unknown option bit(s) set",
234    "missing ) after comment",    "missing ) after comment",
235    "parentheses nested too deeply",    "parentheses nested too deeply",  /** DEAD **/
236    /* 20 */    /* 20 */
237    "regular expression too large",    "regular expression is too large",
238    "failed to get memory",    "failed to get memory",
239    "unmatched parentheses",    "unmatched parentheses",
240    "internal error: code overflow",    "internal error: code overflow",
241    "unrecognized character after (?<",    "unrecognized character after (?<",
242    /* 25 */    /* 25 */
243    "lookbehind assertion is not fixed length",    "lookbehind assertion is not fixed length",
244    "malformed number after (?(",    "malformed number or name after (?(",
245    "conditional group contains more than two branches",    "conditional group contains more than two branches",
246    "assertion expected after (?(",    "assertion expected after (?(",
247    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )",
248    /* 30 */    /* 30 */
249    "unknown POSIX class name",    "unknown POSIX class name",
250    "POSIX collating elements are not supported",    "POSIX collating elements are not supported",
251    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support",
252    "spare error",    "spare error",  /** DEAD **/
253    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large",
254    /* 35 */    /* 35 */
255    "invalid condition (?(0)",    "invalid condition (?(0)",
# Line 194  static const char *error_texts[] = { Line 260  static const char *error_texts[] = {
260    /* 40 */    /* 40 */
261    "recursive call could loop indefinitely",    "recursive call could loop indefinitely",
262    "unrecognized character after (?P",    "unrecognized character after (?P",
263    "syntax error after (?P",    "syntax error in subpattern name (missing terminator)",
264    "two named groups have the same name",    "two named subpatterns have the same name",
265    "invalid UTF-8 string",    "invalid UTF-8 string",
266    /* 45 */    /* 45 */
267    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled",
268    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence",
269    "unknown property name after \\P or \\p"    "unknown property name after \\P or \\p",
270      "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
271      "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
272      /* 50 */
273      "repeated subpattern is too long",    /** DEAD **/
274      "octal value is greater than \\377 (not in UTF-8 mode)",
275      "internal error: overran compiling workspace",
276      "internal error: previously-checked referenced subpattern not found",
277      "DEFINE group contains more than one branch",
278      /* 55 */
279      "repeating a DEFINE group is not allowed",
280      "inconsistent NEWLINE options",
281      "\\g is not followed by a braced name or an optionally braced non-zero number",
282      "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number",
283      "(*VERB) with an argument is not supported",
284      /* 60 */
285      "(*VERB) not recognized",
286      "number is too big"
287  };  };
288    
289    
# Line 220  For convenience, we use the same bit def Line 303  For convenience, we use the same bit def
303    
304  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
305    
306  #if !EBCDIC    /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
307  static const unsigned char digitab[] =  static const unsigned char digitab[] =
308    {    {
309    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 256  static const unsigned char digitab[] = Line 339  static const unsigned char digitab[] =
339    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
340    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
341    
342  #else          /* This is the "abnormal" case, for EBCDIC systems */  #else           /* This is the "abnormal" case, for EBCDIC systems */
343  static const unsigned char digitab[] =  static const unsigned char digitab[] =
344    {    {
345    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 270  static const unsigned char digitab[] = Line 353  static const unsigned char digitab[] =
353    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
354    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
355    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
356    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88-     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
357    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
358    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
359    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
# Line 304  static const unsigned char ebcdic_charta Line 387  static const unsigned char ebcdic_charta
387    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
388    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
389    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
390    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88-  */    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
391    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
392    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
393    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
# Line 331  static const unsigned char ebcdic_charta Line 414  static const unsigned char ebcdic_charta
414  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
415    
416  static BOOL  static BOOL
417    compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
418      int *, int *, branch_chain *, compile_data *);      int *, int *, branch_chain *, compile_data *, int *);
419    
420    
421    
# Line 342  static BOOL Line 425  static BOOL
425    
426  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
427  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
428  encodes one of the more complicated things such as \d. When UTF-8 is enabled,  encodes one of the more complicated things such as \d. A backreference to group
429  a positive value greater than 255 may be returned. On entry, ptr is pointing at  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
430  the \. On exit, it is on the final character of the escape sequence.  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
431    ptr is pointing at the \. On exit, it is on the final character of the escape
432    sequence.
433    
434  Arguments:  Arguments:
435    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
# Line 355  Arguments: Line 440  Arguments:
440    
441  Returns:         zero or positive => a data character  Returns:         zero or positive => a data character
442                   negative => a special escape sequence                   negative => a special escape sequence
443                   on error, errorptr is set                   on error, errorcodeptr is set
444  */  */
445    
446  static int  static int
447  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
448    int options, BOOL isclass)    int options, BOOL isclass)
449  {  {
450  const uschar *ptr = *ptrptr;  BOOL utf8 = (options & PCRE_UTF8) != 0;
451    const uschar *ptr = *ptrptr + 1;
452  int c, i;  int c, i;
453    
454    GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
455    ptr--;                            /* Set pointer back to the last byte */
456    
457  /* If backslash is at the end of the pattern, it's an error. */  /* If backslash is at the end of the pattern, it's an error. */
458    
 c = *(++ptr);  
459  if (c == 0) *errorcodeptr = ERR1;  if (c == 0) *errorcodeptr = ERR1;
460    
461  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
462  a table. A non-zero result is something that can be returned immediately.  a table. A non-zero result is something that can be returned immediately.
463  Otherwise further processing may be required. */  Otherwise further processing may be required. */
464    
465  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
466  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < '0' || c > 'z') {}                           /* Not alphameric */
467  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - '0']) != 0) c = i;
468    
469  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
470  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
471  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
472  #endif  #endif
# Line 388  else if ((i = escapes[c - 0x48]) != 0) Line 476  else if ((i = escapes[c - 0x48]) != 0)
476  else  else
477    {    {
478    const uschar *oldptr;    const uschar *oldptr;
479      BOOL braced, negated;
480    
481    switch (c)    switch (c)
482      {      {
483      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
# Line 401  else Line 491  else
491      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
492      break;      break;
493    
494        /* \g must be followed by a number, either plain or braced. If positive, it
495        is an absolute backreference. If negative, it is a relative backreference.
496        This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
497        reference to a named group. This is part of Perl's movement towards a
498        unified syntax for back references. As this is synonymous with \k{name}, we
499        fudge it up by pretending it really was \k. */
500    
501        case 'g':
502        if (ptr[1] == '{')
503          {
504          const uschar *p;
505          for (p = ptr+2; *p != 0 && *p != '}'; p++)
506            if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
507          if (*p != 0 && *p != '}')
508            {
509            c = -ESC_k;
510            break;
511            }
512          braced = TRUE;
513          ptr++;
514          }
515        else braced = FALSE;
516    
517        if (ptr[1] == '-')
518          {
519          negated = TRUE;
520          ptr++;
521          }
522        else negated = FALSE;
523    
524        c = 0;
525        while ((digitab[ptr[1]] & ctype_digit) != 0)
526          c = c * 10 + *(++ptr) - '0';
527    
528        if (c < 0)
529          {
530          *errorcodeptr = ERR61;
531          break;
532          }
533    
534        if (c == 0 || (braced && *(++ptr) != '}'))
535          {
536          *errorcodeptr = ERR57;
537          break;
538          }
539    
540        if (negated)
541          {
542          if (c > bracount)
543            {
544            *errorcodeptr = ERR15;
545            break;
546            }
547          c = bracount - (c - 1);
548          }
549    
550        c = -(ESC_REF + c);
551        break;
552    
553      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
554      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
555      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 422  else Line 571  else
571        c -= '0';        c -= '0';
572        while ((digitab[ptr[1]] & ctype_digit) != 0)        while ((digitab[ptr[1]] & ctype_digit) != 0)
573          c = c * 10 + *(++ptr) - '0';          c = c * 10 + *(++ptr) - '0';
574          if (c < 0)
575            {
576            *errorcodeptr = ERR61;
577            break;
578            }
579        if (c < 10 || c <= bracount)        if (c < 10 || c <= bracount)
580          {          {
581          c = -(ESC_REF + c);          c = -(ESC_REF + c);
# Line 442  else Line 596  else
596        }        }
597    
598      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
599      larger first octal digit. */      larger first octal digit. The original code used just to take the least
600        significant 8 bits of octal numbers (I think this is what early Perls used
601        to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
602        than 3 octal digits. */
603    
604      case '0':      case '0':
605      c -= '0';      c -= '0';
606      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
607          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
608      c &= 255;     /* Take least significant 8 bits */      if (!utf8 && c > 255) *errorcodeptr = ERR51;
609      break;      break;
610    
611      /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number      /* \x is complicated. \x{ddd} is a character number which can be greater
612      which can be greater than 0xff, but only if the ddd are hex digits. */      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
613        treated as a data character. */
614    
615      case 'x':      case 'x':
616  #ifdef SUPPORT_UTF8      if (ptr[1] == '{')
     if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)  
617        {        {
618        const uschar *pt = ptr + 2;        const uschar *pt = ptr + 2;
619        register int count = 0;        int count = 0;
620    
621        c = 0;        c = 0;
622        while ((digitab[*pt] & ctype_xdigit) != 0)        while ((digitab[*pt] & ctype_xdigit) != 0)
623          {          {
624          int cc = *pt++;          register int cc = *pt++;
625            if (c == 0 && cc == '0') continue;     /* Leading zeroes */
626          count++;          count++;
627  #if !EBCDIC    /* ASCII coding */  
628    #ifndef EBCDIC  /* ASCII coding */
629          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= 'a') cc -= 32;               /* Convert to upper case */
630          c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
631  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
632          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
633          c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
634  #endif  #endif
635          }          }
636    
637        if (*pt == '}')        if (*pt == '}')
638          {          {
639          if (c < 0 || count > 8) *errorcodeptr = ERR34;          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
640          ptr = pt;          ptr = pt;
641          break;          break;
642          }          }
643    
644        /* If the sequence of hex digits does not end with '}', then we don't        /* If the sequence of hex digits does not end with '}', then we don't
645        recognize this construct; fall through to the normal \x handling. */        recognize this construct; fall through to the normal \x handling. */
646        }        }
 #endif  
647    
648      /* Read just a single hex char */      /* Read just a single-byte hex-defined char */
649    
650      c = 0;      c = 0;
651      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
652        {        {
653        int cc;                               /* Some compilers don't like ++ */        int cc;                               /* Some compilers don't like ++ */
654        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                        /* in initializers */
655  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
656        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= 'a') cc -= 32;              /* Convert to upper case */
657        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
658  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
659        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= 'z') cc += 64;              /* Convert to upper case */
660        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
661  #endif  #endif
662        }        }
663      break;      break;
664    
665      /* Other special escapes not starting with a digit are straightforward */      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
666        This coding is ASCII-specific, but then the whole concept of \cx is
667        ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
668    
669      case 'c':      case 'c':
670      c = *(++ptr);      c = *(++ptr);
671      if (c == 0)      if (c == 0)
672        {        {
673        *errorcodeptr = ERR2;        *errorcodeptr = ERR2;
674        return 0;        break;
675        }        }
676    
677      /* A letter is upper-cased; then the 0x40 bit is flipped. This coding  #ifndef EBCDIC  /* ASCII coding */
     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.  
     (However, an EBCDIC equivalent has now been added.) */  
   
 #if !EBCDIC    /* ASCII coding */  
678      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= 'a' && c <= 'z') c -= 32;
679      c ^= 0x40;      c ^= 0x40;
680  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
681      if (c >= 'a' && c <= 'z') c += 64;      if (c >= 'a' && c <= 'z') c += 64;
682      c ^= 0xC0;      c ^= 0xC0;
683  #endif  #endif
# Line 560  escape sequence. Line 719  escape sequence.
719  Argument:  Argument:
720    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
721    negptr         points to a boolean that is set TRUE for negation else FALSE    negptr         points to a boolean that is set TRUE for negation else FALSE
722      dptr           points to an int that is set to the detailed property value
723    errorcodeptr   points to the error code variable    errorcodeptr   points to the error code variable
724    
725  Returns:     value from ucp_type_table, or -1 for an invalid type  Returns:         type value from ucp_type_table, or -1 for an invalid type
726  */  */
727    
728  static int  static int
729  get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)  get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
730  {  {
731  int c, i, bot, top;  int c, i, bot, top;
732  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
733  char name[4];  char name[32];
734    
735  c = *(++ptr);  c = *(++ptr);
736  if (c == 0) goto ERROR_RETURN;  if (c == 0) goto ERROR_RETURN;
737    
738  *negptr = FALSE;  *negptr = FALSE;
739    
740  /* \P or \p can be followed by a one- or two-character name in {}, optionally  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
741  preceded by ^ for negation. */  negation. */
742    
743  if (c == '{')  if (c == '{')
744    {    {
# Line 587  if (c == '{') Line 747  if (c == '{')
747      *negptr = TRUE;      *negptr = TRUE;
748      ptr++;      ptr++;
749      }      }
750    for (i = 0; i <= 2; i++)    for (i = 0; i < (int)sizeof(name) - 1; i++)
751      {      {
752      c = *(++ptr);      c = *(++ptr);
753      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
754      if (c == '}') break;      if (c == '}') break;
755      name[i] = c;      name[i] = c;
756      }      }
757    if (c !='}')   /* Try to distinguish error cases */    if (c !='}') goto ERROR_RETURN;
     {  
     while (*(++ptr) != 0 && *ptr != '}');  
     if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;  
     }  
758    name[i] = 0;    name[i] = 0;
759    }    }
760    
# Line 619  top = _pcre_utt_size; Line 775  top = _pcre_utt_size;
775    
776  while (bot < top)  while (bot < top)
777    {    {
778    i = (bot + top)/2;    i = (bot + top) >> 1;
779    c = strcmp(name, _pcre_utt[i].name);    c = strcmp(name, _pcre_utt[i].name);
780    if (c == 0) return _pcre_utt[i].value;    if (c == 0)
781        {
782        *dptr = _pcre_utt[i].value;
783        return _pcre_utt[i].type;
784        }
785    if (c > 0) bot = i + 1; else top = i;    if (c > 0) bot = i + 1; else top = i;
786    }    }
787    
 UNKNOWN_RETURN:  
788  *errorcodeptr = ERR47;  *errorcodeptr = ERR47;
789  *ptrptr = ptr;  *ptrptr = ptr;
790  return -1;  return -1;
# Line 698  read_repeat_counts(const uschar *p, int Line 857  read_repeat_counts(const uschar *p, int
857  int min = 0;  int min = 0;
858  int max = -1;  int max = -1;
859    
860    /* Read the minimum value and do a paranoid check: a negative value indicates
861    an integer overflow. */
862    
863  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
864    if (min < 0 || min > 65535)
865      {
866      *errorcodeptr = ERR5;
867      return p;
868      }
869    
870    /* Read the maximum value if there is one, and again do a paranoid check on its size.
871    Also, max must not be less than min. */
872    
873  if (*p == '}') max = min; else  if (*p == '}') max = min; else
874    {    {
# Line 706  if (*p == '}') max = min; else Line 876  if (*p == '}') max = min; else
876      {      {
877      max = 0;      max = 0;
878      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
879        if (max < 0 || max > 65535)
880          {
881          *errorcodeptr = ERR5;
882          return p;
883          }
884      if (max < min)      if (max < min)
885        {        {
886        *errorcodeptr = ERR4;        *errorcodeptr = ERR4;
# Line 714  if (*p == '}') max = min; else Line 889  if (*p == '}') max = min; else
889      }      }
890    }    }
891    
892  /* Do paranoid checks, then fill in the required variables, and pass back the  /* Fill in the required variables, and pass back the pointer to the terminating
893  pointer to the terminating '}'. */  '}'. */
894    
895  if (min > 65535 || max > 65535)  *minp = min;
896    *errorcodeptr = ERR5;  *maxp = max;
897  else  return p;
898    }
899    
900    
901    
902    /*************************************************
903    *       Find forward referenced subpattern       *
904    *************************************************/
905    
906    /* This function scans along a pattern's text looking for capturing
907    subpatterns, and counting them. If it finds a named pattern that matches the
908    name it is given, it returns its number. Alternatively, if the name is NULL, it
909    returns when it reaches a given numbered subpattern. This is used for forward
910    references to subpatterns. We know that if (?P< is encountered, the name will
911    be terminated by '>' because that is checked in the first pass.
912    
913    Arguments:
914      ptr          current position in the pattern
915      count        current count of capturing parens so far encountered
916      name         name to seek, or NULL if seeking a numbered subpattern
917      lorn         name length, or subpattern number if name is NULL
918      xmode        TRUE if we are in /x mode
919    
920    Returns:       the number of the named subpattern, or -1 if not found
921    */
922    
923    static int
924    find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
925      BOOL xmode)
926    {
927    const uschar *thisname;
928    
929    for (; *ptr != 0; ptr++)
930    {    {
931    *minp = min;    int term;
932    *maxp = max;  
933      /* Skip over backslashed characters and also entire \Q...\E */
934    
935      if (*ptr == '\\')
936        {
937        if (*(++ptr) == 0) return -1;
938        if (*ptr == 'Q') for (;;)
939          {
940          while (*(++ptr) != 0 && *ptr != '\\');
941          if (*ptr == 0) return -1;
942          if (*(++ptr) == 'E') break;
943          }
944        continue;
945        }
946    
947      /* Skip over character classes */
948    
949      if (*ptr == '[')
950        {
951        while (*(++ptr) != ']')
952          {
953          if (*ptr == '\\')
954            {
955            if (*(++ptr) == 0) return -1;
956            if (*ptr == 'Q') for (;;)
957              {
958              while (*(++ptr) != 0 && *ptr != '\\');
959              if (*ptr == 0) return -1;
960              if (*(++ptr) == 'E') break;
961              }
962            continue;
963            }
964          }
965        continue;
966        }
967    
968      /* Skip comments in /x mode */
969    
970      if (xmode && *ptr == '#')
971        {
972        while (*(++ptr) != 0 && *ptr != '\n');
973        if (*ptr == 0) return -1;
974        continue;
975        }
976    
977      /* An opening parens must now be a real metacharacter */
978    
979      if (*ptr != '(') continue;
980      if (ptr[1] != '?' && ptr[1] != '*')
981        {
982        count++;
983        if (name == NULL && count == lorn) return count;
984        continue;
985        }
986    
987      ptr += 2;
988      if (*ptr == 'P') ptr++;                      /* Allow optional P */
989    
990      /* We have to disambiguate (?<! and (?<= from (?<name> */
991    
992      if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
993           *ptr != '\'')
994        continue;
995    
996      count++;
997    
998      if (name == NULL && count == lorn) return count;
999      term = *ptr++;
1000      if (term == '<') term = '>';
1001      thisname = ptr;
1002      while (*ptr != term) ptr++;
1003      if (name != NULL && lorn == ptr - thisname &&
1004          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1005        return count;
1006    }    }
1007  return p;  
1008    return -1;
1009  }  }
1010    
1011    
# Line 778  for (;;) Line 1059  for (;;)
1059    
1060      case OP_CALLOUT:      case OP_CALLOUT:
1061      case OP_CREF:      case OP_CREF:
1062      case OP_BRANUMBER:      case OP_RREF:
1063        case OP_DEF:
1064      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1065      break;      break;
1066    
# Line 823  for (;;) Line 1105  for (;;)
1105    {    {
1106    int d;    int d;
1107    register int op = *cc;    register int op = *cc;
   if (op >= OP_BRA) op = OP_BRA;  
1108    
1109    switch (op)    switch (op)
1110      {      {
1111        case OP_CBRA:
1112      case OP_BRA:      case OP_BRA:
1113      case OP_ONCE:      case OP_ONCE:
1114      case OP_COND:      case OP_COND:
1115      d = find_fixedlength(cc, options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1116      if (d < 0) return d;      if (d < 0) return d;
1117      branchlength += d;      branchlength += d;
1118      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 865  for (;;) Line 1147  for (;;)
1147      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1148    
1149      case OP_REVERSE:      case OP_REVERSE:
     case OP_BRANUMBER:  
1150      case OP_CREF:      case OP_CREF:
1151        case OP_RREF:
1152        case OP_DEF:
1153      case OP_OPT:      case OP_OPT:
1154      case OP_CALLOUT:      case OP_CALLOUT:
1155      case OP_SOD:      case OP_SOD:
# Line 884  for (;;) Line 1167  for (;;)
1167    
1168      case OP_CHAR:      case OP_CHAR:
1169      case OP_CHARNC:      case OP_CHARNC:
1170        case OP_NOT:
1171      branchlength++;      branchlength++;
1172      cc += 2;      cc += 2;
1173  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 917  for (;;) Line 1201  for (;;)
1201    
1202      case OP_PROP:      case OP_PROP:
1203      case OP_NOTPROP:      case OP_NOTPROP:
1204      cc++;      cc += 2;
1205      /* Fall through */      /* Fall through */
1206    
1207      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
# Line 998  Returns:      pointer to the opcode for Line 1282  Returns:      pointer to the opcode for
1282  static const uschar *  static const uschar *
1283  find_bracket(const uschar *code, BOOL utf8, int number)  find_bracket(const uschar *code, BOOL utf8, int number)
1284  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1285  for (;;)  for (;;)
1286    {    {
1287    register int c = *code;    register int c = *code;
1288    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1289    else if (c > OP_BRA)  
1290      /* XCLASS is used for classes that cannot be represented just by a bit
1291      map. This includes negated single high-valued characters. The length in
1292      the table is zero; the actual length is stored in the compiled code. */
1293    
1294      if (c == OP_XCLASS) code += GET(code, 1);
1295    
1296      /* Handle capturing bracket */
1297    
1298      else if (c == OP_CBRA)
1299      {      {
1300      int n = c - OP_BRA;      int n = GET2(code, 1+LINK_SIZE);
     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);  
1301      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1302      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[c];
1303      }      }
1304    
1305      /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1306      a multi-byte character. The length in the table is a minimum, so we have to
1307      arrange to skip the extra bytes. */
1308    
1309    else    else
1310      {      {
1311      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
1312  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1313      if (utf8) switch(c)      if (utf8) switch(c)
1314        {        {
1315        case OP_CHAR:        case OP_CHAR:
# Line 1031  for (;;) Line 1317  for (;;)
1317        case OP_EXACT:        case OP_EXACT:
1318        case OP_UPTO:        case OP_UPTO:
1319        case OP_MINUPTO:        case OP_MINUPTO:
1320          case OP_POSUPTO:
1321        case OP_STAR:        case OP_STAR:
1322        case OP_MINSTAR:        case OP_MINSTAR:
1323          case OP_POSSTAR:
1324        case OP_PLUS:        case OP_PLUS:
1325        case OP_MINPLUS:        case OP_MINPLUS:
1326          case OP_POSPLUS:
1327        case OP_QUERY:        case OP_QUERY:
1328        case OP_MINQUERY:        case OP_MINQUERY:
1329        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1330        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1331        break;        break;
1332        }        }
1333  #endif  #endif
# Line 1072  Returns:      pointer to the opcode for Line 1354  Returns:      pointer to the opcode for
1354  static const uschar *  static const uschar *
1355  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const uschar *code, BOOL utf8)
1356  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1357  for (;;)  for (;;)
1358    {    {
1359    register int c = *code;    register int c = *code;
1360    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1361    else if (c == OP_RECURSE) return code;    if (c == OP_RECURSE) return code;
1362    else if (c > OP_BRA)  
1363      {    /* XCLASS is used for classes that cannot be represented just by a bit
1364      code += _pcre_OP_lengths[OP_BRA];    map. This includes negated single high-valued characters. The length in
1365      }    the table is zero; the actual length is stored in the compiled code. */
1366    
1367      if (c == OP_XCLASS) code += GET(code, 1);
1368    
1369      /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1370      that are followed by a character may be followed by a multi-byte character.
1371      The length in the table is a minimum, so we have to arrange to skip the extra
1372      bytes. */
1373    
1374    else    else
1375      {      {
1376      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
1377  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1378      if (utf8) switch(c)      if (utf8) switch(c)
1379        {        {
1380        case OP_CHAR:        case OP_CHAR:
# Line 1103  for (;;) Line 1382  for (;;)
1382        case OP_EXACT:        case OP_EXACT:
1383        case OP_UPTO:        case OP_UPTO:
1384        case OP_MINUPTO:        case OP_MINUPTO:
1385          case OP_POSUPTO:
1386        case OP_STAR:        case OP_STAR:
1387        case OP_MINSTAR:        case OP_MINSTAR:
1388          case OP_POSSTAR:
1389        case OP_PLUS:        case OP_PLUS:
1390        case OP_MINPLUS:        case OP_MINPLUS:
1391          case OP_POSPLUS:
1392        case OP_QUERY:        case OP_QUERY:
1393        case OP_MINQUERY:        case OP_MINQUERY:
1394        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1395        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1396        break;        break;
1397        }        }
1398  #endif  #endif
# Line 1132  for (;;) Line 1407  for (;;)
1407  *************************************************/  *************************************************/
1408    
1409  /* This function scans through a branch of a compiled pattern to see whether it  /* This function scans through a branch of a compiled pattern to see whether it
1410  can match the empty string or not. It is called only from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1411  below. Note that first_significant_code() skips over assertions. If we hit an  below and from compile_branch() when checking for an unlimited repeat of a
1412  unclosed bracket, we return "empty" - this means we've struck an inner bracket  group that can match nothing. Note that first_significant_code() skips over
1413  whose current branch will already have been scanned.  assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1414    struck an inner bracket whose current branch will already have been scanned.
1415    
1416  Arguments:  Arguments:
1417    code        points to start of search    code        points to start of search
# Line 1149  static BOOL Line 1425  static BOOL
1425  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1426  {  {
1427  register int c;  register int c;
1428  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1429       code < endcode;       code < endcode;
1430       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1431    {    {
# Line 1157  for (code = first_significant_code(code Line 1433  for (code = first_significant_code(code
1433    
1434    c = *code;    c = *code;
1435    
1436    if (c >= OP_BRA)    /* Groups with zero repeats can of course be empty; skip them. */
1437    
1438      if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1439        {
1440        code += _pcre_OP_lengths[c];
1441        do code += GET(code, 1); while (*code == OP_ALT);
1442        c = *code;
1443        continue;
1444        }
1445    
1446      /* For other groups, scan the branches. */
1447    
1448      if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1449      {      {
1450      BOOL empty_branch;      BOOL empty_branch;
1451      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1173  for (code = first_significant_code(code Line 1461  for (code = first_significant_code(code
1461        }        }
1462      while (*code == OP_ALT);      while (*code == OP_ALT);
1463      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
     code += 1 + LINK_SIZE;  
1464      c = *code;      c = *code;
1465        continue;
1466      }      }
1467    
1468    else switch (c)    /* Handle the other opcodes */
1469    
1470      switch (c)
1471      {      {
1472      /* Check for quantifiers after a class */      /* Check for quantifiers after a class */
1473    
# Line 1233  for (code = first_significant_code(code Line 1523  for (code = first_significant_code(code
1523      case OP_NOT:      case OP_NOT:
1524      case OP_PLUS:      case OP_PLUS:
1525      case OP_MINPLUS:      case OP_MINPLUS:
1526        case OP_POSPLUS:
1527      case OP_EXACT:      case OP_EXACT:
1528      case OP_NOTPLUS:      case OP_NOTPLUS:
1529      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
1530        case OP_NOTPOSPLUS:
1531      case OP_NOTEXACT:      case OP_NOTEXACT:
1532      case OP_TYPEPLUS:      case OP_TYPEPLUS:
1533      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
1534        case OP_TYPEPOSPLUS:
1535      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1536      return FALSE;      return FALSE;
1537    
# Line 1250  for (code = first_significant_code(code Line 1543  for (code = first_significant_code(code
1543      case OP_ALT:      case OP_ALT:
1544      return TRUE;      return TRUE;
1545    
1546      /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1547      followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
1548    
1549  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1550      case OP_STAR:      case OP_STAR:
1551      case OP_MINSTAR:      case OP_MINSTAR:
1552        case OP_POSSTAR:
1553      case OP_QUERY:      case OP_QUERY:
1554      case OP_MINQUERY:      case OP_MINQUERY:
1555        case OP_POSQUERY:
1556      case OP_UPTO:      case OP_UPTO:
1557      case OP_MINUPTO:      case OP_MINUPTO:
1558        case OP_POSUPTO:
1559      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1560      break;      break;
1561  #endif  #endif
# Line 1377  earlier groups that are outside the curr Line 1673  earlier groups that are outside the curr
1673  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1674  it, after it has been compiled. This means that any OP_RECURSE items within it  it, after it has been compiled. This means that any OP_RECURSE items within it
1675  that refer to the group itself or any contained groups have to have their  that refer to the group itself or any contained groups have to have their
1676  offsets adjusted. That is the job of this function. Before it is called, the  offsets adjusted. That is one of the jobs of this function. Before it is called,
1677  partially compiled regex must be temporarily terminated with OP_END.  the partially compiled regex must be temporarily terminated with OP_END.
1678    
1679    This function has been extended with the possibility of forward references for
1680    recursions and subroutine calls. It must also check the list of such references
1681    for the group we are dealing with. If it finds that one of the recursions in
1682    the current group is on this list, it adjusts the offset in the list, not the
1683    value in the reference (which is a group number).
1684    
1685  Arguments:  Arguments:
1686    group      points to the start of the group    group      points to the start of the group
1687    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
1688    utf8       TRUE in UTF-8 mode    utf8       TRUE in UTF-8 mode
1689    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
1690      save_hwm   the hwm forward reference pointer at the start of the group
1691    
1692  Returns:     nothing  Returns:     nothing
1693  */  */
1694    
1695  static void  static void
1696  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1697      uschar *save_hwm)
1698  {  {
1699  uschar *ptr = group;  uschar *ptr = group;
1700  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1701    {    {
1702    int offset = GET(ptr, 1);    int offset;
1703    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);    uschar *hc;
1704    
1705      /* See if this recursion is on the forward reference list. If so, adjust the
1706      reference. */
1707    
1708      for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1709        {
1710        offset = GET(hc, 0);
1711        if (cd->start_code + offset == ptr + 1)
1712          {
1713          PUT(hc, 0, offset + adjust);
1714          break;
1715          }
1716        }
1717    
1718      /* Otherwise, adjust the recursion offset if it's after the start of this
1719      group. */
1720    
1721      if (hc >= cd->hwm)
1722        {
1723        offset = GET(ptr, 1);
1724        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1725        }
1726    
1727    ptr += 1 + LINK_SIZE;    ptr += 1 + LINK_SIZE;
1728    }    }
1729  }  }
# Line 1475  Yield:        TRUE when range returned; Line 1802  Yield:        TRUE when range returned;
1802  */  */
1803    
1804  static BOOL  static BOOL
1805  get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1806      unsigned int *odptr)
1807  {  {
1808  int c, chartype, othercase, next;  unsigned int c, othercase, next;
1809    
1810  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
1811    {    { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
   if (_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0)  
     break;  
   }  
1812    
1813  if (c > d) return FALSE;  if (c > d) return FALSE;
1814    
# Line 1492  next = othercase + 1; Line 1817  next = othercase + 1;
1817    
1818  for (++c; c <= d; c++)  for (++c; c <= d; c++)
1819    {    {
1820    if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_L ||    if (_pcre_ucp_othercase(c) != next) break;
         othercase != next)  
     break;  
1821    next++;    next++;
1822    }    }
1823    
# Line 1506  return TRUE; Line 1829  return TRUE;
1829  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1830    
1831    
1832    
1833  /*************************************************  /*************************************************
1834  *           Compile one branch                   *  *     Check if auto-possessifying is possible    *
1835  *************************************************/  *************************************************/
1836    
1837  /* Scan the pattern, compiling it into the code vector. If the options are  /* This function is called for unlimited repeats of certain items, to see
1838  changed during the branch, the pointer is used to change the external options  whether the next thing could possibly match the repeated item. If not, it makes
1839  bits.  sense to automatically possessify the repeated item.
1840    
1841  Arguments:  Arguments:
1842    optionsptr     pointer to the option bits    op_code       the repeated op code
1843    brackets       points to number of extracting brackets used    item          data for this item, depends on the opcode
1844    codeptr        points to the pointer to the current code point    utf8          TRUE in UTF-8 mode
1845    ptrptr         points to the current pattern pointer    utf8_char     used for utf8 character bytes, NULL if not relevant
1846    errorcodeptr   points to error code variable    ptr           next character in pattern
1847    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    options       options bits
1848    reqbyteptr     set to the last literal character required, else < 0    cd            contains pointers to tables etc.
   bcptr          points to current branch chain  
   cd             contains pointers to tables etc.  
1849    
1850  Returns:         TRUE on success  Returns:        TRUE if possessifying is wanted
                  FALSE, with *errorcodeptr set non-zero on error  
1851  */  */
1852    
1853  static BOOL  static BOOL
1854  compile_branch(int *optionsptr, int *brackets, uschar **codeptr,  check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1855    const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,    const uschar *ptr, int options, compile_data *cd)
   int *reqbyteptr, branch_chain *bcptr, compile_data *cd)  
1856  {  {
1857  int repeat_type, op_type;  int next;
 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  
 int bravalue = 0;  
 int greedy_default, greedy_non_default;  
 int firstbyte, reqbyte;  
 int zeroreqbyte, zerofirstbyte;  
 int req_caseopt, reqvary, tempreqvary;  
 int condcount = 0;  
 int options = *optionsptr;  
 int after_manual_callout = 0;  
 register int c;  
 register uschar *code = *codeptr;  
 uschar *tempcode;  
 BOOL inescq = FALSE;  
 BOOL groupsetfirstbyte = FALSE;  
 const uschar *ptr = *ptrptr;  
 const uschar *tempptr;  
 uschar *previous = NULL;  
 uschar *previous_callout = NULL;  
 uschar classbits[32];  
1858    
1859  #ifdef SUPPORT_UTF8  /* Skip whitespace and comments in extended mode */
 BOOL class_utf8;  
 BOOL utf8 = (options & PCRE_UTF8) != 0;  
 uschar *class_utf8data;  
 uschar utf8_char[6];  
 #else  
 BOOL utf8 = FALSE;  
 #endif  
1860    
1861  /* Set up the default and non-default settings for greediness */  if ((options & PCRE_EXTENDED) != 0)
1862      {
1863      for (;;)
1864        {
1865        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1866        if (*ptr == '#')
1867          {
1868          while (*(++ptr) != 0)
1869            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1870          }
1871        else break;
1872        }
1873      }
1874    
1875  greedy_default = ((options & PCRE_UNGREEDY) != 0);  /* If the next item is one that we can handle, get its value. A non-negative
1876  greedy_non_default = greedy_default ^ 1;  value is a character, a negative value is an escape value. */
1877    
1878  /* Initialize no first byte, no required byte. REQ_UNSET means "no char  if (*ptr == '\\')
1879  matching encountered yet". It gets changed to REQ_NONE if we hit something that    {
1880  matches a non-fixed char first char; reqbyte just remains unset if we never    int temperrorcode = 0;
1881  find one.    next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1882      if (temperrorcode != 0) return FALSE;
1883      ptr++;    /* Point after the escape sequence */
1884      }
1885    
1886  When we hit a repeat whose minimum is zero, we may have to adjust these values  else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1887  to take the zero repeat into account. This is implemented by setting them to    {
1888  zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual  #ifdef SUPPORT_UTF8
1889  item types that can be repeated set these backoff variables appropriately. */    if (utf8) { GETCHARINC(next, ptr); } else
1890    #endif
1891      next = *ptr++;
1892      }
1893    
1894  firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;  else return FALSE;
1895    
1896  /* The variable req_caseopt contains either the REQ_CASELESS value or zero,  /* Skip whitespace and comments in extended mode */
1897  according to the current setting of the caseless flag. REQ_CASELESS is a bit  
1898  value > 255. It is added into the firstbyte or reqbyte variables to record the  if ((options & PCRE_EXTENDED) != 0)
1899      {
1900      for (;;)
1901        {
1902        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1903        if (*ptr == '#')
1904          {
1905          while (*(++ptr) != 0)
1906            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1907          }
1908        else break;
1909        }
1910      }
1911    
1912    /* If the next thing is itself optional, we have to give up. */
1913    
1914    if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1915      return FALSE;
1916    
1917    /* Now compare the next item with the previous opcode. If the previous is a
1918    positive single character match, "item" either contains the character or, if
1919    "item" is greater than 127 in utf8 mode, the character's bytes are in
1920    utf8_char. */
1921    
1922    
1923    /* Handle cases when the next item is a character. */
1924    
1925    if (next >= 0) switch(op_code)
1926      {
1927      case OP_CHAR:
1928    #ifdef SUPPORT_UTF8
1929      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1930    #endif
1931      return item != next;
1932    
1933      /* For CHARNC (caseless character) we must check the other case. If we have
1934      Unicode property support, we can use it to test the other case of
1935      high-valued characters. */
1936    
1937      case OP_CHARNC:
1938    #ifdef SUPPORT_UTF8
1939      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1940    #endif
1941      if (item == next) return FALSE;
1942    #ifdef SUPPORT_UTF8
1943      if (utf8)
1944        {
1945        unsigned int othercase;
1946        if (next < 128) othercase = cd->fcc[next]; else
1947    #ifdef SUPPORT_UCP
1948        othercase = _pcre_ucp_othercase((unsigned int)next);
1949    #else
1950        othercase = NOTACHAR;
1951    #endif
1952        return (unsigned int)item != othercase;
1953        }
1954      else
1955    #endif  /* SUPPORT_UTF8 */
1956      return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
1957    
1958      /* For OP_NOT, "item" must be a single-byte character. */
1959    
1960      case OP_NOT:
1961      if (next < 0) return FALSE;  /* Not a character */
1962      if (item == next) return TRUE;
1963      if ((options & PCRE_CASELESS) == 0) return FALSE;
1964    #ifdef SUPPORT_UTF8
1965      if (utf8)
1966        {
1967        unsigned int othercase;
1968        if (next < 128) othercase = cd->fcc[next]; else
1969    #ifdef SUPPORT_UCP
1970        othercase = _pcre_ucp_othercase(next);
1971    #else
1972        othercase = NOTACHAR;
1973    #endif
1974        return (unsigned int)item == othercase;
1975        }
1976      else
1977    #endif  /* SUPPORT_UTF8 */
1978      return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
1979    
1980      case OP_DIGIT:
1981      return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1982    
1983      case OP_NOT_DIGIT:
1984      return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1985    
1986      case OP_WHITESPACE:
1987      return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1988    
1989      case OP_NOT_WHITESPACE:
1990      return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1991    
1992      case OP_WORDCHAR:
1993      return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1994    
1995      case OP_NOT_WORDCHAR:
1996      return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1997    
1998      case OP_HSPACE:
1999      case OP_NOT_HSPACE:
2000      switch(next)
2001        {
2002        case 0x09:
2003        case 0x20:
2004        case 0xa0:
2005        case 0x1680:
2006        case 0x180e:
2007        case 0x2000:
2008        case 0x2001:
2009        case 0x2002:
2010        case 0x2003:
2011        case 0x2004:
2012        case 0x2005:
2013        case 0x2006:
2014        case 0x2007:
2015        case 0x2008:
2016        case 0x2009:
2017        case 0x200A:
2018        case 0x202f:
2019        case 0x205f:
2020        case 0x3000:
2021        return op_code != OP_HSPACE;
2022        default:
2023        return op_code == OP_HSPACE;
2024        }
2025    
2026      case OP_VSPACE:
2027      case OP_NOT_VSPACE:
2028      switch(next)
2029        {
2030        case 0x0a:
2031        case 0x0b:
2032        case 0x0c:
2033        case 0x0d:
2034        case 0x85:
2035        case 0x2028:
2036        case 0x2029:
2037        return op_code != OP_VSPACE;
2038        default:
2039        return op_code == OP_VSPACE;
2040        }
2041    
2042      default:
2043      return FALSE;
2044      }
2045    
2046    
2047    /* Handle the case when the next item is \d, \s, etc. */
2048    
2049    switch(op_code)
2050      {
2051      case OP_CHAR:
2052      case OP_CHARNC:
2053    #ifdef SUPPORT_UTF8
2054      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2055    #endif
2056      switch(-next)
2057        {
2058        case ESC_d:
2059        return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2060    
2061        case ESC_D:
2062        return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2063    
2064        case ESC_s:
2065        return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2066    
2067        case ESC_S:
2068        return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2069    
2070        case ESC_w:
2071        return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2072    
2073        case ESC_W:
2074        return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2075    
2076        case ESC_h:
2077        case ESC_H:
2078        switch(item)
2079          {
2080          case 0x09:
2081          case 0x20:
2082          case 0xa0:
2083          case 0x1680:
2084          case 0x180e:
2085          case 0x2000:
2086          case 0x2001:
2087          case 0x2002:
2088          case 0x2003:
2089          case 0x2004:
2090          case 0x2005:
2091          case 0x2006:
2092          case 0x2007:
2093          case 0x2008:
2094          case 0x2009:
2095          case 0x200A:
2096          case 0x202f:
2097          case 0x205f:
2098          case 0x3000:
2099          return -next != ESC_h;
2100          default:
2101          return -next == ESC_h;
2102          }
2103    
2104        case ESC_v:
2105        case ESC_V:
2106        switch(item)
2107          {
2108          case 0x0a:
2109          case 0x0b:
2110          case 0x0c:
2111          case 0x0d:
2112          case 0x85:
2113          case 0x2028:
2114          case 0x2029:
2115          return -next != ESC_v;
2116          default:
2117          return -next == ESC_v;
2118          }
2119    
2120        default:
2121        return FALSE;
2122        }
2123    
2124      case OP_DIGIT:
2125      return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2126             next == -ESC_h || next == -ESC_v;
2127    
2128      case OP_NOT_DIGIT:
2129      return next == -ESC_d;
2130    
2131      case OP_WHITESPACE:
2132      return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2133    
2134      case OP_NOT_WHITESPACE:
2135      return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2136    
2137      case OP_HSPACE:
2138      return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2139    
2140      case OP_NOT_HSPACE:
2141      return next == -ESC_h;
2142    
2143      /* Can't have \S in here because VT matches \S (Perl anomaly) */
2144      case OP_VSPACE:
2145      return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2146    
2147      case OP_NOT_VSPACE:
2148      return next == -ESC_v;
2149    
2150      case OP_WORDCHAR:
2151      return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2152    
2153      case OP_NOT_WORDCHAR:
2154      return next == -ESC_w || next == -ESC_d;
2155    
2156      default:
2157      return FALSE;
2158      }
2159    
2160    /* Control does not reach here */
2161    }
2162    
2163    
2164    
2165    /*************************************************
2166    *           Compile one branch                   *
2167    *************************************************/
2168    
2169    /* Scan the pattern, compiling it into the code vector. If the options are
2170    changed during the branch, the pointer is used to change the external options
2171    bits. This function is used during the pre-compile phase when we are trying
2172    to find out the amount of memory needed, as well as during the real compile
2173    phase. The value of lengthptr distinguishes the two phases.
2174    
2175    Arguments:
2176      optionsptr     pointer to the option bits
2177      codeptr        points to the pointer to the current code point
2178      ptrptr         points to the current pattern pointer
2179      errorcodeptr   points to error code variable
2180      firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2181      reqbyteptr     set to the last literal character required, else < 0
2182      bcptr          points to current branch chain
2183      cd             contains pointers to tables etc.
2184      lengthptr      NULL during the real compile phase
2185                     points to length accumulator during pre-compile phase
2186    
2187    Returns:         TRUE on success
2188                     FALSE, with *errorcodeptr set non-zero on error
2189    */
2190    
2191    static BOOL
2192    compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2193      int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2194      compile_data *cd, int *lengthptr)
2195    {
2196    int repeat_type, op_type;
2197    int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2198    int bravalue = 0;
2199    int greedy_default, greedy_non_default;
2200    int firstbyte, reqbyte;
2201    int zeroreqbyte, zerofirstbyte;
2202    int req_caseopt, reqvary, tempreqvary;
2203    int options = *optionsptr;
2204    int after_manual_callout = 0;
2205    int length_prevgroup = 0;
2206    register int c;
2207    register uschar *code = *codeptr;
2208    uschar *last_code = code;
2209    uschar *orig_code = code;
2210    uschar *tempcode;
2211    BOOL inescq = FALSE;
2212    BOOL groupsetfirstbyte = FALSE;
2213    const uschar *ptr = *ptrptr;
2214    const uschar *tempptr;
2215    uschar *previous = NULL;
2216    uschar *previous_callout = NULL;
2217    uschar *save_hwm = NULL;
2218    uschar classbits[32];
2219    
2220    #ifdef SUPPORT_UTF8
2221    BOOL class_utf8;
2222    BOOL utf8 = (options & PCRE_UTF8) != 0;
2223    uschar *class_utf8data;
2224    uschar utf8_char[6];
2225    #else
2226    BOOL utf8 = FALSE;
2227    uschar *utf8_char = NULL;
2228    #endif
2229    
2230    #ifdef DEBUG
2231    if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2232    #endif
2233    
2234    /* Set up the default and non-default settings for greediness */
2235    
2236    greedy_default = ((options & PCRE_UNGREEDY) != 0);
2237    greedy_non_default = greedy_default ^ 1;
2238    
2239    /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2240    matching encountered yet". It gets changed to REQ_NONE if we hit something that
2241    matches a non-fixed char first char; reqbyte just remains unset if we never
2242    find one.
2243    
2244    When we hit a repeat whose minimum is zero, we may have to adjust these values
2245    to take the zero repeat into account. This is implemented by setting them to
2246    zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2247    item types that can be repeated set these backoff variables appropriately. */
2248    
2249    firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2250    
2251    /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2252    according to the current setting of the caseless flag. REQ_CASELESS is a bit
2253    value > 255. It is added into the firstbyte or reqbyte variables to record the
2254  case status of the value. This is used only for ASCII characters. */  case status of the value. This is used only for ASCII characters. */
2255    
2256  req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;  req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
# Line 1595  for (;; ptr++) Line 2262  for (;; ptr++)
2262    BOOL negate_class;    BOOL negate_class;
2263    BOOL possessive_quantifier;    BOOL possessive_quantifier;
2264    BOOL is_quantifier;    BOOL is_quantifier;
2265      BOOL is_recurse;
2266      BOOL reset_bracount;
2267    int class_charcount;    int class_charcount;
2268    int class_lastchar;    int class_lastchar;
2269    int newoptions;    int newoptions;
2270    int recno;    int recno;
2271      int refsign;
2272    int skipbytes;    int skipbytes;
2273    int subreqbyte;    int subreqbyte;
2274    int subfirstbyte;    int subfirstbyte;
2275      int terminator;
2276    int mclength;    int mclength;
2277    uschar mcbuffer[8];    uschar mcbuffer[8];
2278    
2279    /* Next byte in the pattern */    /* Get next byte in the pattern */
2280    
2281    c = *ptr;    c = *ptr;
2282    
2283      /* If we are in the pre-compile phase, accumulate the length used for the
2284      previous cycle of this loop. */
2285    
2286      if (lengthptr != NULL)
2287        {
2288    #ifdef DEBUG
2289        if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2290    #endif
2291        if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2292          {
2293          *errorcodeptr = ERR52;
2294          goto FAILED;
2295          }
2296    
2297        /* There is at least one situation where code goes backwards: this is the
2298        case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2299        the class is simply eliminated. However, it is created first, so we have to
2300        allow memory for it. Therefore, don't ever reduce the length at this point.
2301        */
2302    
2303        if (code < last_code) code = last_code;
2304    
2305        /* Paranoid check for integer overflow */
2306    
2307        if (OFLOW_MAX - *lengthptr < code - last_code)
2308          {
2309          *errorcodeptr = ERR20;
2310          goto FAILED;
2311          }
2312    
2313        *lengthptr += code - last_code;
2314        DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2315    
2316        /* If "previous" is set and it is not at the start of the work space, move
2317        it back to there, in order to avoid filling up the work space. Otherwise,
2318        if "previous" is NULL, reset the current code pointer to the start. */
2319    
2320        if (previous != NULL)
2321          {
2322          if (previous > orig_code)
2323            {
2324            memmove(orig_code, previous, code - previous);
2325            code -= previous - orig_code;
2326            previous = orig_code;
2327            }
2328          }
2329        else code = orig_code;
2330    
2331        /* Remember where this code item starts so we can pick up the length
2332        next time round. */
2333    
2334        last_code = code;
2335        }
2336    
2337      /* In the real compile phase, just check the workspace used by the forward
2338      reference list. */
2339    
2340      else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2341        {
2342        *errorcodeptr = ERR52;
2343        goto FAILED;
2344        }
2345    
2346    /* If in \Q...\E, check for the end; if not, we have a literal */    /* If in \Q...\E, check for the end; if not, we have a literal */
2347    
2348    if (inescq && c != 0)    if (inescq && c != 0)
# Line 1623  for (;; ptr++) Line 2357  for (;; ptr++)
2357        {        {
2358        if (previous_callout != NULL)        if (previous_callout != NULL)
2359          {          {
2360          complete_callout(previous_callout, ptr, cd);          if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2361              complete_callout(previous_callout, ptr, cd);
2362          previous_callout = NULL;          previous_callout = NULL;
2363          }          }
2364        if ((options & PCRE_AUTO_CALLOUT) != 0)        if ((options & PCRE_AUTO_CALLOUT) != 0)
# Line 1644  for (;; ptr++) Line 2379  for (;; ptr++)
2379    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2380         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
2381      {      {
2382      complete_callout(previous_callout, ptr, cd);      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2383          complete_callout(previous_callout, ptr, cd);
2384      previous_callout = NULL;      previous_callout = NULL;
2385      }      }
2386    
# Line 1655  for (;; ptr++) Line 2391  for (;; ptr++)
2391      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2392      if (c == '#')      if (c == '#')
2393        {        {
2394        /* The space before the ; is to avoid a warning on a silly compiler        while (*(++ptr) != 0)
2395        on the Macintosh. */          {
2396        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2397        if (c != 0) continue;   /* Else fall through to handle end of string */          }
2398          if (*ptr != 0) continue;
2399    
2400          /* Else fall through to handle end of string */
2401          c = 0;
2402        }        }
2403      }      }
2404    
# Line 1672  for (;; ptr++) Line 2412  for (;; ptr++)
2412    
2413    switch(c)    switch(c)
2414      {      {
2415      /* The branch terminates at end of string, |, or ). */      /* ===================================================================*/
2416        case 0:                        /* The branch terminates at string end */
2417      case 0:      case '|':                      /* or | or ) */
     case '|':  
2418      case ')':      case ')':
2419      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2420      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2421      *codeptr = code;      *codeptr = code;
2422      *ptrptr = ptr;      *ptrptr = ptr;
2423        if (lengthptr != NULL)
2424          {
2425          if (OFLOW_MAX - *lengthptr < code - last_code)
2426            {
2427            *errorcodeptr = ERR20;
2428            goto FAILED;
2429            }
2430          *lengthptr += code - last_code;   /* To include callout length */
2431          DPRINTF((">> end branch\n"));
2432          }
2433      return TRUE;      return TRUE;
2434    
2435    
2436        /* ===================================================================*/
2437      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2438      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2439    
# Line 1711  for (;; ptr++) Line 2462  for (;; ptr++)
2462      *code++ = OP_ANY;      *code++ = OP_ANY;
2463      break;      break;
2464    
2465      /* Character classes. If the included characters are all < 255 in value, we  
2466      build a 32-byte bitmap of the permitted characters, except in the special      /* ===================================================================*/
2467      case where there is only one such character. For negated classes, we build      /* Character classes. If the included characters are all < 256, we build a
2468      the map as usual, then invert it at the end. However, we use a different      32-byte bitmap of the permitted characters, except in the special case
2469      opcode so that data characters > 255 can be handled correctly.      where there is only one such character. For negated classes, we build the
2470        map as usual, then invert it at the end. However, we use a different opcode
2471        so that data characters > 255 can be handled correctly.
2472    
2473      If the class contains characters outside the 0-255 range, a different      If the class contains characters outside the 0-255 range, a different
2474      opcode is compiled. It may optionally have a bit map for characters < 256,      opcode is compiled. It may optionally have a bit map for characters < 256,
# Line 1736  for (;; ptr++) Line 2489  for (;; ptr++)
2489        goto FAILED;        goto FAILED;
2490        }        }
2491    
2492      /* If the first character is '^', set the negation flag and skip it. */      /* If the first character is '^', set the negation flag and skip it. Also,
2493        if the first few characters (either before or after ^) are \Q\E or \E we
2494        skip them too. This makes for compatibility with Perl. */
2495    
2496      if ((c = *(++ptr)) == '^')      negate_class = FALSE;
2497        for (;;)
2498        {        {
       negate_class = TRUE;  
2499        c = *(++ptr);        c = *(++ptr);
2500        }        if (c == '\\')
2501      else          {
2502        {          if (ptr[1] == 'E') ptr++;
2503        negate_class = FALSE;            else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2504                else break;
2505            }
2506          else if (!negate_class && c == '^')
2507            negate_class = TRUE;
2508          else break;
2509        }        }
2510    
2511      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
2512      of just a single character (as long as it's < 256). For higher valued UTF-8      of just a single character (as long as it's < 256). However, For higher
2513      characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
2514    
2515      class_charcount = 0;      class_charcount = 0;
2516      class_lastchar = -1;      class_lastchar = -1;
2517    
2518        /* Initialize the 32-char bit map to all zeros. We build the map in a
2519        temporary bit of memory, in case the class contains only 1 character (less
2520        than 256), because in that case the compiled code doesn't use the bit map.
2521        */
2522    
2523        memset(classbits, 0, 32 * sizeof(uschar));
2524    
2525  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2526      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2527      class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2528  #endif  #endif
2529    
     /* Initialize the 32-char bit map to all zeros. We have to build the  
     map in a temporary bit of store, in case the class contains only 1  
     character (< 256), because in that case the compiled code doesn't use the  
     bit map. */  
   
     memset(classbits, 0, 32 * sizeof(uschar));  
   
2530      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
2531      means that an initial ] is taken as a data character. The first pass      means that an initial ] is taken as a data character. At the start of the
2532      through the regex checked the overall syntax, so we don't need to be very      loop, c contains the first byte of the character. */
     strict here. At the start of the loop, c contains the first byte of the  
     character. */  
2533    
2534      do      if (c != 0) do
2535        {        {
2536          const uschar *oldptr;
2537    
2538  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2539        if (utf8 && c > 127)        if (utf8 && c > 127)
2540          {                           /* Braces are required because the */          {                           /* Braces are required because the */
# Line 1786  for (;; ptr++) Line 2546  for (;; ptr++)
2546    
2547        if (inescq)        if (inescq)
2548          {          {
2549          if (c == '\\' && ptr[1] == 'E')          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2550            {            {
2551            inescq = FALSE;            inescq = FALSE;                   /* Reset literal state */
2552            ptr++;            ptr++;                            /* Skip the 'E' */
2553            continue;            continue;                         /* Carry on with next */
2554            }            }
2555          else goto LONE_SINGLE_CHARACTER;          goto CHECK_RANGE;                   /* Could be range if \E follows */
2556          }          }
2557    
2558        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
# Line 1806  for (;; ptr++) Line 2566  for (;; ptr++)
2566            check_posix_syntax(ptr, &tempptr, cd))            check_posix_syntax(ptr, &tempptr, cd))
2567          {          {
2568          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
2569          int posix_class, i;          int posix_class, taboffset, tabopt;
2570          register const uschar *cbits = cd->cbits;          register const uschar *cbits = cd->cbits;
2571            uschar pbits[32];
2572    
2573          if (ptr[1] != ':')          if (ptr[1] != ':')
2574            {            {
# Line 1836  for (;; ptr++) Line 2597  for (;; ptr++)
2597          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2598            posix_class = 0;            posix_class = 0;
2599    
2600          /* Or into the map we are building up to 3 of the static class          /* We build the bit map for the POSIX class in a chunk of local store
2601          tables, or their negations. The [:blank:] class sets up the same          because we may be adding and subtracting from it, and we don't want to
2602          chars as the [:space:] class (all white space). We remove the vertical          subtract bits that may be in the main map already. At the end we or the
2603          white space chars afterwards. */          result into the bit map that is being built. */
2604    
2605          posix_class *= 3;          posix_class *= 3;
2606          for (i = 0; i < 3; i++)  
2607            /* Copy in the first table (always present) */
2608    
2609            memcpy(pbits, cbits + posix_class_maps[posix_class],
2610              32 * sizeof(uschar));
2611    
2612            /* If there is a second table, add or remove it as required. */
2613    
2614            taboffset = posix_class_maps[posix_class + 1];
2615            tabopt = posix_class_maps[posix_class + 2];
2616    
2617            if (taboffset >= 0)
2618            {            {
2619            BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;            if (tabopt >= 0)
2620            int taboffset = posix_class_maps[posix_class + i];              for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
           if (taboffset < 0) break;  
           if (local_negate)  
             {  
             if (i == 0)  
               for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];  
             else  
               for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];  
             if (blankclass) classbits[1] |= 0x3c;  
             }  
2621            else            else
2622              {              for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];  
             if (blankclass) classbits[1] &= ~0x3c;  
             }  
2623            }            }
2624    
2625            /* Now see if we need to remove any special characters. An option
2626            value of 1 removes vertical space and 2 removes underscore. */
2627    
2628            if (tabopt < 0) tabopt = -tabopt;
2629            if (tabopt == 1) pbits[1] &= ~0x3c;
2630              else if (tabopt == 2) pbits[11] &= 0x7f;
2631    
2632            /* Add the POSIX table or its complement into the main table that is
2633            being built and we are done. */
2634    
2635            if (local_negate)
2636              for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2637            else
2638              for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2639    
2640          ptr = tempptr + 1;          ptr = tempptr + 1;
2641          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
2642          continue;    /* End of POSIX syntax handling */          continue;    /* End of POSIX syntax handling */
2643          }          }
2644    
2645        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
2646        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. The sequence \b is a special
2647        validity in the pre-compiling pass. The sequence \b is a special case.        case. Inside a class (and only there) it is treated as backspace.
2648        Inside a class (and only there) it is treated as backspace. Elsewhere        Elsewhere it marks a word boundary. Other escapes have preset maps ready
2649        it marks a word boundary. Other escapes have preset maps ready to        to 'or' into the one we are building. We assume they have more than one
       or into the one we are building. We assume they have more than one  
2650        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
2651    
2652        if (c == '\\')        if (c == '\\')
2653          {          {
2654          c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2655            if (*errorcodeptr != 0) goto FAILED;
2656    
2657          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2658          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2659            else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2660          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
2661            {            {
2662            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == '\\' && ptr[2] == 'E')
# Line 1895  for (;; ptr++) Line 2671  for (;; ptr++)
2671            {            {
2672            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
2673            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
2674            switch (-c)  
2675              /* Save time by not doing this in the pre-compile phase. */
2676    
2677              if (lengthptr == NULL) switch (-c)
2678              {              {
2679              case ESC_d:              case ESC_d:
2680              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
# Line 1923  for (;; ptr++) Line 2702  for (;; ptr++)
2702              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2703              continue;              continue;
2704    
2705  #ifdef SUPPORT_UCP              case ESC_E: /* Perl ignores an orphan \E */
             case ESC_p:  
             case ESC_P:  
               {  
               BOOL negated;  
               int property = get_ucp(&ptr, &negated, errorcodeptr);  
               if (property < 0) goto FAILED;  
               class_utf8 = TRUE;  
               *class_utf8data++ = ((-c == ESC_p) != negated)?  
                 XCL_PROP : XCL_NOTPROP;  
               *class_utf8data++ = property;  
               class_charcount -= 2;   /* Not a < 256 character */  
               }  
2706              continue;              continue;
 #endif  
2707    
2708              /* Unrecognized escapes are faulted if PCRE is running in its              default:    /* Not recognized; fall through */
2709              strict mode. By default, for compatibility with Perl, they are              break;      /* Need "default" setting to stop compiler warning. */
             treated as literals. */  
   
             default:  
             if ((options & PCRE_EXTRA) != 0)  
               {  
               *errorcodeptr = ERR7;  
               goto FAILED;  
               }  
             c = *ptr;              /* The final character */  
             class_charcount -= 2;  /* Undo the default count from above */  
2710              }              }
           }  
   
         /* Fall through if we have a single character (c >= 0). This may be  
         > 256 in UTF-8 mode. */  
2711    
2712          }   /* End of backslash handling */            /* In the pre-compile phase, just do the recognition. */
2713    
2714        /* A single character may be followed by '-' to form a range. However,            else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2715        Perl does not permit ']' to be the end of the range. A '-' character                     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
       here is treated as a literal. */  
2716    
2717        if (ptr[1] == '-' && ptr[2] != ']')            /* We need to deal with \H, \h, \V, and \v in both phases because
2718          {            they use extra memory. */
2719          int d;  
2720          ptr += 2;            if (-c == ESC_h)
2721                {
2722                SETBIT(classbits, 0x09); /* HT */
2723                SETBIT(classbits, 0x20); /* SPACE */
2724                SETBIT(classbits, 0xa0); /* NBSP */
2725    #ifdef SUPPORT_UTF8
2726                if (utf8)
2727                  {
2728                  class_utf8 = TRUE;
2729                  *class_utf8data++ = XCL_SINGLE;
2730                  class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2731                  *class_utf8data++ = XCL_SINGLE;
2732                  class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2733                  *class_utf8data++ = XCL_RANGE;
2734                  class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2735                  class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2736                  *class_utf8data++ = XCL_SINGLE;
2737                  class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2738                  *class_utf8data++ = XCL_SINGLE;
2739                  class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2740                  *class_utf8data++ = XCL_SINGLE;
2741                  class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2742                  }
2743    #endif
2744                continue;
2745                }
2746    
2747              if (-c == ESC_H)
2748                {
2749                for (c = 0; c < 32; c++)
2750                  {
2751                  int x = 0xff;
2752                  switch (c)
2753                    {
2754                    case 0x09/8: x ^= 1 << (0x09%8); break;
2755                    case 0x20/8: x ^= 1 << (0x20%8); break;
2756                    case 0xa0/8: x ^= 1 << (0xa0%8); break;
2757                    default: break;
2758                    }
2759                  classbits[c] |= x;
2760                  }
2761    
2762    #ifdef SUPPORT_UTF8
2763                if (utf8)
2764                  {
2765                  class_utf8 = TRUE;
2766                  *class_utf8data++ = XCL_RANGE;
2767                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2768                  class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2769                  *class_utf8data++ = XCL_RANGE;
2770                  class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2771                  class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2772                  *class_utf8data++ = XCL_RANGE;
2773                  class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2774                  class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2775                  *class_utf8data++ = XCL_RANGE;
2776                  class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2777                  class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2778                  *class_utf8data++ = XCL_RANGE;
2779                  class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2780                  class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2781                  *class_utf8data++ = XCL_RANGE;
2782                  class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2783                  class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2784                  *class_utf8data++ = XCL_RANGE;
2785                  class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2786                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2787                  }
2788    #endif
2789                continue;
2790                }
2791    
2792              if (-c == ESC_v)
2793                {
2794                SETBIT(classbits, 0x0a); /* LF */
2795                SETBIT(classbits, 0x0b); /* VT */
2796                SETBIT(classbits, 0x0c); /* FF */
2797                SETBIT(classbits, 0x0d); /* CR */
2798                SETBIT(classbits, 0x85); /* NEL */
2799    #ifdef SUPPORT_UTF8
2800                if (utf8)
2801                  {
2802                  class_utf8 = TRUE;
2803                  *class_utf8data++ = XCL_RANGE;
2804                  class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2805                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2806                  }
2807    #endif
2808                continue;
2809                }
2810    
2811              if (-c == ESC_V)
2812                {
2813                for (c = 0; c < 32; c++)
2814                  {
2815                  int x = 0xff;
2816                  switch (c)
2817                    {
2818                    case 0x0a/8: x ^= 1 << (0x0a%8);
2819                                 x ^= 1 << (0x0b%8);
2820                                 x ^= 1 << (0x0c%8);
2821                                 x ^= 1 << (0x0d%8);
2822                                 break;
2823                    case 0x85/8: x ^= 1 << (0x85%8); break;
2824                    default: break;
2825                    }
2826                  classbits[c] |= x;
2827                  }
2828    
2829    #ifdef SUPPORT_UTF8
2830                if (utf8)
2831                  {
2832                  class_utf8 = TRUE;
2833                  *class_utf8data++ = XCL_RANGE;
2834                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2835                  class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2836                  *class_utf8data++ = XCL_RANGE;
2837                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2838                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2839                  }
2840    #endif
2841                continue;
2842                }
2843    
2844              /* We need to deal with \P and \p in both phases. */
2845    
2846    #ifdef SUPPORT_UCP
2847              if (-c == ESC_p || -c == ESC_P)
2848                {
2849                BOOL negated;
2850                int pdata;
2851                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2852                if (ptype < 0) goto FAILED;
2853                class_utf8 = TRUE;
2854                *class_utf8data++ = ((-c == ESC_p) != negated)?
2855                  XCL_PROP : XCL_NOTPROP;
2856                *class_utf8data++ = ptype;
2857                *class_utf8data++ = pdata;
2858                class_charcount -= 2;   /* Not a < 256 character */
2859                continue;
2860                }
2861    #endif
2862              /* Unrecognized escapes are faulted if PCRE is running in its
2863              strict mode. By default, for compatibility with Perl, they are
2864              treated as literals. */
2865    
2866              if ((options & PCRE_EXTRA) != 0)
2867                {
2868                *errorcodeptr = ERR7;
2869                goto FAILED;
2870                }
2871    
2872              class_charcount -= 2;  /* Undo the default count from above */
2873              c = *ptr;              /* Get the final character and fall through */
2874              }
2875    
2876            /* Fall through if we have a single character (c >= 0). This may be
2877            greater than 256 in UTF-8 mode. */
2878    
2879            }   /* End of backslash handling */
2880    
2881          /* A single character may be followed by '-' to form a range. However,
2882          Perl does not permit ']' to be the end of the range. A '-' character
2883          at the end is treated as a literal. Perl ignores orphaned \E sequences
2884          entirely. The code for handling \Q and \E is messy. */
2885    
2886          CHECK_RANGE:
2887          while (ptr[1] == '\\' && ptr[2] == 'E')
2888            {
2889            inescq = FALSE;
2890            ptr += 2;
2891            }
2892    
2893          oldptr = ptr;
2894    
2895          if (!inescq && ptr[1] == '-')
2896            {
2897            int d;
2898            ptr += 2;
2899            while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2900    
2901            /* If we hit \Q (not followed by \E) at this point, go into escaped
2902            mode. */
2903    
2904            while (*ptr == '\\' && ptr[1] == 'Q')
2905              {
2906              ptr += 2;
2907              if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2908              inescq = TRUE;
2909              break;
2910              }
2911    
2912            if (*ptr == 0 || (!inescq && *ptr == ']'))
2913              {
2914              ptr = oldptr;
2915              goto LONE_SINGLE_CHARACTER;
2916              }
2917    
2918  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2919          if (utf8)          if (utf8)
# Line 1981  for (;; ptr++) Line 2928  for (;; ptr++)
2928          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2929          in such circumstances. */          in such circumstances. */
2930    
2931          if (d == '\\')          if (!inescq && d == '\\')
2932            {            {
2933            const uschar *oldptr = ptr;            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2934            d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);            if (*errorcodeptr != 0) goto FAILED;
2935    
2936            /* \b is backspace; \X is literal X; any other special means the '-'            /* \b is backspace; \X is literal X; \R is literal R; any other
2937            was literal */            special means the '-' was literal */
2938    
2939            if (d < 0)            if (d < 0)
2940              {              {
2941              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = '\b';
2942              else if (d == -ESC_X) d = 'X'; else              else if (d == -ESC_X) d = 'X';
2943                else if (d == -ESC_R) d = 'R'; else
2944                {                {
2945                ptr = oldptr - 2;                ptr = oldptr;
2946                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2947                }                }
2948              }              }
2949            }            }
2950    
2951          /* The check that the two values are in the correct order happens in          /* Check that the two values are in the correct order. Optimize
2952          the pre-pass. Optimize one-character ranges */          one-character ranges */
2953    
2954            if (d < c)
2955              {
2956              *errorcodeptr = ERR8;
2957              goto FAILED;
2958              }
2959    
2960          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2961    
# Line 2022  for (;; ptr++) Line 2976  for (;; ptr++)
2976  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2977            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
2978              {              {
2979              int occ, ocd;              unsigned int occ, ocd;
2980              int cc = c;              unsigned int cc = c;
2981              int origd = d;              unsigned int origd = d;
2982              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
2983                {                {
2984                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= (unsigned int)c &&
2985                      ocd <= (unsigned int)d)
2986                    continue;                          /* Skip embedded ranges */
2987    
2988                if (occ < c  && ocd >= c - 1)        /* Extend the basic range */                if (occ < (unsigned int)c  &&
2989                      ocd >= (unsigned int)c - 1)      /* Extend the basic range */
2990                  {                                  /* if there is overlap,   */                  {                                  /* if there is overlap,   */
2991                  c = occ;                           /* noting that if occ < c */                  c = occ;                           /* noting that if occ < c */
2992                  continue;                          /* we can't have ocd > d  */                  continue;                          /* we can't have ocd > d  */
2993                  }                                  /* because a subrange is  */                  }                                  /* because a subrange is  */
2994                if (ocd > d && occ <= d + 1)         /* always shorter than    */                if (ocd > (unsigned int)d &&
2995                      occ <= (unsigned int)d + 1)      /* always shorter than    */
2996                  {                                  /* the basic range.       */                  {                                  /* the basic range.       */
2997                  d = ocd;                  d = ocd;
2998                  continue;                  continue;
# Line 2082  for (;; ptr++) Line 3040  for (;; ptr++)
3040          ranges that lie entirely within 0-127 when there is UCP support; else          ranges that lie entirely within 0-127 when there is UCP support; else
3041          for partial ranges without UCP support. */          for partial ranges without UCP support. */
3042    
3043          for (; c <= d; c++)          class_charcount += d - c + 1;
3044            class_lastchar = d;
3045    
3046            /* We can save a bit of time by skipping this in the pre-compile. */
3047    
3048            if (lengthptr == NULL) for (; c <= d; c++)
3049            {            {
3050            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
3051            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
# Line 2090  for (;; ptr++) Line 3053  for (;; ptr++)
3053              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
3054              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
3055              }              }
           class_charcount++;                /* in case a one-char range */  
           class_lastchar = c;  
3056            }            }
3057    
3058          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
# Line 2115  for (;; ptr++) Line 3076  for (;; ptr++)
3076  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3077          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
3078            {            {
3079            int chartype;            unsigned int othercase;
3080            int othercase;            if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
           if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 &&  
                othercase > 0)  
3081              {              {
3082              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
3083              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 2143  for (;; ptr++) Line 3102  for (;; ptr++)
3102          }          }
3103        }        }
3104    
3105      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached. This "while" is the end of the "do" above. */
     loop. This "while" is the end of the "do" above. */  
3106    
3107      while ((c = *(++ptr)) != ']' || inescq);      while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3108    
3109        if (c == 0)                          /* Missing terminating ']' */
3110          {
3111          *errorcodeptr = ERR6;
3112          goto FAILED;
3113          }
3114    
3115      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
3116      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
# Line 2210  for (;; ptr++) Line 3174  for (;; ptr++)
3174    
3175      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
3176      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode. If there are no characters < 256,
3177      we can omit the bitmap. */      we can omit the bitmap in the actual compiled code. */
3178    
3179  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3180      if (class_utf8)      if (class_utf8)
# Line 2220  for (;; ptr++) Line 3184  for (;; ptr++)
3184        code += LINK_SIZE;        code += LINK_SIZE;
3185        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT : 0;
3186    
3187        /* If the map is required, install it, and move on to the end of        /* If the map is required, move up the extra data to make room for it;
3188        the extra data */        otherwise just move the code pointer to the end of the extra data. */
3189    
3190        if (class_charcount > 0)        if (class_charcount > 0)
3191          {          {
3192          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
3193            memmove(code + 32, code, class_utf8data - code);
3194          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
3195          code = class_utf8data;          code = class_utf8data + 32;
         }  
   
       /* If the map is not required, slide down the extra data. */  
   
       else  
         {  
         int len = class_utf8data - (code + 33);  
         memmove(code + 1, code + 33, len);  
         code += len + 1;  
3196          }          }
3197          else code = class_utf8data;
3198    
3199        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
3200    
# Line 2254  for (;; ptr++) Line 3211  for (;; ptr++)
3211      if (negate_class)      if (negate_class)
3212        {        {
3213        *code++ = OP_NCLASS;        *code++ = OP_NCLASS;
3214        for (c = 0; c < 32; c++) code[c] = ~classbits[c];        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3215            for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3216        }        }
3217      else      else
3218        {        {
# Line 2264  for (;; ptr++) Line 3222  for (;; ptr++)
3222      code += 32;      code += 32;
3223      break;      break;
3224    
3225    
3226        /* ===================================================================*/
3227      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3228      has been tested above. */      has been tested above. */
3229    
# Line 2331  for (;; ptr++) Line 3291  for (;; ptr++)
3291        }        }
3292      else repeat_type = greedy_default;      else repeat_type = greedy_default;
3293    
     /* If previous was a recursion, we need to wrap it inside brackets so that  
     it can be replicated if necessary. */  
   
     if (*previous == OP_RECURSE)  
       {  
       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);  
       code += 1 + LINK_SIZE;  
       *previous = OP_BRA;  
       PUT(previous, 1, code - previous);  
       *code = OP_KET;  
       PUT(code, 1, code - previous);  
       code += 1 + LINK_SIZE;  
       }  
   
3294      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
3295      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
3296      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
# Line 2378  for (;; ptr++) Line 3324  for (;; ptr++)
3324          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3325          }          }
3326    
3327          /* If the repetition is unlimited, it pays to see if the next thing on
3328          the line is something that cannot possibly match this character. If so,
3329          automatically possessifying this item gains some performance in the case
3330          where the match fails. */
3331    
3332          if (!possessive_quantifier &&
3333              repeat_max < 0 &&
3334              check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3335                options, cd))
3336            {
3337            repeat_type = 0;    /* Force greedy */
3338            possessive_quantifier = TRUE;
3339            }
3340    
3341        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3342        }        }
3343    
3344      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
3345      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
3346      character repeats by setting opt_type to add a suitable offset into      character repeats by setting opt_type to add a suitable offset into
3347      repeat_type. OP_NOT is currently used only for single-byte chars. */      repeat_type. We can also test for auto-possessification. OP_NOT is
3348        currently used only for single-byte chars. */
3349    
3350      else if (*previous == OP_NOT)      else if (*previous == OP_NOT)
3351        {        {
3352        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3353        c = previous[1];        c = previous[1];
3354          if (!possessive_quantifier &&
3355              repeat_max < 0 &&
3356              check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3357            {
3358            repeat_type = 0;    /* Force greedy */
3359            possessive_quantifier = TRUE;
3360            }
3361        goto OUTPUT_SINGLE_REPEAT;        goto OUTPUT_SINGLE_REPEAT;
3362        }        }
3363    
# Line 2403  for (;; ptr++) Line 3371  for (;; ptr++)
3371      else if (*previous < OP_EODN)      else if (*previous < OP_EODN)
3372        {        {
3373        uschar *oldcode;        uschar *oldcode;
3374        int prop_type;        int prop_type, prop_value;
3375        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3376        c = *previous;        c = *previous;
3377    
3378          if (!possessive_quantifier &&
3379              repeat_max < 0 &&
3380              check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3381            {
3382            repeat_type = 0;    /* Force greedy */
3383            possessive_quantifier = TRUE;
3384            }
3385    
3386        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
3387        prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?        if (*previous == OP_PROP || *previous == OP_NOTPROP)
3388          previous[1] : -1;          {
3389            prop_type = previous[1];
3390            prop_value = previous[2];
3391            }
3392          else prop_type = prop_value = -1;
3393    
3394        oldcode = code;        oldcode = code;
3395        code = previous;                  /* Usually overwrite previous item */        code = previous;                  /* Usually overwrite previous item */
# Line 2443  for (;; ptr++) Line 3423  for (;; ptr++)
3423          }          }
3424    
3425        /* A repeat minimum of 1 is optimized into some special cases. If the        /* A repeat minimum of 1 is optimized into some special cases. If the
3426        maximum is unlimited, we use OP_PLUS. Otherwise, the original item it        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3427        left in place and, if the maximum is greater than 1, we use OP_UPTO with        left in place and, if the maximum is greater than 1, we use OP_UPTO with
3428        one less than the maximum. */        one less than the maximum. */
3429    
# Line 2470  for (;; ptr++) Line 3450  for (;; ptr++)
3450    
3451          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3452          we have to insert the character for the previous code. For a repeated          we have to insert the character for the previous code. For a repeated
3453          Unicode property match, there is an extra byte that defines the          Unicode property match, there are two extra bytes that define the
3454          required property. In UTF-8 mode, long characters have their length in          required property. In UTF-8 mode, long characters have their length in
3455          c, with the 0x80 bit as a flag. */          c, with the 0x80 bit as a flag. */
3456    
# Line 2486  for (;; ptr++) Line 3466  for (;; ptr++)
3466  #endif  #endif
3467              {              {
3468              *code++ = c;              *code++ = c;
3469              if (prop_type >= 0) *code++ = prop_type;              if (prop_type >= 0)
3470                  {
3471                  *code++ = prop_type;
3472                  *code++ = prop_value;
3473                  }
3474              }              }
3475            *code++ = OP_STAR + repeat_type;            *code++ = OP_STAR + repeat_type;
3476            }            }
3477    
3478          /* Else insert an UPTO if the max is greater than the min, again          /* Else insert an UPTO if the max is greater than the min, again
3479          preceded by the character, for the previously inserted code. */          preceded by the character, for the previously inserted code. If the
3480            UPTO is just for 1 instance, we can use QUERY instead. */
3481    
3482          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
3483            {            {
# Line 2505  for (;; ptr++) Line 3490  for (;; ptr++)
3490            else            else
3491  #endif  #endif
3492            *code++ = c;            *code++ = c;
3493            if (prop_type >= 0) *code++ = prop_type;            if (prop_type >= 0)
3494                {
3495                *code++ = prop_type;
3496                *code++ = prop_value;
3497                }
3498            repeat_max -= repeat_min;            repeat_max -= repeat_min;
3499            *code++ = OP_UPTO + repeat_type;  
3500            PUT2INC(code, 0, repeat_max);            if (repeat_max == 1)
3501                {
3502                *code++ = OP_QUERY + repeat_type;
3503                }
3504              else
3505                {
3506                *code++ = OP_UPTO + repeat_type;
3507                PUT2INC(code, 0, repeat_max);
3508                }
3509            }            }
3510          }          }
3511    
# Line 2524  for (;; ptr++) Line 3521  for (;; ptr++)
3521  #endif  #endif
3522        *code++ = c;        *code++ = c;
3523    
3524        /* For a repeated Unicode property match, there is an extra byte that        /* For a repeated Unicode property match, there are two extra bytes that
3525        defines the required property. */        define the required property. */
3526    
3527  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3528        if (prop_type >= 0) *code++ = prop_type;        if (prop_type >= 0)
3529            {
3530            *code++ = prop_type;
3531            *code++ = prop_value;
3532            }
3533  #endif  #endif
3534        }        }
3535    
# Line 2571  for (;; ptr++) Line 3572  for (;; ptr++)
3572      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
3573      cases. */      cases. */
3574    
3575      else if (*previous >= OP_BRA || *previous == OP_ONCE ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3576               *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
3577        {        {
3578        register int i;        register int i;
3579        int ketoffset = 0;        int ketoffset = 0;
3580        int len = code - previous;        int len = code - previous;
3581        uschar *bralink = NULL;        uschar *bralink = NULL;
3582    
3583          /* Repeating a DEFINE group is pointless */
3584    
3585          if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3586            {
3587            *errorcodeptr = ERR55;
3588            goto FAILED;
3589            }
3590    
3591        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
3592        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
3593        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 2613  for (;; ptr++) Line 3622  for (;; ptr++)
3622          /* If the maximum is 1 or unlimited, we just have to stick in the          /* If the maximum is 1 or unlimited, we just have to stick in the
3623          BRAZERO and do no more at this point. However, we do need to adjust          BRAZERO and do no more at this point. However, we do need to adjust
3624          any OP_RECURSE calls inside the group that refer to the group itself or          any OP_RECURSE calls inside the group that refer to the group itself or
3625          any internal group, because the offset is from the start of the whole          any internal or forward referenced group, because the offset is from
3626          regex. Temporarily terminate the pattern while doing this. */          the start of the whole regex. Temporarily terminate the pattern while
3627            doing this. */
3628    
3629          if (repeat_max <= 1)          if (repeat_max <= 1)
3630            {            {
3631            *code = OP_END;            *code = OP_END;
3632            adjust_recurse(previous, 1, utf8, cd);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3633            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3634            code++;            code++;
3635            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2637  for (;; ptr++) Line 3647  for (;; ptr++)
3647            {            {
3648            int offset;            int offset;
3649            *code = OP_END;            *code = OP_END;
3650            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3651            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
3652            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
3653            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2657  for (;; ptr++) Line 3667  for (;; ptr++)
3667        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
3668        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
3669        copies that we need. If we set a first char from the group, and didn't        copies that we need. If we set a first char from the group, and didn't
3670        set a required char, copy the latter from the former. */        set a required char, copy the latter from the former. If there are any
3671          forward reference subroutine calls in the group, there will be entries on
3672          the workspace list; replicate these with an appropriate increment. */
3673    
3674        else        else
3675          {          {
3676          if (repeat_min > 1)          if (repeat_min > 1)
3677            {            {
3678            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;            /* In the pre-compile phase, we don't actually do the replication. We
3679            for (i = 1; i < repeat_min; i++)            just adjust the length as if we had. Do some paranoid checks for
3680              potential integer overflow. */
3681    
3682              if (lengthptr != NULL)
3683                {
3684                int delta = (repeat_min - 1)*length_prevgroup;
3685                if ((double)(repeat_min - 1)*(double)length_prevgroup >
3686                                                                (double)INT_MAX ||
3687                    OFLOW_MAX - *lengthptr < delta)
3688                  {
3689                  *errorcodeptr = ERR20;
3690                  goto FAILED;
3691                  }
3692                *lengthptr += delta;
3693                }
3694    
3695              /* This is compiling for real */
3696    
3697              else
3698              {              {
3699              memcpy(code, previous, len);              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3700              code += len;              for (i = 1; i < repeat_min; i++)
3701                  {
3702                  uschar *hc;
3703                  uschar *this_hwm = cd->hwm;
3704                  memcpy(code, previous, len);
3705                  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3706                    {
3707                    PUT(cd->hwm, 0, GET(hc, 0) + len);
3708                    cd->hwm += LINK_SIZE;
3709                    }
3710                  save_hwm = this_hwm;
3711                  code += len;
3712                  }
3713              }              }
3714            }            }
3715    
3716          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
3717          }          }
3718    
# Line 2677  for (;; ptr++) Line 3720  for (;; ptr++)
3720        the maximum is limited, it replicates the group in a nested fashion,        the maximum is limited, it replicates the group in a nested fashion,
3721        remembering the bracket starts on a stack. In the case of a zero minimum,        remembering the bracket starts on a stack. In the case of a zero minimum,
3722        the first one was set up above. In all cases the repeat_max now specifies        the first one was set up above. In all cases the repeat_max now specifies
3723        the number of additional copies needed. */        the number of additional copies needed. Again, we must remember to
3724          replicate entries on the forward reference list. */
3725    
3726        if (repeat_max >= 0)        if (repeat_max >= 0)
3727          {          {
3728          for (i = repeat_max - 1; i >= 0; i--)          /* In the pre-compile phase, we don't actually do the replication. We
3729            just adjust the length as if we had. For each repetition we must add 1
3730            to the length for BRAZERO and for all but the last repetition we must
3731            add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3732            paranoid checks to avoid integer overflow. */
3733    
3734            if (lengthptr != NULL && repeat_max > 0)
3735              {
3736              int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3737                          2 - 2*LINK_SIZE;   /* Last one doesn't nest */
3738              if ((double)repeat_max *
3739                    (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3740                      > (double)INT_MAX ||
3741                  OFLOW_MAX - *lengthptr < delta)
3742                {
3743                *errorcodeptr = ERR20;
3744                goto FAILED;
3745                }
3746              *lengthptr += delta;
3747              }
3748    
3749            /* This is compiling for real */
3750    
3751            else for (i = repeat_max - 1; i >= 0; i--)
3752            {            {
3753              uschar *hc;
3754              uschar *this_hwm = cd->hwm;
3755    
3756            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
3757    
3758            /* All but the final copy start a new nesting, maintaining the            /* All but the final copy start a new nesting, maintaining the
# Line 2698  for (;; ptr++) Line 3768  for (;; ptr++)
3768              }              }
3769    
3770            memcpy(code, previous, len);            memcpy(code, previous, len);
3771              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3772                {
3773                PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3774                cd->hwm += LINK_SIZE;
3775                }
3776              save_hwm = this_hwm;
3777            code += len;            code += len;
3778            }            }
3779    
# Line 2720  for (;; ptr++) Line 3796  for (;; ptr++)
3796        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
3797        can't just offset backwards from the current code point, because we        can't just offset backwards from the current code point, because we
3798        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
3799        correct offset was computed above. */        correct offset was computed above.
3800    
3801          Then, when we are doing the actual compile phase, check to see whether
3802          this group is a non-atomic one that could match an empty string. If so,
3803          convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3804          that runtime checking can be done. [This check is also applied to
3805          atomic groups at runtime, but in a different way.] */
3806    
3807        else code[-ketoffset] = OP_KETRMAX + repeat_type;        else
3808            {
3809            uschar *ketcode = code - ketoffset;
3810            uschar *bracode = ketcode - GET(ketcode, 1);
3811            *ketcode = OP_KETRMAX + repeat_type;
3812            if (lengthptr == NULL && *bracode != OP_ONCE)
3813              {
3814              uschar *scode = bracode;
3815              do
3816                {
3817                if (could_be_empty_branch(scode, ketcode, utf8))
3818                  {
3819                  *bracode += OP_SBRA - OP_BRA;
3820                  break;
3821                  }
3822                scode += GET(scode, 1);
3823                }
3824              while (*scode == OP_ALT);
3825              }
3826            }
3827        }        }
3828    
3829      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 2733  for (;; ptr++) Line 3834  for (;; ptr++)
3834        goto FAILED;        goto FAILED;
3835        }        }
3836    
3837      /* If the character following a repeat is '+', we wrap the entire repeated      /* If the character following a repeat is '+', or if certain optimization
3838      item inside OP_ONCE brackets. This is just syntactic sugar, taken from      tests above succeeded, possessive_quantifier is TRUE. For some of the
3839      Sun's Java package. The repeated item starts at tempcode, not at previous,      simpler opcodes, there is a special alternative opcode for this. For
3840      which might be the first part of a string whose (former) last char we      anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3841      repeated. However, we don't support '+' after a greediness '?'. */      The '+' notation is just syntactic sugar, taken from Sun's Java package,
3842        but the special opcodes can optimize it a bit. The repeated item starts at
3843        tempcode, not at previous, which might be the first part of a string whose
3844        (former) last char we repeated.
3845    
3846        Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3847        an 'upto' may follow. We skip over an 'exact' item, and then test the
3848        length of what remains before proceeding. */
3849    
3850      if (possessive_quantifier)      if (possessive_quantifier)
3851        {        {
3852        int len = code - tempcode;        int len;
3853        memmove(tempcode + 1+LINK_SIZE, tempcode, len);        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3854        code += 1 + LINK_SIZE;            *tempcode == OP_NOTEXACT)
3855        len += 1 + LINK_SIZE;          tempcode += _pcre_OP_lengths[*tempcode];
3856        tempcode[0] = OP_ONCE;        len = code - tempcode;
3857        *code++ = OP_KET;        if (len > 0) switch (*tempcode)
3858        PUTINC(code, 0, len);          {
3859        PUT(tempcode, 1, len);          case OP_STAR:  *tempcode = OP_POSSTAR; break;
3860            case OP_PLUS:  *tempcode = OP_POSPLUS; break;
3861            case OP_QUERY: *tempcode = OP_POSQUERY; break;
3862            case OP_UPTO:  *tempcode = OP_POSUPTO; break;
3863    
3864            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
3865            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
3866            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3867            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
3868    
3869            case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
3870            case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
3871            case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3872            case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
3873    
3874            default:
3875            memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3876            code += 1 + LINK_SIZE;
3877            len += 1 + LINK_SIZE;
3878            tempcode[0] = OP_ONCE;
3879            *code++ = OP_KET;
3880            PUTINC(code, 0, len);
3881            PUT(tempcode, 1, len);
3882            break;
3883            }
3884        }        }
3885    
3886      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 2761  for (;; ptr++) Line 3893  for (;; ptr++)
3893      break;      break;
3894    
3895    
3896      /* Start of nested bracket sub-expression, or comment or lookahead or      /* ===================================================================*/
3897      lookbehind or option setting or condition. First deal with special things      /* Start of nested parenthesized sub-expression, or comment or lookahead or
3898      that can come after a bracket; all are introduced by ?, and the appearance      lookbehind or option setting or condition or all the other extended
3899      of any of them means that this is not a referencing group. They were      parenthesis forms.  */
     checked for validity in the first pass over the string, so we don't have to  
     check for syntax errors here.  */  
3900    
3901      case '(':      case '(':
3902      newoptions = options;      newoptions = options;
3903      skipbytes = 0;      skipbytes = 0;
3904        bravalue = OP_CBRA;
3905        save_hwm = cd->hwm;
3906        reset_bracount = FALSE;
3907    
3908        /* First deal with various "verbs" that can be introduced by '*'. */
3909    
3910        if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
3911          {
3912          int i, namelen;
3913          const uschar *name = ++ptr;
3914          previous = NULL;
3915          while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
3916          if (*ptr == ':')
3917            {
3918            *errorcodeptr = ERR59;   /* Not supported */
3919            goto FAILED;
3920            }
3921          if (*ptr != ')')
3922            {
3923            *errorcodeptr = ERR60;
3924            goto FAILED;
3925            }
3926          namelen = ptr - name;
3927          for (i = 0; i < verbcount; i++)
3928            {
3929            if (namelen == verbs[i].len &&
3930                strncmp((char *)name, verbs[i].name, namelen) == 0)
3931              {
3932              *code = verbs[i].op;
3933              if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
3934              break;
3935              }
3936            }
3937          if (i < verbcount) continue;
3938          *errorcodeptr = ERR60;
3939          goto FAILED;
3940          }
3941    
3942        /* Deal with the extended parentheses; all are introduced by '?', and the
3943        appearance of any of them means that this is not a capturing group. */
3944    
3945      if (*(++ptr) == '?')      else if (*ptr == '?')
3946        {        {
3947        int set, unset;        int i, set, unset, namelen;
3948        int *optset;        int *optset;
3949          const uschar *name;
3950          uschar *slot;
3951    
3952        switch (*(++ptr))        switch (*(++ptr))
3953          {          {
3954          case '#':                 /* Comment; skip to ket */          case '#':                 /* Comment; skip to ket */
3955          ptr++;          ptr++;
3956          while (*ptr != ')') ptr++;          while (*ptr != 0 && *ptr != ')') ptr++;
3957            if (*ptr == 0)
3958              {
3959              *errorcodeptr = ERR18;
3960              goto FAILED;
3961              }
3962          continue;          continue;
3963    
3964          case ':':                 /* Non-extracting bracket */  
3965            /* ------------------------------------------------------------ */
3966            case '|':                 /* Reset capture count for each branch */
3967            reset_bracount = TRUE;
3968            /* Fall through */
3969    
3970            /* ------------------------------------------------------------ */
3971            case ':':                 /* Non-capturing bracket */
3972          bravalue = OP_BRA;          bravalue = OP_BRA;
3973          ptr++;          ptr++;
3974          break;          break;
3975    
3976    
3977            /* ------------------------------------------------------------ */
3978          case '(':          case '(':
3979          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
3980    
3981          /* Condition to test for recursion */          /* A condition can be an assertion, a number (referring to a numbered
3982            group), a name (referring to a named group), or 'R', referring to
3983            recursion. R<digits> and R&name are also permitted for recursion tests.
3984    
3985            There are several syntaxes for testing a named group: (?(name)) is used
3986            by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3987    
3988            There are two unfortunate ambiguities, caused by history. (a) 'R' can
3989            be the recursive thing or the name 'R' (and similarly for 'R' followed
3990            by digits), and (b) a number could be a name that consists of digits.
3991            In both cases, we look for a name first; if not found, we try the other
3992            cases. */
3993    
3994            /* For conditions that are assertions, check the syntax, and then exit
3995            the switch. This will take control down to where bracketed groups,
3996            including assertions, are processed. */
3997    
3998            if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3999              break;
4000    
4001            /* Most other conditions use OP_CREF (a couple change to OP_RREF
4002            below), and all need to skip 3 bytes at the start of the group. */
4003    
4004          if (ptr[1] == 'R')          code[1+LINK_SIZE] = OP_CREF;
4005            skipbytes = 3;
4006            refsign = -1;
4007    
4008            /* Check for a test for recursion in a named group. */
4009    
4010            if (ptr[1] == 'R' && ptr[2] == '&')
4011            {            {
4012            code[1+LINK_SIZE] = OP_CREF;            terminator = -1;
4013            PUT2(code, 2+LINK_SIZE, CREF_RECURSE);            ptr += 2;
4014            skipbytes = 3;            code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
           ptr += 3;  
4015            }            }
4016    
4017          /* Condition to test for a numbered subpattern match. We know that          /* Check for a test for a named group's having been set, using the Perl
4018          if a digit follows ( then there will just be digits until ) because          syntax (?(<name>) or (?('name') */
         the syntax was checked in the first pass. */  
4019    
4020          else if ((digitab[ptr[1]] && ctype_digit) != 0)          else if (ptr[1] == '<')
4021            {            {
4022            int condref;                 /* Don't amalgamate; some compilers */            terminator = '>';
           condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */  
           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';  
           if (condref == 0)  
             {  
             *errorcodeptr = ERR35;  
             goto FAILED;  
             }  
4023            ptr++;            ptr++;
           code[1+LINK_SIZE] = OP_CREF;  
           PUT2(code, 2+LINK_SIZE, condref);  
           skipbytes = 3;  
4024            }            }
4025          /* For conditions that are assertions, we just fall through, having          else if (ptr[1] == '\'')
4026          set bravalue above. */            {
4027          break;            terminator = '\'';
4028              ptr++;
4029          case '=':                 /* Positive lookahead */            }
4030          bravalue = OP_ASSERT;          else
4031          ptr++;            {
4032          break;            terminator = 0;
4033              if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4034              }
4035    
4036          case '!':                 /* Negative lookahead */          /* We now expect to read a name; anything else is an error */
         bravalue = OP_ASSERT_NOT;  
         ptr++;  
         break;  
4037    
4038          case '<':                 /* Lookbehinds */          if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
         switch (*(++ptr))  
4039            {            {
4040            case '=':               /* Positive lookbehind */            ptr += 1;  /* To get the right offset */
4041            bravalue = OP_ASSERTBACK;            *errorcodeptr = ERR28;
4042            ptr++;            goto FAILED;
4043            break;            }
4044    
4045            case '!':               /* Negative lookbehind */          /* Read the name, but also get it as a number if it's all digits */
4046            bravalue = OP_ASSERTBACK_NOT;  
4047            recno = 0;
4048            name = ++ptr;
4049            while ((cd->ctypes[*ptr] & ctype_word) != 0)
4050              {
4051              if (recno >= 0)
4052                recno = ((digitab[*ptr] & ctype_digit) != 0)?
4053                  recno * 10 + *ptr - '0' : -1;
4054            ptr++;            ptr++;
           break;  
4055            }            }
4056          break;          namelen = ptr - name;
4057    
4058          case '>':                 /* One-time brackets */          if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4059          bravalue = OP_ONCE;            {
4060          ptr++;            ptr--;      /* Error offset */
4061          break;            *errorcodeptr = ERR26;
4062              goto FAILED;
4063              }
4064    
4065          case 'C':                 /* Callout - may be followed by digits; */          /* Do no further checking in the pre-compile phase. */
4066          previous_callout = code;  /* Save for later completion */  
4067          after_manual_callout = 1; /* Skip one item before completing */          if (lengthptr != NULL) break;
4068          *code++ = OP_CALLOUT;     /* Already checked that the terminating */  
4069            {                       /* closing parenthesis is present. */          /* In the real compile we do the work of looking for the actual
4070            int n = 0;          reference. If the string started with "+" or "-" we require the rest to
4071            while ((digitab[*(++ptr)] & ctype_digit) != 0)          be digits, in which case recno will be set. */
4072              n = n * 10 + *ptr - '0';  
4073            if (n > 255)          if (refsign > 0)
4074              {            {
4075              if (recno <= 0)
4076                {
4077                *errorcodeptr = ERR58;
4078                goto FAILED;
4079                }
4080              if (refsign == '-')
4081                {
4082                recno = cd->bracount - recno + 1;
4083                if (recno <= 0)
4084                  {
4085                  *errorcodeptr = ERR15;
4086                  goto FAILED;
4087                  }
4088                }
4089              else recno += cd->bracount;
4090              PUT2(code, 2+LINK_SIZE, recno);
4091              break;
4092              }
4093    
4094            /* Otherwise (did not start with "+" or "-"), start by looking for the
4095            name. */
4096    
4097            slot = cd->name_table;
4098            for (i = 0; i < cd->names_found; i++)
4099              {
4100              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4101              slot += cd->name_entry_size;
4102              }
4103    
4104            /* Found a previous named subpattern */
4105    
4106            if (i < cd->names_found)
4107              {
4108              recno = GET2(slot, 0);
4109              PUT2(code, 2+LINK_SIZE, recno);
4110              }
4111    
4112            /* Search the pattern for a forward reference */
4113    
4114            else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4115                            (options & PCRE_EXTENDED) != 0)) > 0)
4116              {
4117              PUT2(code, 2+LINK_SIZE, i);
4118              }
4119    
4120            /* If terminator == 0 it means that the name followed directly after
4121            the opening parenthesis [e.g. (?(abc)...] and in this case there are
4122            some further alternatives to try. For the cases where terminator != 0
4123            [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4124            now checked all the possibilities, so give an error. */
4125    
4126            else if (terminator != 0)
4127              {
4128              *errorcodeptr = ERR15;
4129              goto FAILED;
4130              }
4131    
4132            /* Check for (?(R) for recursion. Allow digits after R to specify a
4133            specific group number. */
4134    
4135            else if (*name == 'R')
4136              {
4137              recno = 0;
4138              for (i = 1; i < namelen; i++)
4139                {
4140                if ((digitab[name[i]] & ctype_digit) == 0)
4141                  {
4142                  *errorcodeptr = ERR15;
4143                  goto FAILED;
4144                  }
4145                recno = recno * 10 + name[i] - '0';
4146                }
4147              if (recno == 0) recno = RREF_ANY;
4148              code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
4149              PUT2(code, 2+LINK_SIZE, recno);
4150              }
4151    
4152            /* Similarly, check for the (?(DEFINE) "condition", which is always
4153            false. */
4154    
4155            else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4156              {
4157              code[1+LINK_SIZE] = OP_DEF;
4158              skipbytes = 1;
4159              }
4160    
4161            /* Check for the "name" actually being a subpattern number. */
4162    
4163            else if (recno > 0)
4164              {
4165              PUT2(code, 2+LINK_SIZE, recno);
4166              }
4167    
4168            /* Either an unidentified subpattern, or a reference to (?(0) */
4169    
4170            else
4171              {
4172              *errorcodeptr = (recno == 0)? ERR35: ERR15;
4173              goto FAILED;
4174              }
4175            break;
4176    
4177    
4178            /* ------------------------------------------------------------ */
4179            case '=':                 /* Positive lookahead */
4180            bravalue = OP_ASSERT;
4181            ptr++;
4182            break;
4183    
4184    
4185            /* ------------------------------------------------------------ */
4186            case '!':                 /* Negative lookahead */
4187            ptr++;
4188            if (*ptr == ')')          /* Optimize (?!) */
4189              {
4190              *code++ = OP_FAIL;
4191              previous = NULL;
4192              continue;
4193              }
4194            bravalue = OP_ASSERT_NOT;
4195            break;
4196    
4197    
4198            /* ------------------------------------------------------------ */
4199            case '<':                 /* Lookbehind or named define */
4200            switch (ptr[1])
4201              {
4202              case '=':               /* Positive lookbehind */
4203              bravalue = OP_ASSERTBACK;
4204              ptr += 2;
4205              break;
4206    
4207              case '!':               /* Negative lookbehind */
4208              bravalue = OP_ASSERTBACK_NOT;
4209              ptr += 2;
4210              break;
4211    
4212              default:                /* Could be name define, else bad */
4213              if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4214              ptr++;                  /* Correct offset for error */
4215              *errorcodeptr = ERR24;
4216              goto FAILED;
4217              }
4218            break;
4219    
4220    
4221            /* ------------------------------------------------------------ */
4222            case '>':                 /* One-time brackets */
4223            bravalue = OP_ONCE;
4224            ptr++;
4225            break;
4226    
4227    
4228            /* ------------------------------------------------------------ */
4229            case 'C':                 /* Callout - may be followed by digits; */
4230            previous_callout = code;  /* Save for later completion */
4231            after_manual_callout = 1; /* Skip one item before completing */
4232            *code++ = OP_CALLOUT;
4233              {
4234              int n = 0;
4235              while ((digitab[*(++ptr)] & ctype_digit) != 0)
4236                n = n * 10 + *ptr - '0';
4237              if (*ptr != ')')
4238                {
4239                *errorcodeptr = ERR39;
4240                goto FAILED;
4241                }
4242              if (n > 255)
4243                {
4244              *errorcodeptr = ERR38;              *errorcodeptr = ERR38;
4245              goto FAILED;              goto FAILED;
4246              }              }
# Line 2876  for (;; ptr++) Line 4252  for (;; ptr++)
4252          previous = NULL;          previous = NULL;
4253          continue;          continue;
4254    
4255          case 'P':                 /* Named subpattern handling */  
4256          if (*(++ptr) == '<')      /* Definition */          /* ------------------------------------------------------------ */
4257            case 'P':                 /* Python-style named subpattern handling */
4258            if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
4259              {
4260              is_recurse = *ptr == '>';
4261              terminator = ')';
4262              goto NAMED_REF_OR_RECURSE;
4263              }
4264            else if (*ptr != '<')    /* Test for Python-style definition */
4265              {
4266              *errorcodeptr = ERR41;
4267              goto FAILED;
4268              }
4269            /* Fall through to handle (?P< as (?< is handled */
4270    
4271    
4272            /* ------------------------------------------------------------ */
4273            DEFINE_NAME:    /* Come here from (?< handling */
4274            case '\'':
4275            {            {
4276            int i, namelen;            terminator = (*ptr == '<')? '>' : '\'';
4277            uschar *slot = cd->name_table;            name = ++ptr;
4278            const uschar *name;     /* Don't amalgamate; some compilers */  
4279            name = ++ptr;           /* grumble at autoincrement in declaration */            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4280              namelen = ptr - name;
4281    
4282            while (*ptr++ != '>');            /* In the pre-compile phase, just do a syntax check. */
           namelen = ptr - name - 1;  
4283    
4284            for (i = 0; i < cd->names_found; i++)            if (lengthptr != NULL)
4285              {              {
4286              int crc = memcmp(name, slot+2, namelen);              if (*ptr != terminator)
4287              if (crc == 0)                {
4288                  *errorcodeptr = ERR42;
4289                  goto FAILED;
4290                  }
4291                if (cd->names_found >= MAX_NAME_COUNT)
4292                  {
4293                  *errorcodeptr = ERR49;
4294                  goto FAILED;
4295                  }
4296                if (namelen + 3 > cd->name_entry_size)
4297                {                {
4298                if (slot[2+namelen] == 0)                cd->name_entry_size = namelen + 3;
4299                  if (namelen > MAX_NAME_SIZE)
4300                  {                  {
4301                  *errorcodeptr = ERR43;                  *errorcodeptr = ERR48;
4302                  goto FAILED;                  goto FAILED;
4303                  }                  }
               crc = -1;             /* Current name is substring */  
4304                }                }
4305              if (crc < 0)              }
4306    
4307              /* In the real compile, create the entry in the table */
4308    
4309              else
4310                {
4311                slot = cd->name_table;
4312                for (i = 0; i < cd->names_found; i++)
4313                {                {
4314                memmove(slot + cd->name_entry_size, slot,                int crc = memcmp(name, slot+2, namelen);
4315                  (cd->names_found - i) * cd->name_entry_size);                if (crc == 0)
4316                break;                  {
4317                    if (slot[2+namelen] == 0)
4318                      {
4319                      if ((options & PCRE_DUPNAMES) == 0)
4320                        {
4321                        *errorcodeptr = ERR43;
4322                        goto FAILED;
4323                        }
4324                      }
4325                    else crc = -1;      /* Current name is substring */
4326                    }
4327                  if (crc < 0)
4328                    {
4329                    memmove(slot + cd->name_entry_size, slot,
4330                      (cd->names_found - i) * cd->name_entry_size);
4331                    break;
4332                    }
4333                  slot += cd->name_entry_size;
4334                }                }
             slot += cd->name_entry_size;  
             }  
4335    
4336            PUT2(slot, 0, *brackets + 1);              PUT2(slot, 0, cd->bracount + 1);
4337            memcpy(slot + 2, name, namelen);              memcpy(slot + 2, name, namelen);
4338            slot[2+namelen] = 0;              slot[2+namelen] = 0;
4339            cd->names_found++;              }
           goto NUMBERED_GROUP;  
4340            }            }
4341    
4342          if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */          /* In both cases, count the number of names we've encountered. */
4343    
4344            ptr++;                    /* Move past > or ' */
4345            cd->names_found++;
4346            goto NUMBERED_GROUP;
4347    
4348    
4349            /* ------------------------------------------------------------ */
4350            case '&':                 /* Perl recursion/subroutine syntax */
4351            terminator = ')';
4352            is_recurse = TRUE;
4353            /* Fall through */
4354    
4355            /* We come here from the Python syntax above that handles both
4356            references (?P=name) and recursion (?P>name), as well as falling
4357            through from the Perl recursion syntax (?&name). */
4358    
4359            NAMED_REF_OR_RECURSE:
4360            name = ++ptr;
4361            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4362            namelen = ptr - name;
4363    
4364            /* In the pre-compile phase, do a syntax check and set a dummy
4365            reference number. */
4366    
4367            if (lengthptr != NULL)
4368            {            {
4369            int i, namelen;            if (*ptr != terminator)
4370            int type = *ptr++;              {
4371            const uschar *name = ptr;              *errorcodeptr = ERR42;
4372            uschar *slot = cd->name_table;              goto FAILED;
4373                }
4374              if (namelen > MAX_NAME_SIZE)
4375                {
4376                *errorcodeptr = ERR48;
4377                goto FAILED;
4378                }
4379              recno = 0;
4380              }
4381    
4382            while (*ptr != ')') ptr++;          /* In the real compile, seek the name in the table */
           namelen = ptr - name;  
4383    
4384            else
4385              {
4386              slot = cd->name_table;
4387            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
4388              {              {
4389              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4390              slot += cd->name_entry_size;              slot += cd->name_entry_size;
4391              }              }
4392            if (i >= cd->names_found)  
4393              if (i < cd->names_found)         /* Back reference */
4394                {
4395                recno = GET2(slot, 0);
4396                }
4397              else if ((recno =                /* Forward back reference */
4398                        find_parens(ptr, cd->bracount, name, namelen,
4399                          (options & PCRE_EXTENDED) != 0)) <= 0)
4400              {              {
4401              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
4402              goto FAILED;              goto FAILED;
4403              }              }
4404              }
4405    
4406            recno = GET2(slot, 0);          /* In both phases, we can now go to the code that handles
4407            recursion or backreferences. */
           if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */  
   
           /* Back reference */  
4408    
4409            previous = code;          if (is_recurse) goto HANDLE_RECURSION;
4410            *code++ = OP_REF;            else goto HANDLE_REFERENCE;
           PUT2INC(code, 0, recno);  
           cd->backref_map |= (recno < 32)? (1 << recno) : 1;  
           if (recno > cd->top_backref) cd->top_backref = recno;  
           continue;  
           }  
4411    
         /* Should never happen */  
         break;  
4412    
4413          case 'R':                 /* Pattern recursion */          /* ------------------------------------------------------------ */
4414            case 'R':                 /* Recursion */
4415          ptr++;                    /* Same as (?0)      */          ptr++;                    /* Same as (?0)      */
4416          /* Fall through */          /* Fall through */
4417    
         /* Recursion or "subroutine" call */  
4418    
4419          case '0': case '1': case '2': case '3': case '4':          /* ------------------------------------------------------------ */
4420          case '5': case '6': case '7': case '8': case '9':          case '-': case '+':
4421            case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4422            case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4423            {            {
4424            const uschar *called;            const uschar *called;
4425    
4426              if ((refsign = *ptr) == '+') ptr++;
4427              else if (refsign == '-')
4428                {
4429                if ((digitab[ptr[1]] & ctype_digit) == 0)
4430                  goto OTHER_CHAR_AFTER_QUERY;
4431                ptr++;
4432                }
4433    
4434            recno = 0;            recno = 0;
4435            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4436              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4437    
4438              if (*ptr != ')')
4439                {
4440                *errorcodeptr = ERR29;
4441                goto FAILED;
4442                }
4443    
4444              if (refsign == '-')
4445                {
4446                if (recno == 0)
4447                  {
4448                  *errorcodeptr = ERR58;
4449                  goto FAILED;
4450                  }
4451                recno = cd->bracount - recno + 1;
4452                if (recno <= 0)
4453                  {
4454                  *errorcodeptr = ERR15;
4455                  goto FAILED;
4456                  }
4457                }
4458              else if (refsign == '+')
4459                {
4460                if (recno == 0)
4461                  {
4462                  *errorcodeptr = ERR58;
4463                  goto FAILED;
4464                  }
4465                recno += cd->bracount;
4466                }
4467    
4468            /* Come here from code above that handles a named recursion */            /* Come here from code above that handles a named recursion */
4469    
4470            HANDLE_RECURSION:            HANDLE_RECURSION:
4471    
4472            previous = code;            previous = code;
4473              called = cd->start_code;
4474    
4475            /* Find the bracket that is being referenced. Temporarily end the            /* When we are actually compiling, find the bracket that is being
4476            regex in case it doesn't exist. */            referenced. Temporarily end the regex in case it doesn't exist before
4477              this point. If we end up with a forward reference, first check that
4478              the bracket does occur later so we can give the error (and position)
4479              now. Then remember this forward reference in the workspace so it can
4480              be filled in at the end. */
4481    
4482            *code = OP_END;            if (lengthptr == NULL)
           called = (recno == 0)?  
             cd->start_code : find_bracket(cd->start_code, utf8, recno);  
   
           if (called == NULL)  
4483              {              {
4484              *errorcodeptr = ERR15;              *code = OP_END;
4485              goto FAILED;              if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
             }  
4486    
4487            /* If the subpattern is still open, this is a recursive call. We              /* Forward reference */
           check to see if this is a left recursion that could loop for ever,  
           and diagnose that case. */  
4488    
4489            if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))              if (called == NULL)
4490              {                {
4491              *errorcodeptr = ERR40;                if (find_parens(ptr, cd->bracount, NULL, recno,
4492              goto FAILED;                     (options & PCRE_EXTENDED) != 0) < 0)
4493                    {
4494                    *errorcodeptr = ERR15;
4495                    goto FAILED;
4496                    }
4497                  called = cd->start_code + recno;