/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 79 by nigel, Sat Feb 24 21:40:52 2007 UTC | revision 211 by ph10, Thu Aug 9 09:52:43 2007 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2005 University of Cambridge             Copyright (c) 1997-2007 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45    #ifdef HAVE_CONFIG_H
46    #include <config.h>
47    #endif
48    
49    #define NLBLOCK cd             /* Block containing newline information */
50    #define PSSTART start_pattern  /* Field containing processed string start */
51    #define PSEND   end_pattern    /* Field containing processed string end */
52    
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
56    /* When DEBUG is defined, we need the pcre_printint() function, which is also
57    used by pcretest. DEBUG is not defined when building a production library. */
58    
59    #ifdef DEBUG
60    #include "pcre_printint.src"
61    #endif
62    
63    
64    /* Macro for setting individual bits in class bitmaps. */
65    
66    #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
67    
68    /* Maximum length value to check against when making sure that the integer that
69    holds the compiled pattern length does not overflow. We make it a bit less than
70    INT_MAX to allow for adding in group terminating bytes, so that we don't have
71    to check them every time. */
72    
73    #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76  /*************************************************  /*************************************************
77  *      Code parameters and static tables         *  *      Code parameters and static tables         *
78  *************************************************/  *************************************************/
79    
80  /* Maximum number of items on the nested bracket stacks at compile time. This  /* This value specifies the size of stack workspace that is used during the
81  applies to the nesting of all kinds of parentheses. It does not limit  first pre-compile phase that determines how much memory is required. The regex
82  un-nested, non-capturing parentheses. This number can be made bigger if  is partly compiled into this space, but the compiled parts are discarded as
83  necessary - it is used to dimension one int and one unsigned char vector at  soon as they can be, so that hopefully there will never be an overrun. The code
84  compile time. */  does, however, check for an overrun. The largest amount I've seen used is 218,
85    so this number is very generous.
86    
87    The same workspace is used during the second, actual compile phase for
88    remembering forward references to groups so that they can be filled in at the
89    end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90    is 4 there is plenty of room. */
91    
92  #define BRASTACK_SIZE 200  #define COMPILE_WORK_SIZE (4096)
93    
94    
95  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 63  are simple data values; negative values Line 97  are simple data values; negative values
97  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
98  is invalid. */  is invalid. */
99    
100  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
101  static const short int escapes[] = {  static const short int escapes[] = {
102       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
103       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
105       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */  -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
106  -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
107  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
108     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
109       0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */  -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
110  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
111       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
112  };  };
113    
114  #else         /* This is the "abnormal" table for EBCDIC systems */  #else           /* This is the "abnormal" table for EBCDIC systems */
115  static const short int escapes[] = {  static const short int escapes[] = {
116  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
117  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 87  static const short int escapes[] = { Line 121  static const short int escapes[] = {
121  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
122  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
123  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
124  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
125  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
126  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
127  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
128  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
129  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
130  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
131  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
132  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
133  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
134  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
135  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
136  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
137  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
138  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
# Line 106  static const short int escapes[] = { Line 140  static const short int escapes[] = {
140  #endif  #endif
141    
142    
143    /* Table of special "verbs" like (*PRUNE) */
144    
145    typedef struct verbitem {
146      const char *name;
147      int   len;
148      int   op;
149    } verbitem;
150    
151    static verbitem verbs[] = {
152      { "ACCEPT", 6, OP_ACCEPT },
153      { "COMMIT", 6, OP_COMMIT },
154      { "F",      1, OP_FAIL },
155      { "FAIL",   4, OP_FAIL },
156      { "PRUNE",  5, OP_PRUNE },
157      { "SKIP",   4, OP_SKIP  },
158      { "THEN",   4, OP_THEN  }
159    };
160    
161    static int verbcount = sizeof(verbs)/sizeof(verbitem);
162    
163    
164  /* Tables of names of POSIX character classes and their lengths. The list is  /* Tables of names of POSIX character classes and their lengths. The list is
165  terminated by a zero length entry. The first three must be alpha, upper, lower,  terminated by a zero length entry. The first three must be alpha, lower, upper,
166  as this is assumed for handling case independence. */  as this is assumed for handling case independence. */
167    
168  static const char *const posix_names[] = {  static const char *const posix_names[] = {
# Line 118  static const char *const posix_names[] = Line 173  static const char *const posix_names[] =
173  static const uschar posix_name_lengths[] = {  static const uschar posix_name_lengths[] = {
174    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
175    
176  /* Table of class bit maps for each POSIX class; up to three may be combined  /* Table of class bit maps for each POSIX class. Each class is formed from a
177  to form the class. The table for [:blank:] is dynamically modified to remove  base map, with an optional addition or removal of another map. Then, for some
178  the vertical space characters. */  classes, there is some additional tweaking: for [:blank:] the vertical space
179    characters are removed, and for [:alpha:] and [:alnum:] the underscore
180    character is removed. The triples in the table consist of the base map offset,
181    second map offset or -1 if no second map, and a non-negative value for map
182    addition or a negative value for map subtraction (if there are two maps). The
183    absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
184    remove vertical space characters, 2 => remove underscore. */
185    
186  static const int posix_class_maps[] = {  static const int posix_class_maps[] = {
187    cbit_lower, cbit_upper, -1,             /* alpha */    cbit_word,  cbit_digit, -2,             /* alpha */
188    cbit_lower, -1,         -1,             /* lower */    cbit_lower, -1,          0,             /* lower */
189    cbit_upper, -1,         -1,             /* upper */    cbit_upper, -1,          0,             /* upper */
190    cbit_digit, cbit_lower, cbit_upper,     /* alnum */    cbit_word,  -1,          2,             /* alnum - word without underscore */
191    cbit_print, cbit_cntrl, -1,             /* ascii */    cbit_print, cbit_cntrl,  0,             /* ascii */
192    cbit_space, -1,         -1,             /* blank - a GNU extension */    cbit_space, -1,          1,             /* blank - a GNU extension */
193    cbit_cntrl, -1,         -1,             /* cntrl */    cbit_cntrl, -1,          0,             /* cntrl */
194    cbit_digit, -1,         -1,             /* digit */    cbit_digit, -1,          0,             /* digit */
195    cbit_graph, -1,         -1,             /* graph */    cbit_graph, -1,          0,             /* graph */
196    cbit_print, -1,         -1,             /* print */    cbit_print, -1,          0,             /* print */
197    cbit_punct, -1,         -1,             /* punct */    cbit_punct, -1,          0,             /* punct */
198    cbit_space, -1,         -1,             /* space */    cbit_space, -1,          0,             /* space */
199    cbit_word,  -1,         -1,             /* word - a Perl extension */    cbit_word,  -1,          0,             /* word - a Perl extension */
200    cbit_xdigit,-1,         -1              /* xdigit */    cbit_xdigit,-1,          0              /* xdigit */
201  };  };
202    
203    
204    #define STRING(a)  # a
205    #define XSTRING(s) STRING(s)
206    
207  /* The texts of compile-time error messages. These are "char *" because they  /* The texts of compile-time error messages. These are "char *" because they
208  are passed to the outside world. */  are passed to the outside world. Do not ever re-use any error number, because
209    they are documented. Always add a new error instead. Messages marked DEAD below
210    are no longer used. */
211    
212  static const char *error_texts[] = {  static const char *error_texts[] = {
213    "no error",    "no error",
# Line 156  static const char *error_texts[] = { Line 222  static const char *error_texts[] = {
222    "range out of order in character class",    "range out of order in character class",
223    "nothing to repeat",    "nothing to repeat",
224    /* 10 */    /* 10 */
225    "operand of unlimited repeat could match the empty string",    "operand of unlimited repeat could match the empty string",  /** DEAD **/
226    "internal error: unexpected repeat",    "internal error: unexpected repeat",
227    "unrecognized character after (?",    "unrecognized character after (?",
228    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class",
# Line 166  static const char *error_texts[] = { Line 232  static const char *error_texts[] = {
232    "erroffset passed as NULL",    "erroffset passed as NULL",
233    "unknown option bit(s) set",    "unknown option bit(s) set",
234    "missing ) after comment",    "missing ) after comment",
235    "parentheses nested too deeply",    "parentheses nested too deeply",  /** DEAD **/
236    /* 20 */    /* 20 */
237    "regular expression too large",    "regular expression is too large",
238    "failed to get memory",    "failed to get memory",
239    "unmatched parentheses",    "unmatched parentheses",
240    "internal error: code overflow",    "internal error: code overflow",
241    "unrecognized character after (?<",    "unrecognized character after (?<",
242    /* 25 */    /* 25 */
243    "lookbehind assertion is not fixed length",    "lookbehind assertion is not fixed length",
244    "malformed number after (?(",    "malformed number or name after (?(",
245    "conditional group contains more than two branches",    "conditional group contains more than two branches",
246    "assertion expected after (?(",    "assertion expected after (?(",
247    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )",
248    /* 30 */    /* 30 */
249    "unknown POSIX class name",    "unknown POSIX class name",
250    "POSIX collating elements are not supported",    "POSIX collating elements are not supported",
251    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support",
252    "spare error",    "spare error",  /** DEAD **/
253    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large",
254    /* 35 */    /* 35 */
255    "invalid condition (?(0)",    "invalid condition (?(0)",
# Line 194  static const char *error_texts[] = { Line 260  static const char *error_texts[] = {
260    /* 40 */    /* 40 */
261    "recursive call could loop indefinitely",    "recursive call could loop indefinitely",
262    "unrecognized character after (?P",    "unrecognized character after (?P",
263    "syntax error after (?P",    "syntax error in subpattern name (missing terminator)",
264    "two named groups have the same name",    "two named subpatterns have the same name",
265    "invalid UTF-8 string",    "invalid UTF-8 string",
266    /* 45 */    /* 45 */
267    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled",
268    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence",
269    "unknown property name after \\P or \\p"    "unknown property name after \\P or \\p",
270      "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
271      "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
272      /* 50 */
273      "repeated subpattern is too long",    /** DEAD **/
274      "octal value is greater than \\377 (not in UTF-8 mode)",
275      "internal error: overran compiling workspace",
276      "internal error: previously-checked referenced subpattern not found",
277      "DEFINE group contains more than one branch",
278      /* 55 */
279      "repeating a DEFINE group is not allowed",
280      "inconsistent NEWLINE options",
281      "\\g is not followed by a braced name or an optionally braced non-zero number",
282      "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number",
283      "(*VERB) with an argument is not supported",
284      /* 60 */
285      "(*VERB) not recognized"
286  };  };
287    
288    
# Line 220  For convenience, we use the same bit def Line 302  For convenience, we use the same bit def
302    
303  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
304    
305  #if !EBCDIC    /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
306  static const unsigned char digitab[] =  static const unsigned char digitab[] =
307    {    {
308    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 256  static const unsigned char digitab[] = Line 338  static const unsigned char digitab[] =
338    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
339    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
340    
341  #else          /* This is the "abnormal" case, for EBCDIC systems */  #else           /* This is the "abnormal" case, for EBCDIC systems */
342  static const unsigned char digitab[] =  static const unsigned char digitab[] =
343    {    {
344    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 270  static const unsigned char digitab[] = Line 352  static const unsigned char digitab[] =
352    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
353    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
354    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
355    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88-     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
356    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
357    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
358    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
# Line 304  static const unsigned char ebcdic_charta Line 386  static const unsigned char ebcdic_charta
386    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
387    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
388    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
389    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88-  */    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
390    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
391    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
392    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
# Line 331  static const unsigned char ebcdic_charta Line 413  static const unsigned char ebcdic_charta
413  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
414    
415  static BOOL  static BOOL
416    compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
417      int *, int *, branch_chain *, compile_data *);      int *, int *, branch_chain *, compile_data *, int *);
418    
419    
420    
# Line 342  static BOOL Line 424  static BOOL
424    
425  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
426  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
427  encodes one of the more complicated things such as \d. When UTF-8 is enabled,  encodes one of the more complicated things such as \d. A backreference to group
428  a positive value greater than 255 may be returned. On entry, ptr is pointing at  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
429  the \. On exit, it is on the final character of the escape sequence.  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
430    ptr is pointing at the \. On exit, it is on the final character of the escape
431    sequence.
432    
433  Arguments:  Arguments:
434    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
# Line 362  static int Line 446  static int
446  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
447    int options, BOOL isclass)    int options, BOOL isclass)
448  {  {
449  const uschar *ptr = *ptrptr;  BOOL utf8 = (options & PCRE_UTF8) != 0;
450    const uschar *ptr = *ptrptr + 1;
451  int c, i;  int c, i;
452    
453    GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
454    ptr--;                            /* Set pointer back to the last byte */
455    
456  /* If backslash is at the end of the pattern, it's an error. */  /* If backslash is at the end of the pattern, it's an error. */
457    
 c = *(++ptr);  
458  if (c == 0) *errorcodeptr = ERR1;  if (c == 0) *errorcodeptr = ERR1;
459    
460  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
461  a table. A non-zero result is something that can be returned immediately.  a table. A non-zero result is something that can be returned immediately.
462  Otherwise further processing may be required. */  Otherwise further processing may be required. */
463    
464  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
465  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < '0' || c > 'z') {}                           /* Not alphameric */
466  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - '0']) != 0) c = i;
467    
468  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
469  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
470  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
471  #endif  #endif
# Line 388  else if ((i = escapes[c - 0x48]) != 0) Line 475  else if ((i = escapes[c - 0x48]) != 0)
475  else  else
476    {    {
477    const uschar *oldptr;    const uschar *oldptr;
478      BOOL braced, negated;
479    
480    switch (c)    switch (c)
481      {      {
482      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
# Line 401  else Line 490  else
490      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
491      break;      break;
492    
493        /* \g must be followed by a number, either plain or braced. If positive, it
494        is an absolute backreference. If negative, it is a relative backreference.
495        This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
496        reference to a named group. This is part of Perl's movement towards a
497        unified syntax for back references. As this is synonymous with \k{name}, we
498        fudge it up by pretending it really was \k. */
499    
500        case 'g':
501        if (ptr[1] == '{')
502          {
503          const uschar *p;
504          for (p = ptr+2; *p != 0 && *p != '}'; p++)
505            if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
506          if (*p != 0 && *p != '}')
507            {
508            c = -ESC_k;
509            break;
510            }
511          braced = TRUE;
512          ptr++;
513          }
514        else braced = FALSE;
515    
516        if (ptr[1] == '-')
517          {
518          negated = TRUE;
519          ptr++;
520          }
521        else negated = FALSE;
522    
523        c = 0;
524        while ((digitab[ptr[1]] & ctype_digit) != 0)
525          c = c * 10 + *(++ptr) - '0';
526    
527        if (c == 0 || (braced && *(++ptr) != '}'))
528          {
529          *errorcodeptr = ERR57;
530          return 0;
531          }
532    
533        if (negated)
534          {
535          if (c > bracount)
536            {
537            *errorcodeptr = ERR15;
538            return 0;
539            }
540          c = bracount - (c - 1);
541          }
542    
543        c = -(ESC_REF + c);
544        break;
545    
546      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
547      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
548      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 442  else Line 584  else
584        }        }
585    
586      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
587      larger first octal digit. */      larger first octal digit. The original code used just to take the least
588        significant 8 bits of octal numbers (I think this is what early Perls used
589        to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
590        than 3 octal digits. */
591    
592      case '0':      case '0':
593      c -= '0';      c -= '0';
594      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
595          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
596      c &= 255;     /* Take least significant 8 bits */      if (!utf8 && c > 255) *errorcodeptr = ERR51;
597      break;      break;
598    
599      /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number      /* \x is complicated. \x{ddd} is a character number which can be greater
600      which can be greater than 0xff, but only if the ddd are hex digits. */      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
601        treated as a data character. */
602    
603      case 'x':      case 'x':
604  #ifdef SUPPORT_UTF8      if (ptr[1] == '{')
     if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)  
605        {        {
606        const uschar *pt = ptr + 2;        const uschar *pt = ptr + 2;
607        register int count = 0;        int count = 0;
608    
609        c = 0;        c = 0;
610        while ((digitab[*pt] & ctype_xdigit) != 0)        while ((digitab[*pt] & ctype_xdigit) != 0)
611          {          {
612          int cc = *pt++;          register int cc = *pt++;
613            if (c == 0 && cc == '0') continue;     /* Leading zeroes */
614          count++;          count++;
615  #if !EBCDIC    /* ASCII coding */  
616    #ifndef EBCDIC  /* ASCII coding */
617          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= 'a') cc -= 32;               /* Convert to upper case */
618          c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
619  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
620          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
621          c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
622  #endif  #endif
623          }          }
624    
625        if (*pt == '}')        if (*pt == '}')
626          {          {
627          if (c < 0 || count > 8) *errorcodeptr = ERR34;          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
628          ptr = pt;          ptr = pt;
629          break;          break;
630          }          }
631    
632        /* If the sequence of hex digits does not end with '}', then we don't        /* If the sequence of hex digits does not end with '}', then we don't
633        recognize this construct; fall through to the normal \x handling. */        recognize this construct; fall through to the normal \x handling. */
634        }        }
 #endif  
635    
636      /* Read just a single hex char */      /* Read just a single-byte hex-defined char */
637    
638      c = 0;      c = 0;
639      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
640        {        {
641        int cc;                               /* Some compilers don't like ++ */        int cc;                               /* Some compilers don't like ++ */
642        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                        /* in initializers */
643  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
644        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= 'a') cc -= 32;              /* Convert to upper case */
645        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
646  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
647        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= 'z') cc += 64;              /* Convert to upper case */
648        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
649  #endif  #endif
650        }        }
651      break;      break;
652    
653      /* Other special escapes not starting with a digit are straightforward */      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
654        This coding is ASCII-specific, but then the whole concept of \cx is
655        ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
656    
657      case 'c':      case 'c':
658      c = *(++ptr);      c = *(++ptr);
# Line 511  else Line 662  else
662        return 0;        return 0;
663        }        }
664    
665      /* A letter is upper-cased; then the 0x40 bit is flipped. This coding  #ifndef EBCDIC  /* ASCII coding */
     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.  
     (However, an EBCDIC equivalent has now been added.) */  
   
 #if !EBCDIC    /* ASCII coding */  
666      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= 'a' && c <= 'z') c -= 32;
667      c ^= 0x40;      c ^= 0x40;
668  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
669      if (c >= 'a' && c <= 'z') c += 64;      if (c >= 'a' && c <= 'z') c += 64;
670      c ^= 0xC0;      c ^= 0xC0;
671  #endif  #endif
# Line 560  escape sequence. Line 707  escape sequence.
707  Argument:  Argument:
708    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
709    negptr         points to a boolean that is set TRUE for negation else FALSE    negptr         points to a boolean that is set TRUE for negation else FALSE
710      dptr           points to an int that is set to the detailed property value
711    errorcodeptr   points to the error code variable    errorcodeptr   points to the error code variable
712    
713  Returns:     value from ucp_type_table, or -1 for an invalid type  Returns:         type value from ucp_type_table, or -1 for an invalid type
714  */  */
715    
716  static int  static int
717  get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)  get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
718  {  {
719  int c, i, bot, top;  int c, i, bot, top;
720  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
721  char name[4];  char name[32];
722    
723  c = *(++ptr);  c = *(++ptr);
724  if (c == 0) goto ERROR_RETURN;  if (c == 0) goto ERROR_RETURN;
725    
726  *negptr = FALSE;  *negptr = FALSE;
727    
728  /* \P or \p can be followed by a one- or two-character name in {}, optionally  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
729  preceded by ^ for negation. */  negation. */
730    
731  if (c == '{')  if (c == '{')
732    {    {
# Line 587  if (c == '{') Line 735  if (c == '{')
735      *negptr = TRUE;      *negptr = TRUE;
736      ptr++;      ptr++;
737      }      }
738    for (i = 0; i <= 2; i++)    for (i = 0; i < (int)sizeof(name) - 1; i++)
739      {      {
740      c = *(++ptr);      c = *(++ptr);
741      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
742      if (c == '}') break;      if (c == '}') break;
743      name[i] = c;      name[i] = c;
744      }      }
745    if (c !='}')   /* Try to distinguish error cases */    if (c !='}') goto ERROR_RETURN;
     {  
     while (*(++ptr) != 0 && *ptr != '}');  
     if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;  
     }  
746    name[i] = 0;    name[i] = 0;
747    }    }
748    
# Line 619  top = _pcre_utt_size; Line 763  top = _pcre_utt_size;
763    
764  while (bot < top)  while (bot < top)
765    {    {
766    i = (bot + top)/2;    i = (bot + top) >> 1;
767    c = strcmp(name, _pcre_utt[i].name);    c = strcmp(name, _pcre_utt[i].name);
768    if (c == 0) return _pcre_utt[i].value;    if (c == 0)
769        {
770        *dptr = _pcre_utt[i].value;
771        return _pcre_utt[i].type;
772        }
773    if (c > 0) bot = i + 1; else top = i;    if (c > 0) bot = i + 1; else top = i;
774    }    }
775    
 UNKNOWN_RETURN:  
776  *errorcodeptr = ERR47;  *errorcodeptr = ERR47;
777  *ptrptr = ptr;  *ptrptr = ptr;
778  return -1;  return -1;
# Line 698  read_repeat_counts(const uschar *p, int Line 845  read_repeat_counts(const uschar *p, int
845  int min = 0;  int min = 0;
846  int max = -1;  int max = -1;
847    
848    /* Read the minimum value and do a paranoid check: a negative value indicates
849    an integer overflow. */
850    
851  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';  while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
852    if (min < 0 || min > 65535)
853      {
854      *errorcodeptr = ERR5;
855      return p;
856      }
857    
858    /* Read the maximum value if there is one, and again do a paranoid check on its size.
859    Also, max must not be less than min. */
860    
861  if (*p == '}') max = min; else  if (*p == '}') max = min; else
862    {    {
# Line 706  if (*p == '}') max = min; else Line 864  if (*p == '}') max = min; else
864      {      {
865      max = 0;      max = 0;
866      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';      while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
867        if (max < 0 || max > 65535)
868          {
869          *errorcodeptr = ERR5;
870          return p;
871          }
872      if (max < min)      if (max < min)
873        {        {
874        *errorcodeptr = ERR4;        *errorcodeptr = ERR4;
# Line 714  if (*p == '}') max = min; else Line 877  if (*p == '}') max = min; else
877      }      }
878    }    }
879    
880  /* Do paranoid checks, then fill in the required variables, and pass back the  /* Fill in the required variables, and pass back the pointer to the terminating
881  pointer to the terminating '}'. */  '}'. */
882    
883  if (min > 65535 || max > 65535)  *minp = min;
884    *errorcodeptr = ERR5;  *maxp = max;
885  else  return p;
886    }
887    
888    
889    
890    /*************************************************
891    *       Find forward referenced subpattern       *
892    *************************************************/
893    
894    /* This function scans along a pattern's text looking for capturing
895    subpatterns, and counting them. If it finds a named pattern that matches the
896    name it is given, it returns its number. Alternatively, if the name is NULL, it
897    returns when it reaches a given numbered subpattern. This is used for forward
898    references to subpatterns. We know that if (?P< is encountered, the name will
899    be terminated by '>' because that is checked in the first pass.
900    
901    Arguments:
902      ptr          current position in the pattern
903      count        current count of capturing parens so far encountered
904      name         name to seek, or NULL if seeking a numbered subpattern
905      lorn         name length, or subpattern number if name is NULL
906      xmode        TRUE if we are in /x mode
907    
908    Returns:       the number of the named subpattern, or -1 if not found
909    */
910    
911    static int
912    find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
913      BOOL xmode)
914    {
915    const uschar *thisname;
916    
917    for (; *ptr != 0; ptr++)
918    {    {
919    *minp = min;    int term;
920    *maxp = max;  
921      /* Skip over backslashed characters and also entire \Q...\E */
922    
923      if (*ptr == '\\')
924        {
925        if (*(++ptr) == 0) return -1;
926        if (*ptr == 'Q') for (;;)
927          {
928          while (*(++ptr) != 0 && *ptr != '\\');
929          if (*ptr == 0) return -1;
930          if (*(++ptr) == 'E') break;
931          }
932        continue;
933        }
934    
935      /* Skip over character classes */
936    
937      if (*ptr == '[')
938        {
939        while (*(++ptr) != ']')
940          {
941          if (*ptr == '\\')
942            {
943            if (*(++ptr) == 0) return -1;
944            if (*ptr == 'Q') for (;;)
945              {
946              while (*(++ptr) != 0 && *ptr != '\\');
947              if (*ptr == 0) return -1;
948              if (*(++ptr) == 'E') break;
949              }
950            continue;
951            }
952          }
953        continue;
954        }
955    
956      /* Skip comments in /x mode */
957    
958      if (xmode && *ptr == '#')
959        {
960        while (*(++ptr) != 0 && *ptr != '\n');
961        if (*ptr == 0) return -1;
962        continue;
963        }
964    
965      /* An opening parens must now be a real metacharacter */
966    
967      if (*ptr != '(') continue;
968      if (ptr[1] != '?' && ptr[1] != '*')
969        {
970        count++;
971        if (name == NULL && count == lorn) return count;
972        continue;
973        }
974    
975      ptr += 2;
976      if (*ptr == 'P') ptr++;                      /* Allow optional P */
977    
978      /* We have to disambiguate (?<! and (?<= from (?<name> */
979    
980      if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
981           *ptr != '\'')
982        continue;
983    
984      count++;
985    
986      if (name == NULL && count == lorn) return count;
987      term = *ptr++;
988      if (term == '<') term = '>';
989      thisname = ptr;
990      while (*ptr != term) ptr++;
991      if (name != NULL && lorn == ptr - thisname &&
992          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
993        return count;
994    }    }
995  return p;  
996    return -1;
997  }  }
998    
999    
# Line 778  for (;;) Line 1047  for (;;)
1047    
1048      case OP_CALLOUT:      case OP_CALLOUT:
1049      case OP_CREF:      case OP_CREF:
1050      case OP_BRANUMBER:      case OP_RREF:
1051        case OP_DEF:
1052      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1053      break;      break;
1054    
# Line 823  for (;;) Line 1093  for (;;)
1093    {    {
1094    int d;    int d;
1095    register int op = *cc;    register int op = *cc;
   if (op >= OP_BRA) op = OP_BRA;  
1096    
1097    switch (op)    switch (op)
1098      {      {
1099        case OP_CBRA:
1100      case OP_BRA:      case OP_BRA:
1101      case OP_ONCE:      case OP_ONCE:
1102      case OP_COND:      case OP_COND:
1103      d = find_fixedlength(cc, options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1104      if (d < 0) return d;      if (d < 0) return d;
1105      branchlength += d;      branchlength += d;
1106      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 865  for (;;) Line 1135  for (;;)
1135      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1136    
1137      case OP_REVERSE:      case OP_REVERSE:
     case OP_BRANUMBER:  
1138      case OP_CREF:      case OP_CREF:
1139        case OP_RREF:
1140        case OP_DEF:
1141      case OP_OPT:      case OP_OPT:
1142      case OP_CALLOUT:      case OP_CALLOUT:
1143      case OP_SOD:      case OP_SOD:
# Line 884  for (;;) Line 1155  for (;;)
1155    
1156      case OP_CHAR:      case OP_CHAR:
1157      case OP_CHARNC:      case OP_CHARNC:
1158        case OP_NOT:
1159      branchlength++;      branchlength++;
1160      cc += 2;      cc += 2;
1161  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 917  for (;;) Line 1189  for (;;)
1189    
1190      case OP_PROP:      case OP_PROP:
1191      case OP_NOTPROP:      case OP_NOTPROP:
1192      cc++;      cc += 2;
1193      /* Fall through */      /* Fall through */
1194    
1195      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
# Line 998  Returns:      pointer to the opcode for Line 1270  Returns:      pointer to the opcode for
1270  static const uschar *  static const uschar *
1271  find_bracket(const uschar *code, BOOL utf8, int number)  find_bracket(const uschar *code, BOOL utf8, int number)
1272  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1273  for (;;)  for (;;)
1274    {    {
1275    register int c = *code;    register int c = *code;
1276    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1277    else if (c > OP_BRA)  
1278      /* XCLASS is used for classes that cannot be represented just by a bit
1279      map. This includes negated single high-valued characters. The length in
1280      the table is zero; the actual length is stored in the compiled code. */
1281    
1282      if (c == OP_XCLASS) code += GET(code, 1);
1283    
1284      /* Handle capturing bracket */
1285    
1286      else if (c == OP_CBRA)
1287      {      {
1288      int n = c - OP_BRA;      int n = GET2(code, 1+LINK_SIZE);
     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);  
1289      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1290      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[c];
1291      }      }
1292    
1293      /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1294      a multi-byte character. The length in the table is a minimum, so we have to
1295      arrange to skip the extra bytes. */
1296    
1297    else    else
1298      {      {
1299      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
1300  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1301      if (utf8) switch(c)      if (utf8) switch(c)
1302        {        {
1303        case OP_CHAR:        case OP_CHAR:
# Line 1031  for (;;) Line 1305  for (;;)
1305        case OP_EXACT:        case OP_EXACT:
1306        case OP_UPTO:        case OP_UPTO:
1307        case OP_MINUPTO:        case OP_MINUPTO:
1308          case OP_POSUPTO:
1309        case OP_STAR:        case OP_STAR:
1310        case OP_MINSTAR:        case OP_MINSTAR:
1311          case OP_POSSTAR:
1312        case OP_PLUS:        case OP_PLUS:
1313        case OP_MINPLUS:        case OP_MINPLUS:
1314          case OP_POSPLUS:
1315        case OP_QUERY:        case OP_QUERY:
1316        case OP_MINQUERY:        case OP_MINQUERY:
1317        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1318        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1319        break;        break;
1320        }        }
1321  #endif  #endif
# Line 1072  Returns:      pointer to the opcode for Line 1342  Returns:      pointer to the opcode for
1342  static const uschar *  static const uschar *
1343  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const uschar *code, BOOL utf8)
1344  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1345  for (;;)  for (;;)
1346    {    {
1347    register int c = *code;    register int c = *code;
1348    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1349    else if (c == OP_RECURSE) return code;    if (c == OP_RECURSE) return code;
1350    else if (c > OP_BRA)  
1351      {    /* XCLASS is used for classes that cannot be represented just by a bit
1352      code += _pcre_OP_lengths[OP_BRA];    map. This includes negated single high-valued characters. The length in
1353      }    the table is zero; the actual length is stored in the compiled code. */
1354    
1355      if (c == OP_XCLASS) code += GET(code, 1);
1356    
1357      /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1358      that are followed by a character may be followed by a multi-byte character.
1359      The length in the table is a minimum, so we have to arrange to skip the extra
1360      bytes. */
1361    
1362    else    else
1363      {      {
1364      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
1365  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1366      if (utf8) switch(c)      if (utf8) switch(c)
1367        {        {
1368        case OP_CHAR:        case OP_CHAR:
# Line 1103  for (;;) Line 1370  for (;;)
1370        case OP_EXACT:        case OP_EXACT:
1371        case OP_UPTO:        case OP_UPTO:
1372        case OP_MINUPTO:        case OP_MINUPTO:
1373          case OP_POSUPTO:
1374        case OP_STAR:        case OP_STAR:
1375        case OP_MINSTAR:        case OP_MINSTAR:
1376          case OP_POSSTAR:
1377        case OP_PLUS:        case OP_PLUS:
1378        case OP_MINPLUS:        case OP_MINPLUS:
1379          case OP_POSPLUS:
1380        case OP_QUERY:        case OP_QUERY:
1381        case OP_MINQUERY:        case OP_MINQUERY:
1382        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1383        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1384        break;        break;
1385        }        }
1386  #endif  #endif
# Line 1132  for (;;) Line 1395  for (;;)
1395  *************************************************/  *************************************************/
1396    
1397  /* This function scans through a branch of a compiled pattern to see whether it  /* This function scans through a branch of a compiled pattern to see whether it
1398  can match the empty string or not. It is called only from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1399  below. Note that first_significant_code() skips over assertions. If we hit an  below and from compile_branch() when checking for an unlimited repeat of a
1400  unclosed bracket, we return "empty" - this means we've struck an inner bracket  group that can match nothing. Note that first_significant_code() skips over
1401  whose current branch will already have been scanned.  assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1402    struck an inner bracket whose current branch will already have been scanned.
1403    
1404  Arguments:  Arguments:
1405    code        points to start of search    code        points to start of search
# Line 1149  static BOOL Line 1413  static BOOL
1413  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1414  {  {
1415  register int c;  register int c;
1416  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1417       code < endcode;       code < endcode;
1418       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1419    {    {
# Line 1157  for (code = first_significant_code(code Line 1421  for (code = first_significant_code(code
1421    
1422    c = *code;    c = *code;
1423    
1424    if (c >= OP_BRA)    /* Groups with zero repeats can of course be empty; skip them. */
1425    
1426      if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1427        {
1428        code += _pcre_OP_lengths[c];
1429        do code += GET(code, 1); while (*code == OP_ALT);
1430        c = *code;
1431        continue;
1432        }
1433    
1434      /* For other groups, scan the branches. */
1435    
1436      if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1437      {      {
1438      BOOL empty_branch;      BOOL empty_branch;
1439      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1173  for (code = first_significant_code(code Line 1449  for (code = first_significant_code(code
1449        }        }
1450      while (*code == OP_ALT);      while (*code == OP_ALT);
1451      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
     code += 1 + LINK_SIZE;  
1452      c = *code;      c = *code;
1453        continue;
1454      }      }
1455    
1456    else switch (c)    /* Handle the other opcodes */
1457    
1458      switch (c)
1459      {      {
1460      /* Check for quantifiers after a class */      /* Check for quantifiers after a class */
1461    
# Line 1233  for (code = first_significant_code(code Line 1511  for (code = first_significant_code(code
1511      case OP_NOT:      case OP_NOT:
1512      case OP_PLUS:      case OP_PLUS:
1513      case OP_MINPLUS:      case OP_MINPLUS:
1514        case OP_POSPLUS:
1515      case OP_EXACT:      case OP_EXACT:
1516      case OP_NOTPLUS:      case OP_NOTPLUS:
1517      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
1518        case OP_NOTPOSPLUS:
1519      case OP_NOTEXACT:      case OP_NOTEXACT:
1520      case OP_TYPEPLUS:      case OP_TYPEPLUS:
1521      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
1522        case OP_TYPEPOSPLUS:
1523      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1524      return FALSE;      return FALSE;
1525    
# Line 1250  for (code = first_significant_code(code Line 1531  for (code = first_significant_code(code
1531      case OP_ALT:      case OP_ALT:
1532      return TRUE;      return TRUE;
1533    
1534      /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1535      followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
1536    
1537  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1538      case OP_STAR:      case OP_STAR:
1539      case OP_MINSTAR:      case OP_MINSTAR:
1540        case OP_POSSTAR:
1541      case OP_QUERY:      case OP_QUERY:
1542      case OP_MINQUERY:      case OP_MINQUERY:
1543        case OP_POSQUERY:
1544      case OP_UPTO:      case OP_UPTO:
1545      case OP_MINUPTO:      case OP_MINUPTO:
1546        case OP_POSUPTO:
1547      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1548      break;      break;
1549  #endif  #endif
# Line 1377  earlier groups that are outside the curr Line 1661  earlier groups that are outside the curr
1661  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1662  it, after it has been compiled. This means that any OP_RECURSE items within it  it, after it has been compiled. This means that any OP_RECURSE items within it
1663  that refer to the group itself or any contained groups have to have their  that refer to the group itself or any contained groups have to have their
1664  offsets adjusted. That is the job of this function. Before it is called, the  offsets adjusted. That is one of the jobs of this function. Before it is called,
1665  partially compiled regex must be temporarily terminated with OP_END.  the partially compiled regex must be temporarily terminated with OP_END.
1666    
1667    This function has been extended with the possibility of forward references for
1668    recursions and subroutine calls. It must also check the list of such references
1669    for the group we are dealing with. If it finds that one of the recursions in
1670    the current group is on this list, it adjusts the offset in the list, not the
1671    value in the reference (which is a group number).
1672    
1673  Arguments:  Arguments:
1674    group      points to the start of the group    group      points to the start of the group
1675    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
1676    utf8       TRUE in UTF-8 mode    utf8       TRUE in UTF-8 mode
1677    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
1678      save_hwm   the hwm forward reference pointer at the start of the group
1679    
1680  Returns:     nothing  Returns:     nothing
1681  */  */
1682    
1683  static void  static void
1684  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1685      uschar *save_hwm)
1686  {  {
1687  uschar *ptr = group;  uschar *ptr = group;
1688  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1689    {    {
1690    int offset = GET(ptr, 1);    int offset;
1691    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);    uschar *hc;
1692    
1693      /* See if this recursion is on the forward reference list. If so, adjust the
1694      reference. */
1695    
1696      for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1697        {
1698        offset = GET(hc, 0);
1699        if (cd->start_code + offset == ptr + 1)
1700          {
1701          PUT(hc, 0, offset + adjust);
1702          break;
1703          }
1704        }
1705    
1706      /* Otherwise, adjust the recursion offset if it's after the start of this
1707      group. */
1708    
1709      if (hc >= cd->hwm)
1710        {
1711        offset = GET(ptr, 1);
1712        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1713        }
1714    
1715    ptr += 1 + LINK_SIZE;    ptr += 1 + LINK_SIZE;
1716    }    }
1717  }  }
# Line 1475  Yield:        TRUE when range returned; Line 1790  Yield:        TRUE when range returned;
1790  */  */
1791    
1792  static BOOL  static BOOL
1793  get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1794      unsigned int *odptr)
1795  {  {
1796  int c, chartype, othercase, next;  unsigned int c, othercase, next;
1797    
1798  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
1799    {    { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
   if (_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0)  
     break;  
   }  
1800    
1801  if (c > d) return FALSE;  if (c > d) return FALSE;
1802    
# Line 1492  next = othercase + 1; Line 1805  next = othercase + 1;
1805    
1806  for (++c; c <= d; c++)  for (++c; c <= d; c++)
1807    {    {
1808    if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_L ||    if (_pcre_ucp_othercase(c) != next) break;
         othercase != next)  
     break;  
1809    next++;    next++;
1810    }    }
1811    
# Line 1506  return TRUE; Line 1817  return TRUE;
1817  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1818    
1819    
1820    
1821  /*************************************************  /*************************************************
1822  *           Compile one branch                   *  *     Check if auto-possessifying is possible    *
1823  *************************************************/  *************************************************/
1824    
1825  /* Scan the pattern, compiling it into the code vector. If the options are  /* This function is called for unlimited repeats of certain items, to see
1826  changed during the branch, the pointer is used to change the external options  whether the next thing could possibly match the repeated item. If not, it makes
1827  bits.  sense to automatically possessify the repeated item.
1828    
1829  Arguments:  Arguments:
1830    optionsptr     pointer to the option bits    op_code       the repeated op code
1831    brackets       points to number of extracting brackets used    item          data for this item, depends on the opcode
1832    codeptr        points to the pointer to the current code point    utf8          TRUE in UTF-8 mode
1833    ptrptr         points to the current pattern pointer    utf8_char     used for utf8 character bytes, NULL if not relevant
1834    errorcodeptr   points to error code variable    ptr           next character in pattern
1835    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    options       options bits
1836    reqbyteptr     set to the last literal character required, else < 0    cd            contains pointers to tables etc.
   bcptr          points to current branch chain  
   cd             contains pointers to tables etc.  
1837    
1838  Returns:         TRUE on success  Returns:        TRUE if possessifying is wanted
                  FALSE, with *errorcodeptr set non-zero on error  
1839  */  */
1840    
1841  static BOOL  static BOOL
1842  compile_branch(int *optionsptr, int *brackets, uschar **codeptr,  check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1843    const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,    const uschar *ptr, int options, compile_data *cd)
   int *reqbyteptr, branch_chain *bcptr, compile_data *cd)  
1844  {  {
1845  int repeat_type, op_type;  int next;
 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  
 int bravalue = 0;  
 int greedy_default, greedy_non_default;  
 int firstbyte, reqbyte;  
 int zeroreqbyte, zerofirstbyte;  
 int req_caseopt, reqvary, tempreqvary;  
 int condcount = 0;  
 int options = *optionsptr;  
 int after_manual_callout = 0;  
 register int c;  
 register uschar *code = *codeptr;  
 uschar *tempcode;  
 BOOL inescq = FALSE;  
 BOOL groupsetfirstbyte = FALSE;  
 const uschar *ptr = *ptrptr;  
 const uschar *tempptr;  
 uschar *previous = NULL;  
 uschar *previous_callout = NULL;  
 uschar classbits[32];  
   
 #ifdef SUPPORT_UTF8  
 BOOL class_utf8;  
 BOOL utf8 = (options & PCRE_UTF8) != 0;  
 uschar *class_utf8data;  
 uschar utf8_char[6];  
 #else  
 BOOL utf8 = FALSE;  
 #endif  
1846    
1847  /* Set up the default and non-default settings for greediness */  /* Skip whitespace and comments in extended mode */
   
 greedy_default = ((options & PCRE_UNGREEDY) != 0);  
 greedy_non_default = greedy_default ^ 1;  
1848    
1849  /* Initialize no first byte, no required byte. REQ_UNSET means "no char  if ((options & PCRE_EXTENDED) != 0)
1850  matching encountered yet". It gets changed to REQ_NONE if we hit something that    {
1851  matches a non-fixed char first char; reqbyte just remains unset if we never    for (;;)
1852  find one.      {
1853        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1854        if (*ptr == '#')
1855          {
1856          while (*(++ptr) != 0)
1857            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1858          }
1859        else break;
1860        }
1861      }
1862    
1863  When we hit a repeat whose minimum is zero, we may have to adjust these values  /* If the next item is one that we can handle, get its value. A non-negative
1864  to take the zero repeat into account. This is implemented by setting them to  value is a character, a negative value is an escape value. */
 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual  
 item types that can be repeated set these backoff variables appropriately. */  
1865    
1866  firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;  if (*ptr == '\\')
1867      {
1868      int temperrorcode = 0;
1869      next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1870      if (temperrorcode != 0) return FALSE;
1871      ptr++;    /* Point after the escape sequence */
1872      }
1873    
1874  /* The variable req_caseopt contains either the REQ_CASELESS value or zero,  else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1875  according to the current setting of the caseless flag. REQ_CASELESS is a bit    {
1876  value > 255. It is added into the firstbyte or reqbyte variables to record the  #ifdef SUPPORT_UTF8
1877  case status of the value. This is used only for ASCII characters. */    if (utf8) { GETCHARINC(next, ptr); } else
1878    #endif
1879      next = *ptr++;
1880      }
1881    
1882  req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;  else return FALSE;
1883    
1884  /* Switch on next character until the end of the branch */  /* Skip whitespace and comments in extended mode */
1885    
1886  for (;; ptr++)  if ((options & PCRE_EXTENDED) != 0)
1887      {
1888      for (;;)
1889        {
1890        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1891        if (*ptr == '#')
1892          {
1893          while (*(++ptr) != 0)
1894            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1895          }
1896        else break;
1897        }
1898      }
1899    
1900    /* If the next thing is itself optional, we have to give up. */
1901    
1902    if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1903      return FALSE;
1904    
1905    /* Now compare the next item with the previous opcode. If the previous is a
1906    positive single character match, "item" either contains the character or, if
1907    "item" is greater than 127 in utf8 mode, the character's bytes are in
1908    utf8_char. */
1909    
1910    
1911    /* Handle cases when the next item is a character. */
1912    
1913    if (next >= 0) switch(op_code)
1914      {
1915      case OP_CHAR:
1916    #ifdef SUPPORT_UTF8
1917      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1918    #endif
1919      return item != next;
1920    
1921      /* For CHARNC (caseless character) we must check the other case. If we have
1922      Unicode property support, we can use it to test the other case of
1923      high-valued characters. */
1924    
1925      case OP_CHARNC:
1926    #ifdef SUPPORT_UTF8
1927      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1928    #endif
1929      if (item == next) return FALSE;
1930    #ifdef SUPPORT_UTF8
1931      if (utf8)
1932        {
1933        unsigned int othercase;
1934        if (next < 128) othercase = cd->fcc[next]; else
1935    #ifdef SUPPORT_UCP
1936        othercase = _pcre_ucp_othercase((unsigned int)next);
1937    #else
1938        othercase = NOTACHAR;
1939    #endif
1940        return (unsigned int)item != othercase;
1941        }
1942      else
1943    #endif  /* SUPPORT_UTF8 */
1944      return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
1945    
1946      /* For OP_NOT, "item" must be a single-byte character. */
1947    
1948      case OP_NOT:
1949      if (next < 0) return FALSE;  /* Not a character */
1950      if (item == next) return TRUE;
1951      if ((options & PCRE_CASELESS) == 0) return FALSE;
1952    #ifdef SUPPORT_UTF8
1953      if (utf8)
1954        {
1955        unsigned int othercase;
1956        if (next < 128) othercase = cd->fcc[next]; else
1957    #ifdef SUPPORT_UCP
1958        othercase = _pcre_ucp_othercase(next);
1959    #else
1960        othercase = NOTACHAR;
1961    #endif
1962        return (unsigned int)item == othercase;
1963        }
1964      else
1965    #endif  /* SUPPORT_UTF8 */
1966      return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
1967    
1968      case OP_DIGIT:
1969      return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1970    
1971      case OP_NOT_DIGIT:
1972      return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1973    
1974      case OP_WHITESPACE:
1975      return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1976    
1977      case OP_NOT_WHITESPACE:
1978      return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1979    
1980      case OP_WORDCHAR:
1981      return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1982    
1983      case OP_NOT_WORDCHAR:
1984      return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1985    
1986      case OP_HSPACE:
1987      case OP_NOT_HSPACE:
1988      switch(next)
1989        {
1990        case 0x09:
1991        case 0x20:
1992        case 0xa0:
1993        case 0x1680:
1994        case 0x180e:
1995        case 0x2000:
1996        case 0x2001:
1997        case 0x2002:
1998        case 0x2003:
1999        case 0x2004:
2000        case 0x2005:
2001        case 0x2006:
2002        case 0x2007:
2003        case 0x2008:
2004        case 0x2009:
2005        case 0x200A:
2006        case 0x202f:
2007        case 0x205f:
2008        case 0x3000:
2009        return op_code != OP_HSPACE;
2010        default:
2011        return op_code == OP_HSPACE;
2012        }
2013    
2014      case OP_VSPACE:
2015      case OP_NOT_VSPACE:
2016      switch(next)
2017        {
2018        case 0x0a:
2019        case 0x0b:
2020        case 0x0c:
2021        case 0x0d:
2022        case 0x85:
2023        case 0x2028:
2024        case 0x2029:
2025        return op_code != OP_VSPACE;
2026        default:
2027        return op_code == OP_VSPACE;
2028        }
2029    
2030      default:
2031      return FALSE;
2032      }
2033    
2034    
2035    /* Handle the case when the next item is \d, \s, etc. */
2036    
2037    switch(op_code)
2038      {
2039      case OP_CHAR:
2040      case OP_CHARNC:
2041    #ifdef SUPPORT_UTF8
2042      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2043    #endif
2044      switch(-next)
2045        {
2046        case ESC_d:
2047        return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2048    
2049        case ESC_D:
2050        return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2051    
2052        case ESC_s:
2053        return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2054    
2055        case ESC_S:
2056        return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2057    
2058        case ESC_w:
2059        return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2060    
2061        case ESC_W:
2062        return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2063    
2064        case ESC_h:
2065        case ESC_H:
2066        switch(item)
2067          {
2068          case 0x09:
2069          case 0x20:
2070          case 0xa0:
2071          case 0x1680:
2072          case 0x180e:
2073          case 0x2000:
2074          case 0x2001:
2075          case 0x2002:
2076          case 0x2003:
2077          case 0x2004:
2078          case 0x2005:
2079          case 0x2006:
2080          case 0x2007:
2081          case 0x2008:
2082          case 0x2009:
2083          case 0x200A:
2084          case 0x202f:
2085          case 0x205f:
2086          case 0x3000:
2087          return -next != ESC_h;
2088          default:
2089          return -next == ESC_h;
2090          }
2091    
2092        case ESC_v:
2093        case ESC_V:
2094        switch(item)
2095          {
2096          case 0x0a:
2097          case 0x0b:
2098          case 0x0c:
2099          case 0x0d:
2100          case 0x85:
2101          case 0x2028:
2102          case 0x2029:
2103          return -next != ESC_v;
2104          default:
2105          return -next == ESC_v;
2106          }
2107    
2108        default:
2109        return FALSE;
2110        }
2111    
2112      case OP_DIGIT:
2113      return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2114             next == -ESC_h || next == -ESC_v;
2115    
2116      case OP_NOT_DIGIT:
2117      return next == -ESC_d;
2118    
2119      case OP_WHITESPACE:
2120      return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2121    
2122      case OP_NOT_WHITESPACE:
2123      return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2124    
2125      case OP_HSPACE:
2126      return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2127    
2128      case OP_NOT_HSPACE:
2129      return next == -ESC_h;
2130    
2131      /* Can't have \S in here because VT matches \S (Perl anomaly) */
2132      case OP_VSPACE:
2133      return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2134    
2135      case OP_NOT_VSPACE:
2136      return next == -ESC_v;
2137    
2138      case OP_WORDCHAR:
2139      return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2140    
2141      case OP_NOT_WORDCHAR:
2142      return next == -ESC_w || next == -ESC_d;
2143    
2144      default:
2145      return FALSE;
2146      }
2147    
2148    /* Control does not reach here */
2149    }
2150    
2151    
2152    
2153    /*************************************************
2154    *           Compile one branch                   *
2155    *************************************************/
2156    
2157    /* Scan the pattern, compiling it into a vector. If the options are
2158    changed during the branch, the pointer is used to change the external options
2159    bits. This function is used during the pre-compile phase when we are trying
2160    to find out the amount of memory needed, as well as during the real compile
2161    phase. The value of lengthptr distinguishes the two phases.
2162    
2163    Arguments:
2164      optionsptr     pointer to the option bits
2165      codeptr        points to the pointer to the current code point
2166      ptrptr         points to the current pattern pointer
2167      errorcodeptr   points to error code variable
2168      firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2169      reqbyteptr     set to the last literal character required, else < 0
2170      bcptr          points to current branch chain
2171      cd             contains pointers to tables etc.
2172      lengthptr      NULL during the real compile phase
2173                     points to length accumulator during pre-compile phase
2174    
2175    Returns:         TRUE on success
2176                     FALSE, with *errorcodeptr set non-zero on error
2177    */
2178    
2179    static BOOL
2180    compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2181      int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2182      compile_data *cd, int *lengthptr)
2183    {
2184    int repeat_type, op_type;
2185    int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2186    int bravalue = 0;
2187    int greedy_default, greedy_non_default;
2188    int firstbyte, reqbyte;
2189    int zeroreqbyte, zerofirstbyte;
2190    int req_caseopt, reqvary, tempreqvary;
2191    int options = *optionsptr;
2192    int after_manual_callout = 0;
2193    int length_prevgroup = 0;
2194    register int c;
2195    register uschar *code = *codeptr;
2196    uschar *last_code = code;
2197    uschar *orig_code = code;
2198    uschar *tempcode;
2199    BOOL inescq = FALSE;
2200    BOOL groupsetfirstbyte = FALSE;
2201    const uschar *ptr = *ptrptr;
2202    const uschar *tempptr;
2203    uschar *previous = NULL;
2204    uschar *previous_callout = NULL;
2205    uschar *save_hwm = NULL;
2206    uschar classbits[32];
2207    
2208    #ifdef SUPPORT_UTF8
2209    BOOL class_utf8;
2210    BOOL utf8 = (options & PCRE_UTF8) != 0;
2211    uschar *class_utf8data;
2212    uschar utf8_char[6];
2213    #else
2214    BOOL utf8 = FALSE;
2215    uschar *utf8_char = NULL;
2216    #endif
2217    
2218    #ifdef DEBUG
2219    if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2220    #endif
2221    
2222    /* Set up the default and non-default settings for greediness */
2223    
2224    greedy_default = ((options & PCRE_UNGREEDY) != 0);
2225    greedy_non_default = greedy_default ^ 1;
2226    
2227    /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2228    matching encountered yet". It gets changed to REQ_NONE if we hit something that
2229    matches a non-fixed char first char; reqbyte just remains unset if we never
2230    find one.
2231    
2232    When we hit a repeat whose minimum is zero, we may have to adjust these values
2233    to take the zero repeat into account. This is implemented by setting them to
2234    zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2235    item types that can be repeated set these backoff variables appropriately. */
2236    
2237    firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2238    
2239    /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2240    according to the current setting of the caseless flag. REQ_CASELESS is a bit
2241    value > 255. It is added into the firstbyte or reqbyte variables to record the
2242    case status of the value. This is used only for ASCII characters. */
2243    
2244    req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2245    
2246    /* Switch on next character until the end of the branch */
2247    
2248    for (;; ptr++)
2249    {    {
2250    BOOL negate_class;    BOOL negate_class;
2251    BOOL possessive_quantifier;    BOOL possessive_quantifier;
2252    BOOL is_quantifier;    BOOL is_quantifier;
2253      BOOL is_recurse;
2254      BOOL reset_bracount;
2255    int class_charcount;    int class_charcount;
2256    int class_lastchar;    int class_lastchar;
2257    int newoptions;    int newoptions;
2258    int recno;    int recno;
2259      int refsign;
2260    int skipbytes;    int skipbytes;
2261    int subreqbyte;    int subreqbyte;
2262    int subfirstbyte;    int subfirstbyte;
2263      int terminator;
2264    int mclength;    int mclength;
2265    uschar mcbuffer[8];    uschar mcbuffer[8];
2266    
2267    /* Next byte in the pattern */    /* Get next byte in the pattern */
2268    
2269    c = *ptr;    c = *ptr;
2270    
2271      /* If we are in the pre-compile phase, accumulate the length used for the
2272      previous cycle of this loop. */
2273    
2274      if (lengthptr != NULL)
2275        {
2276    #ifdef DEBUG
2277        if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2278    #endif
2279        if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2280          {
2281          *errorcodeptr = ERR52;
2282          goto FAILED;
2283          }
2284    
2285        /* There is at least one situation where code goes backwards: this is the
2286        case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2287        the class is simply eliminated. However, it is created first, so we have to
2288        allow memory for it. Therefore, don't ever reduce the length at this point.
2289        */
2290    
2291        if (code < last_code) code = last_code;
2292    
2293        /* Paranoid check for integer overflow */
2294    
2295        if (OFLOW_MAX - *lengthptr < code - last_code)
2296          {
2297          *errorcodeptr = ERR20;
2298          goto FAILED;
2299          }
2300    
2301        *lengthptr += code - last_code;
2302        DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2303    
2304        /* If "previous" is set and it is not at the start of the work space, move
2305        it back to there, in order to avoid filling up the work space. Otherwise,
2306        if "previous" is NULL, reset the current code pointer to the start. */
2307    
2308        if (previous != NULL)
2309          {
2310          if (previous > orig_code)
2311            {
2312            memmove(orig_code, previous, code - previous);
2313            code -= previous - orig_code;
2314            previous = orig_code;
2315            }
2316          }
2317        else code = orig_code;
2318    
2319        /* Remember where this code item starts so we can pick up the length
2320        next time round. */
2321    
2322        last_code = code;
2323        }
2324    
2325      /* In the real compile phase, just check the workspace used by the forward
2326      reference list. */
2327    
2328      else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2329        {
2330        *errorcodeptr = ERR52;
2331        goto FAILED;
2332        }
2333    
2334    /* If in \Q...\E, check for the end; if not, we have a literal */    /* If in \Q...\E, check for the end; if not, we have a literal */
2335    
2336    if (inescq && c != 0)    if (inescq && c != 0)
# Line 1623  for (;; ptr++) Line 2345  for (;; ptr++)
2345        {        {
2346        if (previous_callout != NULL)        if (previous_callout != NULL)
2347          {          {
2348          complete_callout(previous_callout, ptr, cd);          if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2349              complete_callout(previous_callout, ptr, cd);
2350          previous_callout = NULL;          previous_callout = NULL;
2351          }          }
2352        if ((options & PCRE_AUTO_CALLOUT) != 0)        if ((options & PCRE_AUTO_CALLOUT) != 0)
# Line 1644  for (;; ptr++) Line 2367  for (;; ptr++)
2367    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2368         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
2369      {      {
2370      complete_callout(previous_callout, ptr, cd);      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2371          complete_callout(previous_callout, ptr, cd);
2372      previous_callout = NULL;      previous_callout = NULL;
2373      }      }
2374    
# Line 1655  for (;; ptr++) Line 2379  for (;; ptr++)
2379      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2380      if (c == '#')      if (c == '#')
2381        {        {
2382        /* The space before the ; is to avoid a warning on a silly compiler        while (*(++ptr) != 0)
2383        on the Macintosh. */          {
2384        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2385        if (c != 0) continue;   /* Else fall through to handle end of string */          }
2386          if (*ptr != 0) continue;
2387    
2388          /* Else fall through to handle end of string */
2389          c = 0;
2390        }        }
2391      }      }
2392    
# Line 1672  for (;; ptr++) Line 2400  for (;; ptr++)
2400    
2401    switch(c)    switch(c)
2402      {      {
2403      /* The branch terminates at end of string, |, or ). */      /* ===================================================================*/
2404        case 0:                        /* The branch terminates at string end */
2405      case 0:      case '|':                      /* or | or ) */
     case '|':  
2406      case ')':      case ')':
2407      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2408      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2409      *codeptr = code;      *codeptr = code;
2410      *ptrptr = ptr;      *ptrptr = ptr;
2411        if (lengthptr != NULL)
2412          {
2413          if (OFLOW_MAX - *lengthptr < code - last_code)
2414            {
2415            *errorcodeptr = ERR20;
2416            goto FAILED;
2417            }
2418          *lengthptr += code - last_code;   /* To include callout length */
2419          DPRINTF((">> end branch\n"));
2420          }
2421      return TRUE;      return TRUE;
2422    
2423    
2424        /* ===================================================================*/
2425      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2426      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2427    
# Line 1711  for (;; ptr++) Line 2450  for (;; ptr++)
2450      *code++ = OP_ANY;      *code++ = OP_ANY;
2451      break;      break;
2452    
2453      /* Character classes. If the included characters are all < 255 in value, we  
2454      build a 32-byte bitmap of the permitted characters, except in the special      /* ===================================================================*/
2455      case where there is only one such character. For negated classes, we build      /* Character classes. If the included characters are all < 256, we build a
2456      the map as usual, then invert it at the end. However, we use a different      32-byte bitmap of the permitted characters, except in the special case
2457      opcode so that data characters > 255 can be handled correctly.      where there is only one such character. For negated classes, we build the
2458        map as usual, then invert it at the end. However, we use a different opcode
2459        so that data characters > 255 can be handled correctly.
2460    
2461      If the class contains characters outside the 0-255 range, a different      If the class contains characters outside the 0-255 range, a different
2462      opcode is compiled. It may optionally have a bit map for characters < 256,      opcode is compiled. It may optionally have a bit map for characters < 256,
# Line 1736  for (;; ptr++) Line 2477  for (;; ptr++)
2477        goto FAILED;        goto FAILED;
2478        }        }
2479    
2480      /* If the first character is '^', set the negation flag and skip it. */      /* If the first character is '^', set the negation flag and skip it. Also,
2481        if the first few characters (either before or after ^) are \Q\E or \E we
2482        skip them too. This makes for compatibility with Perl. */
2483    
2484      if ((c = *(++ptr)) == '^')      negate_class = FALSE;
2485        for (;;)
2486        {        {
       negate_class = TRUE;  
2487        c = *(++ptr);        c = *(++ptr);
2488        }        if (c == '\\')
2489      else          {
2490        {          if (ptr[1] == 'E') ptr++;
2491        negate_class = FALSE;            else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2492                else break;
2493            }
2494          else if (!negate_class && c == '^')
2495            negate_class = TRUE;
2496          else break;
2497        }        }
2498    
2499      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
2500      of just a single character (as long as it's < 256). For higher valued UTF-8      of just a single character (as long as it's < 256). However, for higher
2501      characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
2502    
2503      class_charcount = 0;      class_charcount = 0;
2504      class_lastchar = -1;      class_lastchar = -1;
2505    
2506        /* Initialize the 32-char bit map to all zeros. We build the map in a
2507        temporary bit of memory, in case the class contains only 1 character (less
2508        than 256), because in that case the compiled code doesn't use the bit map.
2509        */
2510    
2511        memset(classbits, 0, 32 * sizeof(uschar));
2512    
2513  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2514      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2515      class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2516  #endif  #endif
2517    
     /* Initialize the 32-char bit map to all zeros. We have to build the  
     map in a temporary bit of store, in case the class contains only 1  
     character (< 256), because in that case the compiled code doesn't use the  
     bit map. */  
   
     memset(classbits, 0, 32 * sizeof(uschar));  
   
2518      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
2519      means that an initial ] is taken as a data character. The first pass      means that an initial ] is taken as a data character. At the start of the
2520      through the regex checked the overall syntax, so we don't need to be very      loop, c contains the first byte of the character. */
     strict here. At the start of the loop, c contains the first byte of the  
     character. */  
2521    
2522      do      if (c != 0) do
2523        {        {
2524          const uschar *oldptr;
2525    
2526  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2527        if (utf8 && c > 127)        if (utf8 && c > 127)
2528          {                           /* Braces are required because the */          {                           /* Braces are required because the */
# Line 1786  for (;; ptr++) Line 2534  for (;; ptr++)
2534    
2535        if (inescq)        if (inescq)
2536          {          {
2537          if (c == '\\' && ptr[1] == 'E')          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2538            {            {
2539            inescq = FALSE;            inescq = FALSE;                   /* Reset literal state */
2540            ptr++;            ptr++;                            /* Skip the 'E' */
2541            continue;            continue;                         /* Carry on with next */
2542            }            }
2543          else goto LONE_SINGLE_CHARACTER;          goto CHECK_RANGE;                   /* Could be range if \E follows */
2544          }          }
2545    
2546        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
# Line 1806  for (;; ptr++) Line 2554  for (;; ptr++)
2554            check_posix_syntax(ptr, &tempptr, cd))            check_posix_syntax(ptr, &tempptr, cd))
2555          {          {
2556          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
2557          int posix_class, i;          int posix_class, taboffset, tabopt;
2558          register const uschar *cbits = cd->cbits;          register const uschar *cbits = cd->cbits;
2559            uschar pbits[32];
2560    
2561          if (ptr[1] != ':')          if (ptr[1] != ':')
2562            {            {
# Line 1836  for (;; ptr++) Line 2585  for (;; ptr++)
2585          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2586            posix_class = 0;            posix_class = 0;
2587    
2588          /* Or into the map we are building up to 3 of the static class          /* We build the bit map for the POSIX class in a chunk of local store
2589          tables, or their negations. The [:blank:] class sets up the same          because we may be adding and subtracting from it, and we don't want to
2590          chars as the [:space:] class (all white space). We remove the vertical          subtract bits that may be in the main map already. At the end we or the
2591          white space chars afterwards. */          result into the bit map that is being built. */
2592    
2593          posix_class *= 3;          posix_class *= 3;
2594          for (i = 0; i < 3; i++)  
2595            /* Copy in the first table (always present) */
2596    
2597            memcpy(pbits, cbits + posix_class_maps[posix_class],
2598              32 * sizeof(uschar));
2599    
2600            /* If there is a second table, add or remove it as required. */
2601    
2602            taboffset = posix_class_maps[posix_class + 1];
2603            tabopt = posix_class_maps[posix_class + 2];
2604    
2605            if (taboffset >= 0)
2606            {            {
2607            BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;            if (tabopt >= 0)
2608            int taboffset = posix_class_maps[posix_class + i];              for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
           if (taboffset < 0) break;  
           if (local_negate)  
             {  
             if (i == 0)  
               for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];  
             else  
               for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];  
             if (blankclass) classbits[1] |= 0x3c;  
             }  
2609            else            else
2610              {              for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];  
             if (blankclass) classbits[1] &= ~0x3c;  
             }  
2611            }            }
2612    
2613            /* Now see if we need to remove any special characters. An option
2614            value of 1 removes vertical space and 2 removes underscore. */
2615    
2616            if (tabopt < 0) tabopt = -tabopt;
2617            if (tabopt == 1) pbits[1] &= ~0x3c;
2618              else if (tabopt == 2) pbits[11] &= 0x7f;
2619    
2620            /* Add the POSIX table or its complement into the main table that is
2621            being built and we are done. */
2622    
2623            if (local_negate)
2624              for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2625            else
2626              for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2627    
2628          ptr = tempptr + 1;          ptr = tempptr + 1;
2629          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
2630          continue;    /* End of POSIX syntax handling */          continue;    /* End of POSIX syntax handling */
2631          }          }
2632    
2633        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
2634        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. The sequence \b is a special
2635        validity in the pre-compiling pass. The sequence \b is a special case.        case. Inside a class (and only there) it is treated as backspace.
2636        Inside a class (and only there) it is treated as backspace. Elsewhere        Elsewhere it marks a word boundary. Other escapes have preset maps ready
2637        it marks a word boundary. Other escapes have preset maps ready to        to 'or' into the one we are building. We assume they have more than one
       or into the one we are building. We assume they have more than one  
2638        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
2639    
2640        if (c == '\\')        if (c == '\\')
2641          {          {
2642          c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2643            if (*errorcodeptr != 0) goto FAILED;
2644    
2645          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2646          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2647            else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2648          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
2649            {            {
2650            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == '\\' && ptr[2] == 'E')
# Line 1895  for (;; ptr++) Line 2659  for (;; ptr++)
2659            {            {
2660            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
2661            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
2662            switch (-c)  
2663              /* Save time by not doing this in the pre-compile phase. */
2664    
2665              if (lengthptr == NULL) switch (-c)
2666              {              {
2667              case ESC_d:              case ESC_d:
2668              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
# Line 1923  for (;; ptr++) Line 2690  for (;; ptr++)
2690              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2691              continue;              continue;
2692    
2693  #ifdef SUPPORT_UCP              case ESC_E: /* Perl ignores an orphan \E */
             case ESC_p:  
             case ESC_P:  
               {  
               BOOL negated;  
               int property = get_ucp(&ptr, &negated, errorcodeptr);  
               if (property < 0) goto FAILED;  
               class_utf8 = TRUE;  
               *class_utf8data++ = ((-c == ESC_p) != negated)?  
                 XCL_PROP : XCL_NOTPROP;  
               *class_utf8data++ = property;  
               class_charcount -= 2;   /* Not a < 256 character */  
               }  
2694              continue;              continue;
 #endif  
2695    
2696              /* Unrecognized escapes are faulted if PCRE is running in its              default:    /* Not recognized; fall through */
2697              strict mode. By default, for compatibility with Perl, they are              break;      /* Need "default" setting to stop compiler warning. */
             treated as literals. */  
   
             default:  
             if ((options & PCRE_EXTRA) != 0)  
               {  
               *errorcodeptr = ERR7;  
               goto FAILED;  
               }  
             c = *ptr;              /* The final character */  
             class_charcount -= 2;  /* Undo the default count from above */  
2698              }              }
           }  
2699    
2700          /* Fall through if we have a single character (c >= 0). This may be            /* In the pre-compile phase, just do the recognition. */
         > 256 in UTF-8 mode. */  
2701    
2702          }   /* End of backslash handling */            else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2703                       c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2704    
2705        /* A single character may be followed by '-' to form a range. However,            /* We need to deal with \H, \h, \V, and \v in both phases because
2706              they use extra memory. */
2707    
2708              if (-c == ESC_h)
2709                {
2710                SETBIT(classbits, 0x09); /* HT */
2711                SETBIT(classbits, 0x20); /* SPACE */
2712                SETBIT(classbits, 0xa0); /* NBSP */
2713    #ifdef SUPPORT_UTF8
2714                if (utf8)
2715                  {
2716                  class_utf8 = TRUE;
2717                  *class_utf8data++ = XCL_SINGLE;
2718                  class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2719                  *class_utf8data++ = XCL_SINGLE;
2720                  class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2721                  *class_utf8data++ = XCL_RANGE;
2722                  class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2723                  class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2724                  *class_utf8data++ = XCL_SINGLE;
2725                  class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2726                  *class_utf8data++ = XCL_SINGLE;
2727                  class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2728                  *class_utf8data++ = XCL_SINGLE;
2729                  class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2730                  }
2731    #endif
2732                continue;
2733                }
2734    
2735              if (-c == ESC_H)
2736                {
2737                for (c = 0; c < 32; c++)
2738                  {
2739                  int x = 0xff;
2740                  switch (c)
2741                    {
2742                    case 0x09/8: x ^= 1 << (0x09%8); break;
2743                    case 0x20/8: x ^= 1 << (0x20%8); break;
2744                    case 0xa0/8: x ^= 1 << (0xa0%8); break;
2745                    default: break;
2746                    }
2747                  classbits[c] |= x;
2748                  }
2749    
2750    #ifdef SUPPORT_UTF8
2751                if (utf8)
2752                  {
2753                  class_utf8 = TRUE;
2754                  *class_utf8data++ = XCL_RANGE;
2755                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2756                  class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2757                  *class_utf8data++ = XCL_RANGE;
2758                  class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2759                  class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2760                  *class_utf8data++ = XCL_RANGE;
2761                  class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2762                  class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2763                  *class_utf8data++ = XCL_RANGE;
2764                  class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2765                  class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2766                  *class_utf8data++ = XCL_RANGE;
2767                  class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2768                  class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2769                  *class_utf8data++ = XCL_RANGE;
2770                  class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2771                  class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2772                  *class_utf8data++ = XCL_RANGE;
2773                  class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2774                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2775                  }
2776    #endif
2777                continue;
2778                }
2779    
2780              if (-c == ESC_v)
2781                {
2782                SETBIT(classbits, 0x0a); /* LF */
2783                SETBIT(classbits, 0x0b); /* VT */
2784                SETBIT(classbits, 0x0c); /* FF */
2785                SETBIT(classbits, 0x0d); /* CR */
2786                SETBIT(classbits, 0x85); /* NEL */
2787    #ifdef SUPPORT_UTF8
2788                if (utf8)
2789                  {
2790                  class_utf8 = TRUE;
2791                  *class_utf8data++ = XCL_RANGE;
2792                  class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2793                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2794                  }
2795    #endif
2796                continue;
2797                }
2798    
2799              if (-c == ESC_V)
2800                {
2801                for (c = 0; c < 32; c++)
2802                  {
2803                  int x = 0xff;
2804                  switch (c)
2805                    {
2806                    case 0x0a/8: x ^= 1 << (0x0a%8);
2807                                 x ^= 1 << (0x0b%8);
2808                                 x ^= 1 << (0x0c%8);
2809                                 x ^= 1 << (0x0d%8);
2810                                 break;
2811                    case 0x85/8: x ^= 1 << (0x85%8); break;
2812                    default: break;
2813                    }
2814                  classbits[c] |= x;
2815                  }
2816    
2817    #ifdef SUPPORT_UTF8
2818                if (utf8)
2819                  {
2820                  class_utf8 = TRUE;
2821                  *class_utf8data++ = XCL_RANGE;
2822                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2823                  class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2824                  *class_utf8data++ = XCL_RANGE;
2825                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2826                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2827                  }
2828    #endif
2829                continue;
2830                }
2831    
2832              /* We need to deal with \P and \p in both phases. */
2833    
2834    #ifdef SUPPORT_UCP
2835              if (-c == ESC_p || -c == ESC_P)
2836                {
2837                BOOL negated;
2838                int pdata;
2839                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2840                if (ptype < 0) goto FAILED;
2841                class_utf8 = TRUE;
2842                *class_utf8data++ = ((-c == ESC_p) != negated)?
2843                  XCL_PROP : XCL_NOTPROP;
2844                *class_utf8data++ = ptype;
2845                *class_utf8data++ = pdata;
2846                class_charcount -= 2;   /* Not a < 256 character */
2847                continue;
2848                }
2849    #endif
2850              /* Unrecognized escapes are faulted if PCRE is running in its
2851              strict mode. By default, for compatibility with Perl, they are
2852              treated as literals. */
2853    
2854              if ((options & PCRE_EXTRA) != 0)
2855                {
2856                *errorcodeptr = ERR7;
2857                goto FAILED;
2858                }
2859    
2860              class_charcount -= 2;  /* Undo the default count from above */
2861              c = *ptr;              /* Get the final character and fall through */
2862              }
2863    
2864            /* Fall through if we have a single character (c >= 0). This may be
2865            greater than 256 in UTF-8 mode. */
2866    
2867            }   /* End of backslash handling */
2868    
2869          /* A single character may be followed by '-' to form a range. However,
2870        Perl does not permit ']' to be the end of the range. A '-' character        Perl does not permit ']' to be the end of the range. A '-' character
2871        here is treated as a literal. */        at the end is treated as a literal. Perl ignores orphaned \E sequences
2872          entirely. The code for handling \Q and \E is messy. */
2873    
2874          CHECK_RANGE:
2875          while (ptr[1] == '\\' && ptr[2] == 'E')
2876            {
2877            inescq = FALSE;
2878            ptr += 2;
2879            }
2880    
2881          oldptr = ptr;
2882    
2883        if (ptr[1] == '-' && ptr[2] != ']')        if (!inescq && ptr[1] == '-')
2884          {          {
2885          int d;          int d;
2886          ptr += 2;          ptr += 2;
2887            while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2888    
2889            /* If we hit \Q (not followed by \E) at this point, go into escaped
2890            mode. */
2891    
2892            while (*ptr == '\\' && ptr[1] == 'Q')
2893              {
2894              ptr += 2;
2895              if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2896              inescq = TRUE;
2897              break;
2898              }
2899    
2900            if (*ptr == 0 || (!inescq && *ptr == ']'))
2901              {
2902              ptr = oldptr;
2903              goto LONE_SINGLE_CHARACTER;
2904              }
2905    
2906  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2907          if (utf8)          if (utf8)
# Line 1981  for (;; ptr++) Line 2916  for (;; ptr++)
2916          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2917          in such circumstances. */          in such circumstances. */
2918    
2919          if (d == '\\')          if (!inescq && d == '\\')
2920            {            {
2921            const uschar *oldptr = ptr;            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2922            d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);            if (*errorcodeptr != 0) goto FAILED;
2923    
2924            /* \b is backspace; \X is literal X; any other special means the '-'            /* \b is backspace; \X is literal X; \R is literal R; any other
2925            was literal */            special means the '-' was literal */
2926    
2927            if (d < 0)            if (d < 0)
2928              {              {
2929              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = '\b';
2930              else if (d == -ESC_X) d = 'X'; else              else if (d == -ESC_X) d = 'X';
2931                else if (d == -ESC_R) d = 'R'; else
2932                {                {
2933                ptr = oldptr - 2;                ptr = oldptr;
2934                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2935                }                }
2936              }              }
2937            }            }
2938    
2939          /* The check that the two values are in the correct order happens in          /* Check that the two values are in the correct order. Optimize
2940          the pre-pass. Optimize one-character ranges */          one-character ranges */
2941    
2942            if (d < c)
2943              {
2944              *errorcodeptr = ERR8;
2945              goto FAILED;
2946              }
2947    
2948          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2949    
# Line 2022  for (;; ptr++) Line 2964  for (;; ptr++)
2964  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2965            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
2966              {              {
2967              int occ, ocd;              unsigned int occ, ocd;
2968              int cc = c;              unsigned int cc = c;
2969              int origd = d;              unsigned int origd = d;
2970              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
2971                {                {
2972                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= (unsigned int)c &&
2973                      ocd <= (unsigned int)d)
2974                    continue;                          /* Skip embedded ranges */
2975    
2976                if (occ < c  && ocd >= c - 1)        /* Extend the basic range */                if (occ < (unsigned int)c  &&
2977                      ocd >= (unsigned int)c - 1)      /* Extend the basic range */
2978                  {                                  /* if there is overlap,   */                  {                                  /* if there is overlap,   */
2979                  c = occ;                           /* noting that if occ < c */                  c = occ;                           /* noting that if occ < c */
2980                  continue;                          /* we can't have ocd > d  */                  continue;                          /* we can't have ocd > d  */
2981                  }                                  /* because a subrange is  */                  }                                  /* because a subrange is  */
2982                if (ocd > d && occ <= d + 1)         /* always shorter than    */                if (ocd > (unsigned int)d &&
2983                      occ <= (unsigned int)d + 1)      /* always shorter than    */
2984                  {                                  /* the basic range.       */                  {                                  /* the basic range.       */
2985                  d = ocd;                  d = ocd;
2986                  continue;                  continue;
# Line 2082  for (;; ptr++) Line 3028  for (;; ptr++)
3028          ranges that lie entirely within 0-127 when there is UCP support; else          ranges that lie entirely within 0-127 when there is UCP support; else
3029          for partial ranges without UCP support. */          for partial ranges without UCP support. */
3030    
3031          for (; c <= d; c++)          class_charcount += d - c + 1;
3032            class_lastchar = d;
3033    
3034            /* We can save a bit of time by skipping this in the pre-compile. */
3035    
3036            if (lengthptr == NULL) for (; c <= d; c++)
3037            {            {
3038            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
3039            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
# Line 2090  for (;; ptr++) Line 3041  for (;; ptr++)
3041              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
3042              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
3043              }              }
           class_charcount++;                /* in case a one-char range */  
           class_lastchar = c;  
3044            }            }
3045    
3046          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
# Line 2115  for (;; ptr++) Line 3064  for (;; ptr++)
3064  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3065          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
3066            {            {
3067            int chartype;            unsigned int othercase;
3068            int othercase;            if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
           if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 &&  
                othercase > 0)  
3069              {              {
3070              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
3071              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 2143  for (;; ptr++) Line 3090  for (;; ptr++)
3090          }          }
3091        }        }
3092    
3093      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached. This "while" is the end of the "do" above. */
     loop. This "while" is the end of the "do" above. */  
3094    
3095      while ((c = *(++ptr)) != ']' || inescq);      while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3096    
3097        if (c == 0)                          /* Missing terminating ']' */
3098          {
3099          *errorcodeptr = ERR6;
3100          goto FAILED;
3101          }
3102    
3103      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
3104      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
# Line 2210  for (;; ptr++) Line 3162  for (;; ptr++)
3162    
3163      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
3164      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode. If there are no characters < 256,
3165      we can omit the bitmap. */      we can omit the bitmap in the actual compiled code. */
3166    
3167  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3168      if (class_utf8)      if (class_utf8)
# Line 2220  for (;; ptr++) Line 3172  for (;; ptr++)
3172        code += LINK_SIZE;        code += LINK_SIZE;
3173        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT : 0;
3174    
3175        /* If the map is required, install it, and move on to the end of        /* If the map is required, move up the extra data to make room for it;
3176        the extra data */        otherwise just move the code pointer to the end of the extra data. */
3177    
3178        if (class_charcount > 0)        if (class_charcount > 0)
3179          {          {
3180          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
3181            memmove(code + 32, code, class_utf8data - code);
3182          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
3183          code = class_utf8data;          code = class_utf8data + 32;
         }  
   
       /* If the map is not required, slide down the extra data. */  
   
       else  
         {  
         int len = class_utf8data - (code + 33);  
         memmove(code + 1, code + 33, len);  
         code += len + 1;  
3184          }          }
3185          else code = class_utf8data;
3186    
3187        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
3188    
# Line 2254  for (;; ptr++) Line 3199  for (;; ptr++)
3199      if (negate_class)      if (negate_class)
3200        {        {
3201        *code++ = OP_NCLASS;        *code++ = OP_NCLASS;
3202        for (c = 0; c < 32; c++) code[c] = ~classbits[c];        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3203            for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3204        }        }
3205      else      else
3206        {        {
# Line 2264  for (;; ptr++) Line 3210  for (;; ptr++)
3210      code += 32;      code += 32;
3211      break;      break;
3212    
3213    
3214        /* ===================================================================*/
3215      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3216      has been tested above. */      has been tested above. */
3217    
# Line 2331  for (;; ptr++) Line 3279  for (;; ptr++)
3279        }        }
3280      else repeat_type = greedy_default;      else repeat_type = greedy_default;
3281    
     /* If previous was a recursion, we need to wrap it inside brackets so that  
     it can be replicated if necessary. */  
   
     if (*previous == OP_RECURSE)  
       {  
       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);  
       code += 1 + LINK_SIZE;  
       *previous = OP_BRA;  
       PUT(previous, 1, code - previous);  
       *code = OP_KET;  
       PUT(code, 1, code - previous);  
       code += 1 + LINK_SIZE;  
       }  
   
3282      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
3283      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
3284      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
# Line 2378  for (;; ptr++) Line 3312  for (;; ptr++)
3312          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3313          }          }
3314    
3315          /* If the repetition is unlimited, it pays to see if the next thing on
3316          the line is something that cannot possibly match this character. If so,
3317          automatically possessifying this item gains some performance in the case
3318          where the match fails. */
3319    
3320          if (!possessive_quantifier &&
3321              repeat_max < 0 &&
3322              check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3323                options, cd))
3324            {
3325            repeat_type = 0;    /* Force greedy */
3326            possessive_quantifier = TRUE;
3327            }
3328    
3329        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3330        }        }
3331    
3332      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
3333      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
3334      character repeats by setting opt_type to add a suitable offset into      character repeats by setting opt_type to add a suitable offset into
3335      repeat_type. OP_NOT is currently used only for single-byte chars. */      repeat_type. We can also test for auto-possessification. OP_NOT is
3336        currently used only for single-byte chars. */
3337    
3338      else if (*previous == OP_NOT)      else if (*previous == OP_NOT)
3339        {        {
3340        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3341        c = previous[1];        c = previous[1];
3342          if (!possessive_quantifier &&
3343              repeat_max < 0 &&
3344              check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3345            {
3346            repeat_type = 0;    /* Force greedy */
3347            possessive_quantifier = TRUE;
3348            }
3349        goto OUTPUT_SINGLE_REPEAT;        goto OUTPUT_SINGLE_REPEAT;
3350        }        }
3351    
# Line 2403  for (;; ptr++) Line 3359  for (;; ptr++)
3359      else if (*previous < OP_EODN)      else if (*previous < OP_EODN)
3360        {        {
3361        uschar *oldcode;        uschar *oldcode;
3362        int prop_type;        int prop_type, prop_value;
3363        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3364        c = *previous;        c = *previous;
3365    
3366          if (!possessive_quantifier &&
3367              repeat_max < 0 &&
3368              check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3369            {
3370            repeat_type = 0;    /* Force greedy */
3371            possessive_quantifier = TRUE;
3372            }
3373    
3374        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
3375        prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?        if (*previous == OP_PROP || *previous == OP_NOTPROP)
3376          previous[1] : -1;          {
3377            prop_type = previous[1];
3378            prop_value = previous[2];
3379            }
3380          else prop_type = prop_value = -1;
3381    
3382        oldcode = code;        oldcode = code;
3383        code = previous;                  /* Usually overwrite previous item */        code = previous;                  /* Usually overwrite previous item */
# Line 2443  for (;; ptr++) Line 3411  for (;; ptr++)
3411          }          }
3412    
3413        /* A repeat minimum of 1 is optimized into some special cases. If the        /* A repeat minimum of 1 is optimized into some special cases. If the
3414        maximum is unlimited, we use OP_PLUS. Otherwise, the original item it        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3415        left in place and, if the maximum is greater than 1, we use OP_UPTO with        left in place and, if the maximum is greater than 1, we use OP_UPTO with
3416        one less than the maximum. */        one less than the maximum. */
3417    
# Line 2470  for (;; ptr++) Line 3438  for (;; ptr++)
3438    
3439          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3440          we have to insert the character for the previous code. For a repeated          we have to insert the character for the previous code. For a repeated
3441          Unicode property match, there is an extra byte that defines the          Unicode property match, there are two extra bytes that define the
3442          required property. In UTF-8 mode, long characters have their length in          required property. In UTF-8 mode, long characters have their length in
3443          c, with the 0x80 bit as a flag. */          c, with the 0x80 bit as a flag. */
3444    
# Line 2486  for (;; ptr++) Line 3454  for (;; ptr++)
3454  #endif  #endif
3455              {              {
3456              *code++ = c;              *code++ = c;
3457              if (prop_type >= 0) *code++ = prop_type;              if (prop_type >= 0)
3458                  {
3459                  *code++ = prop_type;
3460                  *code++ = prop_value;
3461                  }
3462              }              }
3463            *code++ = OP_STAR + repeat_type;            *code++ = OP_STAR + repeat_type;
3464            }            }
3465    
3466          /* Else insert an UPTO if the max is greater than the min, again          /* Else insert an UPTO if the max is greater than the min, again
3467          preceded by the character, for the previously inserted code. */          preceded by the character, for the previously inserted code. If the
3468            UPTO is just for 1 instance, we can use QUERY instead. */
3469    
3470          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
3471            {            {
# Line 2505  for (;; ptr++) Line 3478  for (;; ptr++)
3478            else            else
3479  #endif  #endif
3480            *code++ = c;            *code++ = c;
3481            if (prop_type >= 0) *code++ = prop_type;            if (prop_type >= 0)
3482                {
3483                *code++ = prop_type;
3484                *code++ = prop_value;
3485                }
3486            repeat_max -= repeat_min;            repeat_max -= repeat_min;
3487            *code++ = OP_UPTO + repeat_type;  
3488            PUT2INC(code, 0, repeat_max);            if (repeat_max == 1)
3489                {
3490                *code++ = OP_QUERY + repeat_type;
3491                }
3492              else
3493                {
3494                *code++ = OP_UPTO + repeat_type;
3495                PUT2INC(code, 0, repeat_max);
3496                }
3497            }            }
3498          }          }
3499    
# Line 2524  for (;; ptr++) Line 3509  for (;; ptr++)
3509  #endif  #endif
3510        *code++ = c;        *code++ = c;
3511    
3512        /* For a repeated Unicode property match, there is an extra byte that        /* For a repeated Unicode property match, there are two extra bytes that
3513        defines the required property. */        define the required property. */
3514    
3515  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3516        if (prop_type >= 0) *code++ = prop_type;        if (prop_type >= 0)
3517            {
3518            *code++ = prop_type;
3519            *code++ = prop_value;
3520            }
3521  #endif  #endif
3522        }        }
3523    
# Line 2571  for (;; ptr++) Line 3560  for (;; ptr++)
3560      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
3561      cases. */      cases. */
3562    
3563      else if (*previous >= OP_BRA || *previous == OP_ONCE ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3564               *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
3565        {        {
3566        register int i;        register int i;
3567        int ketoffset = 0;        int ketoffset = 0;
3568        int len = code - previous;        int len = code - previous;
3569        uschar *bralink = NULL;        uschar *bralink = NULL;
3570    
3571          /* Repeating a DEFINE group is pointless */
3572    
3573          if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3574            {
3575            *errorcodeptr = ERR55;
3576            goto FAILED;
3577            }
3578    
3579        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
3580        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
3581        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 2613  for (;; ptr++) Line 3610  for (;; ptr++)
3610          /* If the maximum is 1 or unlimited, we just have to stick in the          /* If the maximum is 1 or unlimited, we just have to stick in the
3611          BRAZERO and do no more at this point. However, we do need to adjust          BRAZERO and do no more at this point. However, we do need to adjust
3612          any OP_RECURSE calls inside the group that refer to the group itself or          any OP_RECURSE calls inside the group that refer to the group itself or
3613          any internal group, because the offset is from the start of the whole          any internal or forward referenced group, because the offset is from
3614          regex. Temporarily terminate the pattern while doing this. */          the start of the whole regex. Temporarily terminate the pattern while
3615            doing this. */
3616    
3617          if (repeat_max <= 1)          if (repeat_max <= 1)
3618            {            {
3619            *code = OP_END;            *code = OP_END;
3620            adjust_recurse(previous, 1, utf8, cd);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3621            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3622            code++;            code++;
3623            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2637  for (;; ptr++) Line 3635  for (;; ptr++)
3635            {            {
3636            int offset;            int offset;
3637            *code = OP_END;            *code = OP_END;
3638            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3639            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
3640            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
3641            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2657  for (;; ptr++) Line 3655  for (;; ptr++)
3655        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
3656        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
3657        copies that we need. If we set a first char from the group, and didn't        copies that we need. If we set a first char from the group, and didn't
3658        set a required char, copy the latter from the former. */        set a required char, copy the latter from the former. If there are any
3659          forward reference subroutine calls in the group, there will be entries on
3660          the workspace list; replicate these with an appropriate increment. */
3661    
3662        else        else
3663          {          {
3664          if (repeat_min > 1)          if (repeat_min > 1)
3665            {            {
3666            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;            /* In the pre-compile phase, we don't actually do the replication. We
3667            for (i = 1; i < repeat_min; i++)            just adjust the length as if we had. Do some paranoid checks for
3668              potential integer overflow. */
3669    
3670              if (lengthptr != NULL)
3671                {
3672                int delta = (repeat_min - 1)*length_prevgroup;
3673                if ((double)(repeat_min - 1)*(double)length_prevgroup >
3674                                                                (double)INT_MAX ||
3675                    OFLOW_MAX - *lengthptr < delta)
3676                  {
3677                  *errorcodeptr = ERR20;
3678                  goto FAILED;
3679                  }
3680                *lengthptr += delta;
3681                }
3682    
3683              /* This is compiling for real */
3684    
3685              else
3686              {              {
3687              memcpy(code, previous, len);              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3688              code += len;              for (i = 1; i < repeat_min; i++)
3689                  {
3690                  uschar *hc;
3691                  uschar *this_hwm = cd->hwm;
3692                  memcpy(code, previous, len);
3693                  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3694                    {
3695                    PUT(cd->hwm, 0, GET(hc, 0) + len);
3696                    cd->hwm += LINK_SIZE;
3697                    }
3698                  save_hwm = this_hwm;
3699                  code += len;
3700                  }
3701              }              }
3702            }            }
3703    
3704          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
3705          }          }
3706    
# Line 2677  for (;; ptr++) Line 3708  for (;; ptr++)
3708        the maximum is limited, it replicates the group in a nested fashion,        the maximum is limited, it replicates the group in a nested fashion,
3709        remembering the bracket starts on a stack. In the case of a zero minimum,        remembering the bracket starts on a stack. In the case of a zero minimum,
3710        the first one was set up above. In all cases the repeat_max now specifies        the first one was set up above. In all cases the repeat_max now specifies
3711        the number of additional copies needed. */        the number of additional copies needed. Again, we must remember to
3712          replicate entries on the forward reference list. */
3713    
3714        if (repeat_max >= 0)        if (repeat_max >= 0)
3715          {          {
3716          for (i = repeat_max - 1; i >= 0; i--)          /* In the pre-compile phase, we don't actually do the replication. We
3717            just adjust the length as if we had. For each repetition we must add 1
3718            to the length for BRAZERO and for all but the last repetition we must
3719            add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3720            paranoid checks to avoid integer overflow. */
3721    
3722            if (lengthptr != NULL && repeat_max > 0)
3723              {
3724              int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3725                          2 - 2*LINK_SIZE;   /* Last one doesn't nest */
3726              if ((double)repeat_max *
3727                    (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3728                      > (double)INT_MAX ||
3729                  OFLOW_MAX - *lengthptr < delta)
3730                {
3731                *errorcodeptr = ERR20;
3732                goto FAILED;
3733                }
3734              *lengthptr += delta;
3735              }
3736    
3737            /* This is compiling for real */
3738    
3739            else for (i = repeat_max - 1; i >= 0; i--)
3740            {            {
3741              uschar *hc;
3742              uschar *this_hwm = cd->hwm;
3743    
3744            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
3745    
3746            /* All but the final copy start a new nesting, maintaining the            /* All but the final copy start a new nesting, maintaining the
# Line 2698  for (;; ptr++) Line 3756  for (;; ptr++)
3756              }              }
3757    
3758            memcpy(code, previous, len);            memcpy(code, previous, len);
3759              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3760                {
3761                PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3762                cd->hwm += LINK_SIZE;
3763                }
3764              save_hwm = this_hwm;
3765            code += len;            code += len;
3766            }            }
3767    
# Line 2720  for (;; ptr++) Line 3784  for (;; ptr++)
3784        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
3785        can't just offset backwards from the current code point, because we        can't just offset backwards from the current code point, because we
3786        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
3787        correct offset was computed above. */        correct offset was computed above.
3788    
3789          Then, when we are doing the actual compile phase, check to see whether
3790          this group is a non-atomic one that could match an empty string. If so,
3791          convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3792          that runtime checking can be done. [This check is also applied to
3793          atomic groups at runtime, but in a different way.] */
3794    
3795        else code[-ketoffset] = OP_KETRMAX + repeat_type;        else
3796            {
3797            uschar *ketcode = code - ketoffset;
3798            uschar *bracode = ketcode - GET(ketcode, 1);
3799            *ketcode = OP_KETRMAX + repeat_type;
3800            if (lengthptr == NULL && *bracode != OP_ONCE)
3801              {
3802              uschar *scode = bracode;
3803              do
3804                {
3805                if (could_be_empty_branch(scode, ketcode, utf8))
3806                  {
3807                  *bracode += OP_SBRA - OP_BRA;
3808                  break;
3809                  }
3810                scode += GET(scode, 1);
3811                }
3812              while (*scode == OP_ALT);
3813              }
3814            }
3815        }        }
3816    
3817      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 2733  for (;; ptr++) Line 3822  for (;; ptr++)
3822        goto FAILED;        goto FAILED;
3823        }        }
3824    
3825      /* If the character following a repeat is '+', we wrap the entire repeated      /* If the character following a repeat is '+', or if certain optimization
3826      item inside OP_ONCE brackets. This is just syntactic sugar, taken from      tests above succeeded, possessive_quantifier is TRUE. For some of the
3827      Sun's Java package. The repeated item starts at tempcode, not at previous,      simpler opcodes, there is a special alternative opcode for this. For
3828      which might be the first part of a string whose (former) last char we      anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3829      repeated. However, we don't support '+' after a greediness '?'. */      The '+' notation is just syntactic sugar, taken from Sun's Java package,
3830        but the special opcodes can optimize it a bit. The repeated item starts at
3831        tempcode, not at previous, which might be the first part of a string whose
3832        (former) last char we repeated.
3833    
3834        Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3835        an 'upto' may follow. We skip over an 'exact' item, and then test the
3836        length of what remains before proceeding. */
3837    
3838      if (possessive_quantifier)      if (possessive_quantifier)
3839        {        {
3840        int len = code - tempcode;        int len;
3841        memmove(tempcode + 1+LINK_SIZE, tempcode, len);        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3842        code += 1 + LINK_SIZE;            *tempcode == OP_NOTEXACT)
3843        len += 1 + LINK_SIZE;          tempcode += _pcre_OP_lengths[*tempcode];
3844        tempcode[0] = OP_ONCE;        len = code - tempcode;
3845        *code++ = OP_KET;        if (len > 0) switch (*tempcode)
3846        PUTINC(code, 0, len);          {
3847        PUT(tempcode, 1, len);          case OP_STAR:  *tempcode = OP_POSSTAR; break;
3848            case OP_PLUS:  *tempcode = OP_POSPLUS; break;
3849            case OP_QUERY: *tempcode = OP_POSQUERY; break;
3850            case OP_UPTO:  *tempcode = OP_POSUPTO; break;
3851    
3852            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
3853            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
3854            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3855            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
3856    
3857            case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
3858            case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
3859            case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3860            case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
3861    
3862            default:
3863            memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3864            code += 1 + LINK_SIZE;
3865            len += 1 + LINK_SIZE;
3866            tempcode[0] = OP_ONCE;
3867            *code++ = OP_KET;
3868            PUTINC(code, 0, len);
3869            PUT(tempcode, 1, len);
3870            break;
3871            }
3872        }        }
3873    
3874      In all cases we no longer have a previous item. We also set the      In all cases we no longer have a previous item. We also set the
# Line 2761  for (;; ptr++) Line 3881  for (;; ptr++)
3881      break;      break;
3882    
3883    
3884      /* Start of nested bracket sub-expression, or comment or lookahead or      /* ===================================================================*/
3885      lookbehind or option setting or condition. First deal with special things      /* Start of nested parenthesized sub-expression, or comment or lookahead or
3886      that can come after a bracket; all are introduced by ?, and the appearance      lookbehind or option setting or condition or all the other extended
3887      of any of them means that this is not a referencing group. They were      parenthesis forms.  */
     checked for validity in the first pass over the string, so we don't have to  
     check for syntax errors here.  */  
3888    
3889      case '(':      case '(':
3890      newoptions = options;      newoptions = options;
3891      skipbytes = 0;      skipbytes = 0;
3892        bravalue = OP_CBRA;
3893        save_hwm = cd->hwm;
3894        reset_bracount = FALSE;
3895    
3896        /* First deal with various "verbs" that can be introduced by '*'. */
3897    
3898        if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
3899          {
3900          int i, namelen;
3901          const uschar *name = ++ptr;
3902          previous = NULL;
3903          while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
3904          if (*ptr == ':')
3905            {
3906            *errorcodeptr = ERR59;   /* Not supported */
3907            goto FAILED;
3908            }
3909          if (*ptr != ')')
3910            {
3911            *errorcodeptr = ERR60;
3912            goto FAILED;
3913            }
3914          namelen = ptr - name;
3915          for (i = 0; i < verbcount; i++)
3916            {
3917            if (namelen == verbs[i].len &&
3918                strncmp((char *)name, verbs[i].name, namelen) == 0)
3919              {
3920              *code = verbs[i].op;
3921              if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
3922              break;
3923              }
3924            }
3925          if (i < verbcount) continue;
3926          *errorcodeptr = ERR60;
3927          goto FAILED;
3928          }
3929    
3930        /* Deal with the extended parentheses; all are introduced by '?', and the
3931        appearance of any of them means that this is not a capturing group. */
3932    
3933      if (*(++ptr) == '?')      else if (*ptr == '?')
3934        {        {
3935        int set, unset;        int i, set, unset, namelen;
3936        int *optset;        int *optset;
3937          const uschar *name;
3938          uschar *slot;
3939    
3940        switch (*(++ptr))        switch (*(++ptr))
3941          {          {
3942          case '#':                 /* Comment; skip to ket */          case '#':                 /* Comment; skip to ket */
3943          ptr++;          ptr++;
3944          while (*ptr != ')') ptr++;          while (*ptr != 0 && *ptr != ')') ptr++;
3945            if (*ptr == 0)
3946              {
3947              *errorcodeptr = ERR18;
3948              goto FAILED;
3949              }
3950          continue;          continue;
3951    
3952          case ':':                 /* Non-extracting bracket */  
3953            /* ------------------------------------------------------------ */
3954            case '|':                 /* Reset capture count for each branch */
3955            reset_bracount = TRUE;
3956            /* Fall through */
3957    
3958            /* ------------------------------------------------------------ */
3959            case ':':                 /* Non-capturing bracket */
3960          bravalue = OP_BRA;          bravalue = OP_BRA;
3961          ptr++;          ptr++;
3962          break;          break;
3963    
3964    
3965            /* ------------------------------------------------------------ */
3966          case '(':          case '(':
3967          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
3968    
3969          /* Condition to test for recursion */          /* A condition can be an assertion, a number (referring to a numbered
3970            group), a name (referring to a named group), or 'R', referring to
3971            recursion. R<digits> and R&name are also permitted for recursion tests.
3972    
3973            There are several syntaxes for testing a named group: (?(name)) is used
3974            by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3975    
3976            There are two unfortunate ambiguities, caused by history. (a) 'R' can
3977            be the recursive thing or the name 'R' (and similarly for 'R' followed
3978            by digits), and (b) a number could be a name that consists of digits.
3979            In both cases, we look for a name first; if not found, we try the other
3980            cases. */
3981    
3982            /* For conditions that are assertions, check the syntax, and then exit
3983            the switch. This will take control down to where bracketed groups,
3984            including assertions, are processed. */
3985    
3986            if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3987              break;
3988    
3989            /* Most other conditions use OP_CREF (a couple change to OP_RREF
3990            below), and all need to skip 3 bytes at the start of the group. */
3991    
3992          if (ptr[1] == 'R')          code[1+LINK_SIZE] = OP_CREF;
3993            skipbytes = 3;
3994            refsign = -1;
3995    
3996            /* Check for a test for recursion in a named group. */
3997    
3998            if (ptr[1] == 'R' && ptr[2] == '&')
3999            {            {
4000            code[1+LINK_SIZE] = OP_CREF;            terminator = -1;
4001            PUT2(code, 2+LINK_SIZE, CREF_RECURSE);            ptr += 2;
4002            skipbytes = 3;            code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
           ptr += 3;  
4003            }            }
4004    
4005          /* Condition to test for a numbered subpattern match. We know that          /* Check for a test for a named group's having been set, using the Perl
4006          if a digit follows ( then there will just be digits until ) because          syntax (?(<name>) or (?('name') */
         the syntax was checked in the first pass. */  
4007    
4008          else if ((digitab[ptr[1]] && ctype_digit) != 0)          else if (ptr[1] == '<')
4009            {            {
4010            int condref;                 /* Don't amalgamate; some compilers */            terminator = '>';
           condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */  
           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';  
           if (condref == 0)  
             {  
             *errorcodeptr = ERR35;  
             goto FAILED;  
             }  
4011            ptr++;            ptr++;
           code[1+LINK_SIZE] = OP_CREF;  
           PUT2(code, 2+LINK_SIZE, condref);  
           skipbytes = 3;  
4012            }            }
4013          /* For conditions that are assertions, we just fall through, having          else if (ptr[1] == '\'')
4014          set bravalue above. */            {
4015          break;            terminator = '\'';
4016              ptr++;
4017          case '=':                 /* Positive lookahead */            }
4018          bravalue = OP_ASSERT;          else
4019          ptr++;            {
4020          break;            terminator = 0;
4021              if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4022              }
4023    
4024          case '!':                 /* Negative lookahead */          /* We now expect to read a name; anything else is an error */
         bravalue = OP_ASSERT_NOT;  
         ptr++;  
         break;  
4025    
4026          case '<':                 /* Lookbehinds */          if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
         switch (*(++ptr))  
4027            {            {
4028            case '=':               /* Positive lookbehind */            ptr += 1;  /* To get the right offset */
4029            bravalue = OP_ASSERTBACK;            *errorcodeptr = ERR28;
4030            ptr++;            goto FAILED;
4031            break;            }
4032    
4033            case '!':               /* Negative lookbehind */          /* Read the name, but also get it as a number if it's all digits */
4034            bravalue = OP_ASSERTBACK_NOT;  
4035            recno = 0;
4036            name = ++ptr;
4037            while ((cd->ctypes[*ptr] & ctype_word) != 0)
4038              {
4039              if (recno >= 0)
4040                recno = ((digitab[*ptr] & ctype_digit) != 0)?
4041                  recno * 10 + *ptr - '0' : -1;
4042            ptr++;            ptr++;
           break;  
4043            }            }
4044          break;          namelen = ptr - name;
4045    
4046          case '>':                 /* One-time brackets */          if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4047          bravalue = OP_ONCE;            {
4048          ptr++;            ptr--;      /* Error offset */
4049          break;            *errorcodeptr = ERR26;
4050              goto FAILED;
4051              }
4052    
4053          case 'C':                 /* Callout - may be followed by digits; */          /* Do no further checking in the pre-compile phase. */
4054          previous_callout = code;  /* Save for later completion */  
4055          after_manual_callout = 1; /* Skip one item before completing */          if (lengthptr != NULL) break;
4056          *code++ = OP_CALLOUT;     /* Already checked that the terminating */  
4057            {                       /* closing parenthesis is present. */          /* In the real compile we do the work of looking for the actual
4058            int n = 0;          reference. If the string started with "+" or "-" we require the rest to
4059            while ((digitab[*(++ptr)] & ctype_digit) != 0)          be digits, in which case recno will be set. */
4060              n = n * 10 + *ptr - '0';  
4061            if (n > 255)          if (refsign > 0)
4062              {
4063              if (recno <= 0)
4064              {              {
4065              *errorcodeptr = ERR38;              *errorcodeptr = ERR58;
4066                goto FAILED;
4067                }
4068              if (refsign == '-')
4069                {
4070                recno = cd->bracount - recno + 1;
4071                if (recno <= 0)
4072                  {
4073                  *errorcodeptr = ERR15;
4074                  goto FAILED;
4075                  }
4076                }
4077              else recno += cd->bracount;
4078              PUT2(code, 2+LINK_SIZE, recno);
4079              break;
4080              }
4081    
4082            /* Otherwise (did not start with "+" or "-"), start by looking for the
4083            name. */
4084    
4085            slot = cd->name_table;
4086            for (i = 0; i < cd->names_found; i++)
4087              {
4088              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4089              slot += cd->name_entry_size;
4090              }
4091    
4092            /* Found a previous named subpattern */
4093    
4094            if (i < cd->names_found)
4095              {
4096              recno = GET2(slot, 0);
4097              PUT2(code, 2+LINK_SIZE, recno);
4098              }
4099    
4100            /* Search the pattern for a forward reference */
4101    
4102            else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4103                            (options & PCRE_EXTENDED) != 0)) > 0)
4104              {
4105              PUT2(code, 2+LINK_SIZE, i);
4106              }
4107    
4108            /* If terminator == 0 it means that the name followed directly after
4109            the opening parenthesis [e.g. (?(abc)...] and in this case there are
4110            some further alternatives to try. For the cases where terminator != 0
4111            [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4112            now checked all the possibilities, so give an error. */
4113    
4114            else if (terminator != 0)
4115              {
4116              *errorcodeptr = ERR15;
4117              goto FAILED;
4118              }
4119    
4120            /* Check for (?(R) for recursion. Allow digits after R to specify a
4121            specific group number. */
4122    
4123            else if (*name == 'R')
4124              {
4125              recno = 0;
4126              for (i = 1; i < namelen; i++)
4127                {
4128                if ((digitab[name[i]] & ctype_digit) == 0)
4129                  {
4130                  *errorcodeptr = ERR15;
4131                  goto FAILED;
4132                  }
4133                recno = recno * 10 + name[i] - '0';
4134                }
4135              if (recno == 0) recno = RREF_ANY;
4136              code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
4137              PUT2(code, 2+LINK_SIZE, recno);
4138              }
4139    
4140            /* Similarly, check for the (?(DEFINE) "condition", which is always
4141            false. */
4142    
4143            else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4144              {
4145              code[1+LINK_SIZE] = OP_DEF;
4146              skipbytes = 1;
4147              }
4148    
4149            /* Check for the "name" actually being a subpattern number. */
4150    
4151            else if (recno > 0)
4152              {
4153              PUT2(code, 2+LINK_SIZE, recno);
4154              }
4155    
4156            /* Either an unidentified subpattern, or a reference to (?(0) */
4157    
4158            else
4159              {
4160              *errorcodeptr = (recno == 0)? ERR35: ERR15;
4161              goto FAILED;
4162              }
4163            break;
4164    
4165    
4166            /* ------------------------------------------------------------ */
4167            case '=':                 /* Positive lookahead */
4168            bravalue = OP_ASSERT;
4169            ptr++;
4170            break;
4171    
4172    
4173            /* ------------------------------------------------------------ */
4174            case '!':                 /* Negative lookahead */
4175            ptr++;
4176            if (*ptr == ')')          /* Optimize (?!) */
4177              {
4178              *code++ = OP_FAIL;
4179              previous = NULL;
4180              continue;
4181              }
4182            bravalue = OP_ASSERT_NOT;
4183            break;
4184    
4185    
4186            /* ------------------------------------------------------------ */
4187            case '<':                 /* Lookbehind or named define */
4188            switch (ptr[1])
4189              {
4190              case '=':               /* Positive lookbehind */
4191              bravalue = OP_ASSERTBACK;
4192              ptr += 2;
4193              break;
4194    
4195              case '!':               /* Negative lookbehind */
4196              bravalue = OP_ASSERTBACK_NOT;
4197              ptr += 2;
4198              break;
4199    
4200              default:                /* Could be name define, else bad */
4201              if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4202              ptr++;                  /* Correct offset for error */
4203              *errorcodeptr = ERR24;
4204              goto FAILED;
4205              }
4206            break;
4207    
4208    
4209            /* ------------------------------------------------------------ */
4210            case '>':                 /* One-time brackets */
4211            bravalue = OP_ONCE;
4212            ptr++;
4213            break;
4214    
4215    
4216            /* ------------------------------------------------------------ */
4217            case 'C':                 /* Callout - may be followed by digits; */
4218            previous_callout = code;  /* Save for later completion */
4219            after_manual_callout = 1; /* Skip one item before completing */
4220            *code++ = OP_CALLOUT;
4221              {
4222              int n = 0;
4223              while ((digitab[*(++ptr)] & ctype_digit) != 0)
4224                n = n * 10 + *ptr - '0';
4225              if (*ptr != ')')
4226                {
4227                *errorcodeptr = ERR39;
4228                goto FAILED;
4229                }
4230              if (n > 255)
4231                {
4232                *errorcodeptr = ERR38;
4233              goto FAILED;              goto FAILED;
4234              }              }
4235            *code++ = n;            *code++ = n;
# Line 2876  for (;; ptr++) Line 4240  for (;; ptr++)
4240          previous = NULL;          previous = NULL;
4241          continue;          continue;
4242    
4243          case 'P':                 /* Named subpattern handling */  
4244          if (*(++ptr) == '<')      /* Definition */          /* ------------------------------------------------------------ */
4245            case 'P':                 /* Python-style named subpattern handling */
4246            if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
4247            {            {
4248            int i, namelen;            is_recurse = *ptr == '>';
4249            uschar *slot = cd->name_table;            terminator = ')';
4250            const uschar *name;     /* Don't amalgamate; some compilers */            goto NAMED_REF_OR_RECURSE;
4251            name = ++ptr;           /* grumble at autoincrement in declaration */            }
4252            else if (*ptr != '<')    /* Test for Python-style definition */
4253              {
4254              *errorcodeptr = ERR41;
4255              goto FAILED;
4256              }
4257            /* Fall through to handle (?P< as (?< is handled */
4258    
           while (*ptr++ != '>');  
           namelen = ptr - name - 1;  
4259    
4260            for (i = 0; i < cd->names_found; i++)          /* ------------------------------------------------------------ */
4261            DEFINE_NAME:    /* Come here from (?< handling */
4262            case '\'':
4263              {
4264              terminator = (*ptr == '<')? '>' : '\'';
4265              name = ++ptr;
4266    
4267              while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4268              namelen = ptr - name;
4269    
4270              /* In the pre-compile phase, just do a syntax check. */
4271    
4272              if (lengthptr != NULL)
4273              {              {
4274              int crc = memcmp(name, slot+2, namelen);              if (*ptr != terminator)
4275              if (crc == 0)                {
4276                  *errorcodeptr = ERR42;
4277                  goto FAILED;
4278                  }
4279                if (cd->names_found >= MAX_NAME_COUNT)
4280                  {
4281                  *errorcodeptr = ERR49;
4282                  goto FAILED;
4283                  }
4284                if (namelen + 3 > cd->name_entry_size)
4285                {                {
4286                if (slot[2+namelen] == 0)                cd->name_entry_size = namelen + 3;
4287                  if (namelen > MAX_NAME_SIZE)
4288                  {                  {
4289                  *errorcodeptr = ERR43;                  *errorcodeptr = ERR48;
4290                  goto FAILED;                  goto FAILED;
4291                  }                  }
               crc = -1;             /* Current name is substring */  
4292                }                }
4293              if (crc < 0)              }
4294    
4295              /* In the real compile, create the entry in the table */
4296    
4297              else
4298                {
4299                slot = cd->name_table;
4300                for (i = 0; i < cd->names_found; i++)
4301                {                {
4302                memmove(slot + cd->name_entry_size, slot,                int crc = memcmp(name, slot+2, namelen);
4303                  (cd->names_found - i) * cd->name_entry_size);                if (crc == 0)
4304                break;                  {
4305                    if (slot[2+namelen] == 0)
4306                      {
4307                      if ((options & PCRE_DUPNAMES) == 0)
4308                        {
4309                        *errorcodeptr = ERR43;
4310                        goto FAILED;
4311                        }
4312                      }
4313                    else crc = -1;      /* Current name is substring */
4314                    }
4315                  if (crc < 0)
4316                    {
4317                    memmove(slot + cd->name_entry_size, slot,
4318                      (cd->names_found - i) * cd->name_entry_size);
4319                    break;
4320                    }
4321                  slot += cd->name_entry_size;
4322                }                }
             slot += cd->name_entry_size;  
             }  
4323    
4324            PUT2(slot, 0, *brackets + 1);              PUT2(slot, 0, cd->bracount + 1);
4325            memcpy(slot + 2, name, namelen);              memcpy(slot + 2, name, namelen);
4326            slot[2+namelen] = 0;              slot[2+namelen] = 0;
4327            cd->names_found++;              }
           goto NUMBERED_GROUP;  
4328            }            }
4329    
4330          if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */          /* In both cases, count the number of names we've encountered. */
4331    
4332            ptr++;                    /* Move past > or ' */
4333            cd->names_found++;
4334            goto NUMBERED_GROUP;
4335    
4336    
4337            /* ------------------------------------------------------------ */
4338            case '&':                 /* Perl recursion/subroutine syntax */
4339            terminator = ')';
4340            is_recurse = TRUE;
4341            /* Fall through */
4342    
4343            /* We come here from the Python syntax above that handles both
4344            references (?P=name) and recursion (?P>name), as well as falling
4345            through from the Perl recursion syntax (?&name). */
4346    
4347            NAMED_REF_OR_RECURSE:
4348            name = ++ptr;
4349            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4350            namelen = ptr - name;
4351    
4352            /* In the pre-compile phase, do a syntax check and set a dummy
4353            reference number. */
4354    
4355            if (lengthptr != NULL)
4356            {            {
4357            int i, namelen;            if (*ptr != terminator)
4358            int type = *ptr++;              {
4359            const uschar *name = ptr;              *errorcodeptr = ERR42;
4360            uschar *slot = cd->name_table;              goto FAILED;
4361                }
4362              if (namelen > MAX_NAME_SIZE)
4363                {
4364                *errorcodeptr = ERR48;
4365                goto FAILED;
4366                }
4367              recno = 0;
4368              }
4369    
4370            while (*ptr != ')') ptr++;          /* In the real compile, seek the name in the table */
           namelen = ptr - name;  
4371    
4372            else
4373              {
4374              slot = cd->name_table;
4375            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
4376              {              {
4377              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4378              slot += cd->name_entry_size;              slot += cd->name_entry_size;
4379              }              }
4380            if (i >= cd->names_found)  
4381              if (i < cd->names_found)         /* Back reference */
4382                {
4383                recno = GET2(slot, 0);
4384                }
4385              else if ((recno =                /* Forward back reference */
4386                        find_parens(ptr, cd->bracount, name, namelen,
4387                          (options & PCRE_EXTENDED) != 0)) <= 0)
4388              {              {
4389              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
4390              goto FAILED;              goto FAILED;
4391              }              }
4392              }
4393    
4394            recno = GET2(slot, 0);          /* In both phases, we can now go to the code that handles numerical
4395            recursion or backreferences. */
           if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */  
   
           /* Back reference */  
4396    
4397            previous = code;          if (is_recurse) goto HANDLE_RECURSION;
4398            *code++ = OP_REF;            else goto HANDLE_REFERENCE;
           PUT2INC(code, 0, recno);  
           cd->backref_map |= (recno < 32)? (1 << recno) : 1;  
           if (recno > cd->top_backref) cd->top_backref = recno;  
           continue;  
           }  
4399    
         /* Should never happen */  
         break;  
4400    
4401          case 'R':                 /* Pattern recursion */          /* ------------------------------------------------------------ */
4402            case 'R':                 /* Recursion */
4403          ptr++;                    /* Same as (?0)      */          ptr++;                    /* Same as (?0)      */
4404          /* Fall through */          /* Fall through */
4405    
         /* Recursion or "subroutine" call */  
4406    
4407          case '0': case '1': case '2': case '3': case '4':          /* ------------------------------------------------------------ */
4408          case '5': case '6': case '7': case '8': case '9':          case '-': case '+':
4409            case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4410            case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4411            {            {
4412            const uschar *called;            const uschar *called;
4413    
4414              if ((refsign = *ptr) == '+') ptr++;
4415              else if (refsign == '-')
4416                {
4417                if ((digitab[ptr[1]] & ctype_digit) == 0)
4418                  goto OTHER_CHAR_AFTER_QUERY;
4419                ptr++;
4420                }
4421    
4422            recno = 0;            recno = 0;
4423            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4424              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4425    
4426              if (*ptr != ')')
4427                {
4428                *errorcodeptr = ERR29;
4429                goto FAILED;
4430                }
4431    
4432              if (refsign == '-')
4433                {
4434                if (recno == 0)
4435                  {
4436                  *errorcodeptr = ERR58;
4437                  goto FAILED;
4438                  }
4439                recno = cd->bracount - recno + 1;
4440                if (recno <= 0)
4441                  {
4442                  *errorcodeptr = ERR15;
4443                  goto FAILED;
4444                  }
4445                }
4446              else if (refsign == '+')
4447                {
4448                if (recno == 0)
4449                  {
4450                  *errorcodeptr = ERR58;
4451                  goto FAILED;
4452                  }
4453                recno += cd->bracount;
4454                }
4455    
4456            /* Come here from code above that handles a named recursion */            /* Come here from code above that handles a named recursion */
4457    
4458            HANDLE_RECURSION:            HANDLE_RECURSION:
4459    
4460            previous = code;            previous = code;
4461              called = cd->start_code;
4462    
4463            /* Find the bracket that is being referenced. Temporarily end the            /* When we are actually compiling, find the bracket that is being
4464            regex in case it doesn't exist. */            referenced. Temporarily end the regex in case it doesn't exist before
4465              this point. If we end up with a forward reference, first check that
4466              the bracket does occur later so we can give the error (and position)
4467              now. Then remember this forward reference in the workspace so it can
4468              be filled in at the end. */
4469    
4470            *code = OP_END;            if (lengthptr == NULL)
           called = (recno == 0)?  
             cd->start_code : find_bracket(cd->start_code, utf8, recno);  
   
           if (called == NULL)  
4471              {              {
4472              *errorcodeptr = ERR15;              *code = OP_END;
4473              goto FAILED;              if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
             }  
4474    
4475            /* If the subpattern is still open, this is a recursive call. We              /* Forward reference */
           check to see if this is a left recursion that could loop for ever,  
           and diagnose that case. */  
4476    
4477            if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))              if (called == NULL)
4478              {                {
4479