/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 81 by nigel, Sat Feb 24 21:40:59 2007 UTC | revision 206 by ph10, Fri Aug 3 14:53:04 2007 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2005 University of Cambridge             Copyright (c) 1997-2007 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45    #ifdef HAVE_CONFIG_H
46    #include <config.h>
47    #endif
48    
49    #define NLBLOCK cd             /* Block containing newline information */
50    #define PSSTART start_pattern  /* Field containing processed string start */
51    #define PSEND   end_pattern    /* Field containing processed string end */
52    
53  #include "pcre_internal.h"  #include "pcre_internal.h"
54    
55    
56    /* When DEBUG is defined, we need the pcre_printint() function, which is also
57    used by pcretest. DEBUG is not defined when building a production library. */
58    
59    #ifdef DEBUG
60    #include "pcre_printint.src"
61    #endif
62    
63    
64    /* Macro for setting individual bits in class bitmaps. */
65    
66    #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
67    
68    /* Maximum length value to check against when making sure that the integer that
69    holds the compiled pattern length does not overflow. We make it a bit less than
70    INT_MAX to allow for adding in group terminating bytes, so that we don't have
71    to check them every time. */
72    
73    #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76  /*************************************************  /*************************************************
77  *      Code parameters and static tables         *  *      Code parameters and static tables         *
78  *************************************************/  *************************************************/
79    
80  /* Maximum number of items on the nested bracket stacks at compile time. This  /* This value specifies the size of stack workspace that is used during the
81  applies to the nesting of all kinds of parentheses. It does not limit  first pre-compile phase that determines how much memory is required. The regex
82  un-nested, non-capturing parentheses. This number can be made bigger if  is partly compiled into this space, but the compiled parts are discarded as
83  necessary - it is used to dimension one int and one unsigned char vector at  soon as they can be, so that hopefully there will never be an overrun. The code
84  compile time. */  does, however, check for an overrun. The largest amount I've seen used is 218,
85    so this number is very generous.
86    
87    The same workspace is used during the second, actual compile phase for
88    remembering forward references to groups so that they can be filled in at the
89    end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90    is 4 there is plenty of room. */
91    
92  #define BRASTACK_SIZE 200  #define COMPILE_WORK_SIZE (4096)
93    
94    
95  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 63  are simple data values; negative values Line 97  are simple data values; negative values
97  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
98  is invalid. */  is invalid. */
99    
100  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
101  static const short int escapes[] = {  static const short int escapes[] = {
102       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
103       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
105       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */  -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
106  -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
107  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
108     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
109       0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */  -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
110  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
111       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
112  };  };
113    
114  #else         /* This is the "abnormal" table for EBCDIC systems */  #else           /* This is the "abnormal" table for EBCDIC systems */
115  static const short int escapes[] = {  static const short int escapes[] = {
116  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
117  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 87  static const short int escapes[] = { Line 121  static const short int escapes[] = {
121  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
122  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
123  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
124  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
125  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
126  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
127  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
128  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
129  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
130  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
131  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
132  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
133  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
134  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
135  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
136  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
137  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
138  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
# Line 107  static const short int escapes[] = { Line 141  static const short int escapes[] = {
141    
142    
143  /* Tables of names of POSIX character classes and their lengths. The list is  /* Tables of names of POSIX character classes and their lengths. The list is
144  terminated by a zero length entry. The first three must be alpha, upper, lower,  terminated by a zero length entry. The first three must be alpha, lower, upper,
145  as this is assumed for handling case independence. */  as this is assumed for handling case independence. */
146    
147  static const char *const posix_names[] = {  static const char *const posix_names[] = {
# Line 118  static const char *const posix_names[] = Line 152  static const char *const posix_names[] =
152  static const uschar posix_name_lengths[] = {  static const uschar posix_name_lengths[] = {
153    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
154    
155  /* Table of class bit maps for each POSIX class; up to three may be combined  /* Table of class bit maps for each POSIX class. Each class is formed from a
156  to form the class. The table for [:blank:] is dynamically modified to remove  base map, with an optional addition or removal of another map. Then, for some
157  the vertical space characters. */  classes, there is some additional tweaking: for [:blank:] the vertical space
158    characters are removed, and for [:alpha:] and [:alnum:] the underscore
159    character is removed. The triples in the table consist of the base map offset,
160    second map offset or -1 if no second map, and a non-negative value for map
161    addition or a negative value for map subtraction (if there are two maps). The
162    absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
163    remove vertical space characters, 2 => remove underscore. */
164    
165  static const int posix_class_maps[] = {  static const int posix_class_maps[] = {
166    cbit_lower, cbit_upper, -1,             /* alpha */    cbit_word,  cbit_digit, -2,             /* alpha */
167    cbit_lower, -1,         -1,             /* lower */    cbit_lower, -1,          0,             /* lower */
168    cbit_upper, -1,         -1,             /* upper */    cbit_upper, -1,          0,             /* upper */
169    cbit_digit, cbit_lower, cbit_upper,     /* alnum */    cbit_word,  -1,          2,             /* alnum - word without underscore */
170    cbit_print, cbit_cntrl, -1,             /* ascii */    cbit_print, cbit_cntrl,  0,             /* ascii */
171    cbit_space, -1,         -1,             /* blank - a GNU extension */    cbit_space, -1,          1,             /* blank - a GNU extension */
172    cbit_cntrl, -1,         -1,             /* cntrl */    cbit_cntrl, -1,          0,             /* cntrl */
173    cbit_digit, -1,         -1,             /* digit */    cbit_digit, -1,          0,             /* digit */
174    cbit_graph, -1,         -1,             /* graph */    cbit_graph, -1,          0,             /* graph */
175    cbit_print, -1,         -1,             /* print */    cbit_print, -1,          0,             /* print */
176    cbit_punct, -1,         -1,             /* punct */    cbit_punct, -1,          0,             /* punct */
177    cbit_space, -1,         -1,             /* space */    cbit_space, -1,          0,             /* space */
178    cbit_word,  -1,         -1,             /* word - a Perl extension */    cbit_word,  -1,          0,             /* word - a Perl extension */
179    cbit_xdigit,-1,         -1              /* xdigit */    cbit_xdigit,-1,          0              /* xdigit */
180  };  };
181    
182    
183    #define STRING(a)  # a
184    #define XSTRING(s) STRING(s)
185    
186  /* The texts of compile-time error messages. These are "char *" because they  /* The texts of compile-time error messages. These are "char *" because they
187  are passed to the outside world. */  are passed to the outside world. Do not ever re-use any error number, because
188    they are documented. Always add a new error instead. Messages marked DEAD below
189    are no longer used. */
190    
191  static const char *error_texts[] = {  static const char *error_texts[] = {
192    "no error",    "no error",
# Line 156  static const char *error_texts[] = { Line 201  static const char *error_texts[] = {
201    "range out of order in character class",    "range out of order in character class",
202    "nothing to repeat",    "nothing to repeat",
203    /* 10 */    /* 10 */
204    "operand of unlimited repeat could match the empty string",    "operand of unlimited repeat could match the empty string",  /** DEAD **/
205    "internal error: unexpected repeat",    "internal error: unexpected repeat",
206    "unrecognized character after (?",    "unrecognized character after (?",
207    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class",
# Line 166  static const char *error_texts[] = { Line 211  static const char *error_texts[] = {
211    "erroffset passed as NULL",    "erroffset passed as NULL",
212    "unknown option bit(s) set",    "unknown option bit(s) set",
213    "missing ) after comment",    "missing ) after comment",
214    "parentheses nested too deeply",    "parentheses nested too deeply",  /** DEAD **/
215    /* 20 */    /* 20 */
216    "regular expression too large",    "regular expression is too large",
217    "failed to get memory",    "failed to get memory",
218    "unmatched parentheses",    "unmatched parentheses",
219    "internal error: code overflow",    "internal error: code overflow",
220    "unrecognized character after (?<",    "unrecognized character after (?<",
221    /* 25 */    /* 25 */
222    "lookbehind assertion is not fixed length",    "lookbehind assertion is not fixed length",
223    "malformed number after (?(",    "malformed number or name after (?(",
224    "conditional group contains more than two branches",    "conditional group contains more than two branches",
225    "assertion expected after (?(",    "assertion expected after (?(",
226    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )",
227    /* 30 */    /* 30 */
228    "unknown POSIX class name",    "unknown POSIX class name",
229    "POSIX collating elements are not supported",    "POSIX collating elements are not supported",
230    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support",
231    "spare error",    "spare error",  /** DEAD **/
232    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large",
233    /* 35 */    /* 35 */
234    "invalid condition (?(0)",    "invalid condition (?(0)",
# Line 194  static const char *error_texts[] = { Line 239  static const char *error_texts[] = {
239    /* 40 */    /* 40 */
240    "recursive call could loop indefinitely",    "recursive call could loop indefinitely",
241    "unrecognized character after (?P",    "unrecognized character after (?P",
242    "syntax error after (?P",    "syntax error in subpattern name (missing terminator)",
243    "two named groups have the same name",    "two named subpatterns have the same name",
244    "invalid UTF-8 string",    "invalid UTF-8 string",
245    /* 45 */    /* 45 */
246    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled",
247    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence",
248    "unknown property name after \\P or \\p"    "unknown property name after \\P or \\p",
249      "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
250      "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
251      /* 50 */
252      "repeated subpattern is too long",    /** DEAD **/
253      "octal value is greater than \\377 (not in UTF-8 mode)",
254      "internal error: overran compiling workspace",
255      "internal error: previously-checked referenced subpattern not found",
256      "DEFINE group contains more than one branch",
257      /* 55 */
258      "repeating a DEFINE group is not allowed",
259      "inconsistent NEWLINE options",
260      "\\g is not followed by a braced name or an optionally braced non-zero number",
261      "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
262  };  };
263    
264    
# Line 220  For convenience, we use the same bit def Line 278  For convenience, we use the same bit def
278    
279  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
280    
281  #if !EBCDIC    /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
282  static const unsigned char digitab[] =  static const unsigned char digitab[] =
283    {    {
284    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 256  static const unsigned char digitab[] = Line 314  static const unsigned char digitab[] =
314    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
315    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
316    
317  #else          /* This is the "abnormal" case, for EBCDIC systems */  #else           /* This is the "abnormal" case, for EBCDIC systems */
318  static const unsigned char digitab[] =  static const unsigned char digitab[] =
319    {    {
320    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 270  static const unsigned char digitab[] = Line 328  static const unsigned char digitab[] =
328    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
329    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
330    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
331    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88-     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
332    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
333    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
334    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
# Line 304  static const unsigned char ebcdic_charta Line 362  static const unsigned char ebcdic_charta
362    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
363    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
364    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
365    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88-  */    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
366    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
367    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
368    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
# Line 331  static const unsigned char ebcdic_charta Line 389  static const unsigned char ebcdic_charta
389  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
390    
391  static BOOL  static BOOL
392    compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
393      int *, int *, branch_chain *, compile_data *);      int *, int *, branch_chain *, compile_data *, int *);
394    
395    
396    
# Line 342  static BOOL Line 400  static BOOL
400    
401  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
402  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
403  encodes one of the more complicated things such as \d. When UTF-8 is enabled,  encodes one of the more complicated things such as \d. A backreference to group
404  a positive value greater than 255 may be returned. On entry, ptr is pointing at  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
405  the \. On exit, it is on the final character of the escape sequence.  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
406    ptr is pointing at the \. On exit, it is on the final character of the escape
407    sequence.
408    
409  Arguments:  Arguments:
410    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
# Line 362  static int Line 422  static int
422  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
423    int options, BOOL isclass)    int options, BOOL isclass)
424  {  {
425  const uschar *ptr = *ptrptr;  BOOL utf8 = (options & PCRE_UTF8) != 0;
426    const uschar *ptr = *ptrptr + 1;
427  int c, i;  int c, i;
428    
429    GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
430    ptr--;                            /* Set pointer back to the last byte */
431    
432  /* If backslash is at the end of the pattern, it's an error. */  /* If backslash is at the end of the pattern, it's an error. */
433    
 c = *(++ptr);  
434  if (c == 0) *errorcodeptr = ERR1;  if (c == 0) *errorcodeptr = ERR1;
435    
436  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
437  a table. A non-zero result is something that can be returned immediately.  a table. A non-zero result is something that can be returned immediately.
438  Otherwise further processing may be required. */  Otherwise further processing may be required. */
439    
440  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
441  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < '0' || c > 'z') {}                           /* Not alphameric */
442  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - '0']) != 0) c = i;
443    
444  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
445  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
446  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
447  #endif  #endif
# Line 388  else if ((i = escapes[c - 0x48]) != 0) Line 451  else if ((i = escapes[c - 0x48]) != 0)
451  else  else
452    {    {
453    const uschar *oldptr;    const uschar *oldptr;
454      BOOL braced, negated;
455    
456    switch (c)    switch (c)
457      {      {
458      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
# Line 401  else Line 466  else
466      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
467      break;      break;
468    
469        /* \g must be followed by a number, either plain or braced. If positive, it
470        is an absolute backreference. If negative, it is a relative backreference.
471        This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
472        reference to a named group. This is part of Perl's movement towards a
473        unified syntax for back references. As this is synonymous with \k{name}, we
474        fudge it up by pretending it really was \k. */
475    
476        case 'g':
477        if (ptr[1] == '{')
478          {
479          const uschar *p;
480          for (p = ptr+2; *p != 0 && *p != '}'; p++)
481            if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
482          if (*p != 0 && *p != '}')
483            {
484            c = -ESC_k;
485            break;
486            }
487          braced = TRUE;
488          ptr++;
489          }
490        else braced = FALSE;
491    
492        if (ptr[1] == '-')
493          {
494          negated = TRUE;
495          ptr++;
496          }
497        else negated = FALSE;
498    
499        c = 0;
500        while ((digitab[ptr[1]] & ctype_digit) != 0)
501          c = c * 10 + *(++ptr) - '0';
502    
503        if (c == 0 || (braced && *(++ptr) != '}'))
504          {
505          *errorcodeptr = ERR57;
506          return 0;
507          }
508    
509        if (negated)
510          {
511          if (c > bracount)
512            {
513            *errorcodeptr = ERR15;
514            return 0;
515            }
516          c = bracount - (c - 1);
517          }
518    
519        c = -(ESC_REF + c);
520        break;
521    
522      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
523      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
524      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 442  else Line 560  else
560        }        }
561    
562      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
563      larger first octal digit. */      larger first octal digit. The original code used just to take the least
564        significant 8 bits of octal numbers (I think this is what early Perls used
565        to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
566        than 3 octal digits. */
567    
568      case '0':      case '0':
569      c -= '0';      c -= '0';
570      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
571          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
572      c &= 255;     /* Take least significant 8 bits */      if (!utf8 && c > 255) *errorcodeptr = ERR51;
573      break;      break;
574    
575      /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number      /* \x is complicated. \x{ddd} is a character number which can be greater
576      which can be greater than 0xff, but only if the ddd are hex digits. */      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
577        treated as a data character. */
578    
579      case 'x':      case 'x':
580  #ifdef SUPPORT_UTF8      if (ptr[1] == '{')
     if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)  
581        {        {
582        const uschar *pt = ptr + 2;        const uschar *pt = ptr + 2;
583        register int count = 0;        int count = 0;
584    
585        c = 0;        c = 0;
586        while ((digitab[*pt] & ctype_xdigit) != 0)        while ((digitab[*pt] & ctype_xdigit) != 0)
587          {          {
588          int cc = *pt++;          register int cc = *pt++;
589            if (c == 0 && cc == '0') continue;     /* Leading zeroes */
590          count++;          count++;
591  #if !EBCDIC    /* ASCII coding */  
592    #ifndef EBCDIC  /* ASCII coding */
593          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= 'a') cc -= 32;               /* Convert to upper case */
594          c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
595  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
596          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
597          c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
598  #endif  #endif
599          }          }
600    
601        if (*pt == '}')        if (*pt == '}')
602          {          {
603          if (c < 0 || count > 8) *errorcodeptr = ERR34;          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
604          ptr = pt;          ptr = pt;
605          break;          break;
606          }          }
607    
608        /* If the sequence of hex digits does not end with '}', then we don't        /* If the sequence of hex digits does not end with '}', then we don't
609        recognize this construct; fall through to the normal \x handling. */        recognize this construct; fall through to the normal \x handling. */
610        }        }
 #endif  
611    
612      /* Read just a single hex char */      /* Read just a single-byte hex-defined char */
613    
614      c = 0;      c = 0;
615      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
616        {        {
617        int cc;                               /* Some compilers don't like ++ */        int cc;                               /* Some compilers don't like ++ */
618        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                        /* in initializers */
619  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
620        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= 'a') cc -= 32;              /* Convert to upper case */
621        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
622  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
623        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= 'z') cc += 64;              /* Convert to upper case */
624        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
625  #endif  #endif
626        }        }
627      break;      break;
628    
629      /* Other special escapes not starting with a digit are straightforward */      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
630        This coding is ASCII-specific, but then the whole concept of \cx is
631        ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
632    
633      case 'c':      case 'c':
634      c = *(++ptr);      c = *(++ptr);
# Line 511  else Line 638  else
638        return 0;        return 0;
639        }        }
640    
641      /* A letter is upper-cased; then the 0x40 bit is flipped. This coding  #ifndef EBCDIC  /* ASCII coding */
     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.  
     (However, an EBCDIC equivalent has now been added.) */  
   
 #if !EBCDIC    /* ASCII coding */  
642      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= 'a' && c <= 'z') c -= 32;
643      c ^= 0x40;      c ^= 0x40;
644  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
645      if (c >= 'a' && c <= 'z') c += 64;      if (c >= 'a' && c <= 'z') c += 64;
646      c ^= 0xC0;      c ^= 0xC0;
647  #endif  #endif
# Line 560  escape sequence. Line 683  escape sequence.
683  Argument:  Argument:
684    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
685    negptr         points to a boolean that is set TRUE for negation else FALSE    negptr         points to a boolean that is set TRUE for negation else FALSE
686      dptr           points to an int that is set to the detailed property value
687    errorcodeptr   points to the error code variable    errorcodeptr   points to the error code variable
688    
689  Returns:     value from ucp_type_table, or -1 for an invalid type  Returns:         type value from ucp_type_table, or -1 for an invalid type
690  */  */
691    
692  static int  static int
693  get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)  get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
694  {  {
695  int c, i, bot, top;  int c, i, bot, top;
696  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
697  char name[4];  char name[32];
698    
699  c = *(++ptr);  c = *(++ptr);
700  if (c == 0) goto ERROR_RETURN;  if (c == 0) goto ERROR_RETURN;
701    
702  *negptr = FALSE;  *negptr = FALSE;
703    
704  /* \P or \p can be followed by a one- or two-character name in {}, optionally  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
705  preceded by ^ for negation. */  negation. */
706    
707  if (c == '{')  if (c == '{')
708    {    {
# Line 587  if (c == '{') Line 711  if (c == '{')
711      *negptr = TRUE;      *negptr = TRUE;
712      ptr++;      ptr++;
713      }      }
714    for (i = 0; i <= 2; i++)    for (i = 0; i < (int)sizeof(name) - 1; i++)
715      {      {
716      c = *(++ptr);      c = *(++ptr);
717      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
718      if (c == '}') break;      if (c == '}') break;
719      name[i] = c;      name[i] = c;
720      }      }
721    if (c !='}')   /* Try to distinguish error cases */    if (c !='}') goto ERROR_RETURN;
     {  
     while (*(++ptr) != 0 && *ptr != '}');  
     if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;  
     }  
722    name[i] = 0;    name[i] = 0;
723    }    }
724    
# Line 619  top = _pcre_utt_size; Line 739  top = _pcre_utt_size;
739    
740  while (bot < top)  while (bot < top)
741    {    {
742    i = (bot + top)/2;    i = (bot + top) >> 1;
743    c = strcmp(name, _pcre_utt[i].name);    c = strcmp(name, _pcre_utt[i].name);
744    if (c == 0) return _pcre_utt[i].value;    if (c == 0)
745        {
746        *dptr = _pcre_utt[i].value;
747        return _pcre_utt[i].type;
748        }
749    if (c > 0) bot = i + 1; else top = i;    if (c > 0) bot = i + 1; else top = i;
750    }    }
751    
 UNKNOWN_RETURN:  
752  *errorcodeptr = ERR47;  *errorcodeptr = ERR47;
753  *ptrptr = ptr;  *ptrptr = ptr;
754  return -1;  return -1;
# Line 741  return p; Line 864  return p;
864    
865    
866  /*************************************************  /*************************************************
867    *       Find forward referenced subpattern       *
868    *************************************************/
869    
870    /* This function scans along a pattern's text looking for capturing
871    subpatterns, and counting them. If it finds a named pattern that matches the
872    name it is given, it returns its number. Alternatively, if the name is NULL, it
873    returns when it reaches a given numbered subpattern. This is used for forward
874    references to subpatterns. We know that if (?P< is encountered, the name will
875    be terminated by '>' because that is checked in the first pass.
876    
877    Arguments:
878      ptr          current position in the pattern
879      count        current count of capturing parens so far encountered
880      name         name to seek, or NULL if seeking a numbered subpattern
881      lorn         name length, or subpattern number if name is NULL
882      xmode        TRUE if we are in /x mode
883    
884    Returns:       the number of the named or numbered subpattern sought, or -1 if not found
885    */
886    
887    static int
888    find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
889      BOOL xmode)
890    {
891    const uschar *thisname;
892    
893    for (; *ptr != 0; ptr++)
894      {
895      int term;
896    
897      /* Skip over backslashed characters and also entire \Q...\E */
898    
899      if (*ptr == '\\')
900        {
901        if (*(++ptr) == 0) return -1;
902        if (*ptr == 'Q') for (;;)
903          {
904          while (*(++ptr) != 0 && *ptr != '\\');
905          if (*ptr == 0) return -1;
906          if (*(++ptr) == 'E') break;
907          }
908        continue;
909        }
910    
911      /* Skip over character classes */
912    
913      if (*ptr == '[')
914        {
915        while (*(++ptr) != ']')
916          {
917          if (*ptr == '\\')
918            {
919            if (*(++ptr) == 0) return -1;
920            if (*ptr == 'Q') for (;;)
921              {
922              while (*(++ptr) != 0 && *ptr != '\\');
923              if (*ptr == 0) return -1;
924              if (*(++ptr) == 'E') break;
925              }
926            continue;
927            }
928          }
929        continue;
930        }
931    
932      /* Skip comments in /x mode */
933    
934      if (xmode && *ptr == '#')
935        {
936        while (*(++ptr) != 0 && *ptr != '\n');
937        if (*ptr == 0) return -1;
938        continue;
939        }
940    
941      /* An opening parens must now be a real metacharacter */
942    
943      if (*ptr != '(') continue;
944      if (ptr[1] != '?')
945        {
946        count++;
947        if (name == NULL && count == lorn) return count;
948        continue;
949        }
950    
951      ptr += 2;
952      if (*ptr == 'P') ptr++;                      /* Allow optional P */
953    
954      /* We have to disambiguate (?<! and (?<= from (?<name> */
955    
956      if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
957           *ptr != '\'')
958        continue;
959    
960      count++;
961    
962      if (name == NULL && count == lorn) return count;
963      term = *ptr++;
964      if (term == '<') term = '>';
965      thisname = ptr;
966      while (*ptr != term) ptr++;
967      if (name != NULL && lorn == ptr - thisname &&
968          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
969        return count;
970      }
971    
972    return -1;
973    }
974    
975    
976    
977    /*************************************************
978  *      Find first significant op code            *  *      Find first significant op code            *
979  *************************************************/  *************************************************/
980    
# Line 789  for (;;) Line 1023  for (;;)
1023    
1024      case OP_CALLOUT:      case OP_CALLOUT:
1025      case OP_CREF:      case OP_CREF:
1026      case OP_BRANUMBER:      case OP_RREF:
1027        case OP_DEF:
1028      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1029      break;      break;
1030    
# Line 834  for (;;) Line 1069  for (;;)
1069    {    {
1070    int d;    int d;
1071    register int op = *cc;    register int op = *cc;
   if (op >= OP_BRA) op = OP_BRA;  
1072    
1073    switch (op)    switch (op)
1074      {      {
1075        case OP_CBRA:
1076      case OP_BRA:      case OP_BRA:
1077      case OP_ONCE:      case OP_ONCE:
1078      case OP_COND:      case OP_COND:
1079      d = find_fixedlength(cc, options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1080      if (d < 0) return d;      if (d < 0) return d;
1081      branchlength += d;      branchlength += d;
1082      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 876  for (;;) Line 1111  for (;;)
1111      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1112    
1113      case OP_REVERSE:      case OP_REVERSE:
     case OP_BRANUMBER:  
1114      case OP_CREF:      case OP_CREF:
1115        case OP_RREF:
1116        case OP_DEF:
1117      case OP_OPT:      case OP_OPT:
1118      case OP_CALLOUT:      case OP_CALLOUT:
1119      case OP_SOD:      case OP_SOD:
# Line 895  for (;;) Line 1131  for (;;)
1131    
1132      case OP_CHAR:      case OP_CHAR:
1133      case OP_CHARNC:      case OP_CHARNC:
1134        case OP_NOT:
1135      branchlength++;      branchlength++;
1136      cc += 2;      cc += 2;
1137  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 928  for (;;) Line 1165  for (;;)
1165    
1166      case OP_PROP:      case OP_PROP:
1167      case OP_NOTPROP:      case OP_NOTPROP:
1168      cc++;      cc += 2;
1169      /* Fall through */      /* Fall through */
1170    
1171      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
# Line 1009  Returns:      pointer to the opcode for Line 1246  Returns:      pointer to the opcode for
1246  static const uschar *  static const uschar *
1247  find_bracket(const uschar *code, BOOL utf8, int number)  find_bracket(const uschar *code, BOOL utf8, int number)
1248  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1249  for (;;)  for (;;)
1250    {    {
1251    register int c = *code;    register int c = *code;
1252    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1253    else if (c > OP_BRA)  
1254      /* XCLASS is used for classes that cannot be represented just by a bit
1255      map. This includes negated single high-valued characters. The length in
1256      the table is zero; the actual length is stored in the compiled code. */
1257    
1258      if (c == OP_XCLASS) code += GET(code, 1);
1259    
1260      /* Handle capturing bracket */
1261    
1262      else if (c == OP_CBRA)
1263      {      {
1264      int n = c - OP_BRA;      int n = GET2(code, 1+LINK_SIZE);
     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);  
1265      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1266      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[c];
1267      }      }
1268    
1269      /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1270      a multi-byte character. The length in the table is a minimum, so we have to
1271      arrange to skip the extra bytes. */
1272    
1273    else    else
1274      {      {
1275      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
1276  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1277      if (utf8) switch(c)      if (utf8) switch(c)
1278        {        {
1279        case OP_CHAR:        case OP_CHAR:
# Line 1042  for (;;) Line 1281  for (;;)
1281        case OP_EXACT:        case OP_EXACT:
1282        case OP_UPTO:        case OP_UPTO:
1283        case OP_MINUPTO:        case OP_MINUPTO:
1284          case OP_POSUPTO:
1285        case OP_STAR:        case OP_STAR:
1286        case OP_MINSTAR:        case OP_MINSTAR:
1287          case OP_POSSTAR:
1288        case OP_PLUS:        case OP_PLUS:
1289        case OP_MINPLUS:        case OP_MINPLUS:
1290          case OP_POSPLUS:
1291        case OP_QUERY:        case OP_QUERY:
1292        case OP_MINQUERY:        case OP_MINQUERY:
1293        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1294        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1295        break;        break;
1296        }        }
1297  #endif  #endif
# Line 1083  Returns:      pointer to the opcode for Line 1318  Returns:      pointer to the opcode for
1318  static const uschar *  static const uschar *
1319  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const uschar *code, BOOL utf8)
1320  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1321  for (;;)  for (;;)
1322    {    {
1323    register int c = *code;    register int c = *code;
1324    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1325    else if (c == OP_RECURSE) return code;    if (c == OP_RECURSE) return code;
1326    else if (c > OP_BRA)  
1327      {    /* XCLASS is used for classes that cannot be represented just by a bit
1328      code += _pcre_OP_lengths[OP_BRA];    map. This includes negated single high-valued characters. The length in
1329      }    the table is zero; the actual length is stored in the compiled code. */
1330    
1331      if (c == OP_XCLASS) code += GET(code, 1);
1332    
1333      /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1334      that are followed by a character may be followed by a multi-byte character.
1335      The length in the table is a minimum, so we have to arrange to skip the extra
1336      bytes. */
1337    
1338    else    else
1339      {      {
1340      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
1341  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1342      if (utf8) switch(c)      if (utf8) switch(c)
1343        {        {
1344        case OP_CHAR:        case OP_CHAR:
# Line 1114  for (;;) Line 1346  for (;;)
1346        case OP_EXACT:        case OP_EXACT:
1347        case OP_UPTO:        case OP_UPTO:
1348        case OP_MINUPTO:        case OP_MINUPTO:
1349          case OP_POSUPTO:
1350        case OP_STAR:        case OP_STAR:
1351        case OP_MINSTAR:        case OP_MINSTAR:
1352          case OP_POSSTAR:
1353        case OP_PLUS:        case OP_PLUS:
1354        case OP_MINPLUS:        case OP_MINPLUS:
1355          case OP_POSPLUS:
1356        case OP_QUERY:        case OP_QUERY:
1357        case OP_MINQUERY:        case OP_MINQUERY:
1358        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1359        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1360        break;        break;
1361        }        }
1362  #endif  #endif
# Line 1143  for (;;) Line 1371  for (;;)
1371  *************************************************/  *************************************************/
1372    
1373  /* This function scans through a branch of a compiled pattern to see whether it  /* This function scans through a branch of a compiled pattern to see whether it
1374  can match the empty string or not. It is called only from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1375  below. Note that first_significant_code() skips over assertions. If we hit an  below and from compile_branch() when checking for an unlimited repeat of a
1376  unclosed bracket, we return "empty" - this means we've struck an inner bracket  group that can match nothing. Note that first_significant_code() skips over
1377  whose current branch will already have been scanned.  assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1378    struck an inner bracket whose current branch will already have been scanned.
1379    
1380  Arguments:  Arguments:
1381    code        points to start of search    code        points to start of search
# Line 1160  static BOOL Line 1389  static BOOL
1389  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1390  {  {
1391  register int c;  register int c;
1392  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1393       code < endcode;       code < endcode;
1394       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1395    {    {
# Line 1168  for (code = first_significant_code(code Line 1397  for (code = first_significant_code(code
1397    
1398    c = *code;    c = *code;
1399    
1400    if (c >= OP_BRA)    /* Groups with zero repeats can of course be empty; skip them. */
1401    
1402      if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1403        {
1404        code += _pcre_OP_lengths[c];
1405        do code += GET(code, 1); while (*code == OP_ALT);
1406        c = *code;
1407        continue;
1408        }
1409    
1410      /* For other groups, scan the branches. */
1411    
1412      if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1413      {      {
1414      BOOL empty_branch;      BOOL empty_branch;
1415      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1184  for (code = first_significant_code(code Line 1425  for (code = first_significant_code(code
1425        }        }
1426      while (*code == OP_ALT);      while (*code == OP_ALT);
1427      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
     code += 1 + LINK_SIZE;  
1428      c = *code;      c = *code;
1429        continue;
1430      }      }
1431    
1432    else switch (c)    /* Handle the other opcodes */
1433    
1434      switch (c)
1435      {      {
1436      /* Check for quantifiers after a class */      /* Check for quantifiers after a class */
1437    
# Line 1244  for (code = first_significant_code(code Line 1487  for (code = first_significant_code(code
1487      case OP_NOT:      case OP_NOT:
1488      case OP_PLUS:      case OP_PLUS:
1489      case OP_MINPLUS:      case OP_MINPLUS:
1490        case OP_POSPLUS:
1491      case OP_EXACT:      case OP_EXACT:
1492      case OP_NOTPLUS:      case OP_NOTPLUS:
1493      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
1494        case OP_NOTPOSPLUS:
1495      case OP_NOTEXACT:      case OP_NOTEXACT:
1496      case OP_TYPEPLUS:      case OP_TYPEPLUS:
1497      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
1498        case OP_TYPEPOSPLUS:
1499      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1500      return FALSE;      return FALSE;
1501    
# Line 1261  for (code = first_significant_code(code Line 1507  for (code = first_significant_code(code
1507      case OP_ALT:      case OP_ALT:
1508      return TRUE;      return TRUE;
1509    
1510      /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1511      followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
1512    
1513  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1514      case OP_STAR:      case OP_STAR:
1515      case OP_MINSTAR:      case OP_MINSTAR:
1516        case OP_POSSTAR:
1517      case OP_QUERY:      case OP_QUERY:
1518      case OP_MINQUERY:      case OP_MINQUERY:
1519        case OP_POSQUERY:
1520      case OP_UPTO:      case OP_UPTO:
1521      case OP_MINUPTO:      case OP_MINUPTO:
1522        case OP_POSUPTO:
1523      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1524      break;      break;
1525  #endif  #endif
# Line 1388  earlier groups that are outside the curr Line 1637  earlier groups that are outside the curr
1637  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1638  it, after it has been compiled. This means that any OP_RECURSE items within it  it, after it has been compiled. This means that any OP_RECURSE items within it
1639  that refer to the group itself or any contained groups have to have their  that refer to the group itself or any contained groups have to have their
1640  offsets adjusted. That is the job of this function. Before it is called, the  offsets adjusted. That is one of the jobs of this function. Before it is called,
1641  partially compiled regex must be temporarily terminated with OP_END.  the partially compiled regex must be temporarily terminated with OP_END.
1642    
1643    This function has been extended with the possibility of forward references for
1644    recursions and subroutine calls. It must also check the list of such references
1645    for the group we are dealing with. If it finds that one of the recursions in
1646    the current group is on this list, it adjusts the offset in the list, not the
1647    value in the reference (which is a group number).
1648    
1649  Arguments:  Arguments:
1650    group      points to the start of the group    group      points to the start of the group
1651    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
1652    utf8       TRUE in UTF-8 mode    utf8       TRUE in UTF-8 mode
1653    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
1654      save_hwm   the hwm forward reference pointer at the start of the group
1655    
1656  Returns:     nothing  Returns:     nothing
1657  */  */
1658    
1659  static void  static void
1660  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1661      uschar *save_hwm)
1662  {  {
1663  uschar *ptr = group;  uschar *ptr = group;
1664  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1665    {    {
1666    int offset = GET(ptr, 1);    int offset;
1667    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);    uschar *hc;
1668    
1669      /* See if this recursion is on the forward reference list. If so, adjust the
1670      reference. */
1671    
1672      for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1673        {
1674        offset = GET(hc, 0);
1675        if (cd->start_code + offset == ptr + 1)
1676          {
1677          PUT(hc, 0, offset + adjust);
1678          break;
1679          }
1680        }
1681    
1682      /* Otherwise, adjust the recursion offset if it's after the start of this
1683      group. */
1684    
1685      if (hc >= cd->hwm)
1686        {
1687        offset = GET(ptr, 1);
1688        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1689        }
1690    
1691    ptr += 1 + LINK_SIZE;    ptr += 1 + LINK_SIZE;
1692    }    }
1693  }  }
# Line 1486  Yield:        TRUE when range returned; Line 1766  Yield:        TRUE when range returned;
1766  */  */
1767    
1768  static BOOL  static BOOL
1769  get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1770      unsigned int *odptr)
1771  {  {
1772  int c, chartype, othercase, next;  unsigned int c, othercase, next;
1773    
1774  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
1775    {    { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
   if (_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0)  
     break;  
   }  
1776    
1777  if (c > d) return FALSE;  if (c > d) return FALSE;
1778    
# Line 1503  next = othercase + 1; Line 1781  next = othercase + 1;
1781    
1782  for (++c; c <= d; c++)  for (++c; c <= d; c++)
1783    {    {
1784    if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_L ||    if (_pcre_ucp_othercase(c) != next) break;
         othercase != next)  
     break;  
1785    next++;    next++;
1786    }    }
1787    
# Line 1517  return TRUE; Line 1793  return TRUE;
1793  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1794    
1795    
1796    
1797  /*************************************************  /*************************************************
1798  *           Compile one branch                   *  *     Check if auto-possessifying is possible    *
1799  *************************************************/  *************************************************/
1800    
1801  /* Scan the pattern, compiling it into the code vector. If the options are  /* This function is called for unlimited repeats of certain items, to see
1802  changed during the branch, the pointer is used to change the external options  whether the next thing could possibly match the repeated item. If not, it makes
1803  bits.  sense to automatically possessify the repeated item.
1804    
1805  Arguments:  Arguments:
1806    optionsptr     pointer to the option bits    op_code       the repeated op code
1807    brackets       points to number of extracting brackets used    item          data for this item, depends on the opcode
1808    codeptr        points to the pointer to the current code point    utf8          TRUE in UTF-8 mode
1809    ptrptr         points to the current pattern pointer    utf8_char     used for utf8 character bytes, NULL if not relevant
1810    errorcodeptr   points to error code variable    ptr           next character in pattern
1811    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    options       options bits
1812    reqbyteptr     set to the last literal character required, else < 0    cd            contains pointers to tables etc.
   bcptr          points to current branch chain  
   cd             contains pointers to tables etc.  
1813    
1814  Returns:         TRUE on success  Returns:        TRUE if possessifying is wanted
                  FALSE, with *errorcodeptr set non-zero on error  
1815  */  */
1816    
1817  static BOOL  static BOOL
1818  compile_branch(int *optionsptr, int *brackets, uschar **codeptr,  check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1819    const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,    const uschar *ptr, int options, compile_data *cd)
   int *reqbyteptr, branch_chain *bcptr, compile_data *cd)  
1820  {  {
1821  int repeat_type, op_type;  int next;
1822  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  
1823  int bravalue = 0;  /* Skip whitespace and comments in extended mode */
1824  int greedy_default, greedy_non_default;  
1825  int firstbyte, reqbyte;  if ((options & PCRE_EXTENDED) != 0)
1826  int zeroreqbyte, zerofirstbyte;    {
1827  int req_caseopt, reqvary, tempreqvary;    for (;;)
1828  int condcount = 0;      {
1829  int options = *optionsptr;      while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1830  int after_manual_callout = 0;      if (*ptr == '#')
1831  register int c;        {
1832  register uschar *code = *codeptr;        while (*(++ptr) != 0)
1833  uschar *tempcode;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1834  BOOL inescq = FALSE;        }
1835  BOOL groupsetfirstbyte = FALSE;      else break;
1836  const uschar *ptr = *ptrptr;      }
1837  const uschar *tempptr;    }
 uschar *previous = NULL;  
 uschar *previous_callout = NULL;  
 uschar classbits[32];  
1838    
1839    /* If the next item is one that we can handle, get its value. A non-negative
1840    value is a character, a negative value is an escape value. */
1841    
1842    if (*ptr == '\\')
1843      {
1844      int temperrorcode = 0;
1845      next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1846      if (temperrorcode != 0) return FALSE;
1847      ptr++;    /* Point after the escape sequence */
1848      }
1849    
1850    else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1851      {
1852  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1853  BOOL class_utf8;    if (utf8) { GETCHARINC(next, ptr); } else
 BOOL utf8 = (options & PCRE_UTF8) != 0;  
 uschar *class_utf8data;  
 uschar utf8_char[6];  
 #else  
 BOOL utf8 = FALSE;  
1854  #endif  #endif
1855      next = *ptr++;
1856      }
1857    
1858  /* Set up the default and non-default settings for greediness */  else return FALSE;
1859    
1860  greedy_default = ((options & PCRE_UNGREEDY) != 0);  /* Skip whitespace and comments in extended mode */
 greedy_non_default = greedy_default ^ 1;  
1861    
1862  /* Initialize no first byte, no required byte. REQ_UNSET means "no char  if ((options & PCRE_EXTENDED) != 0)
1863  matching encountered yet". It gets changed to REQ_NONE if we hit something that    {
1864  matches a non-fixed char first char; reqbyte just remains unset if we never    for (;;)
1865  find one.      {
1866        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1867        if (*ptr == '#')
1868          {
1869          while (*(++ptr) != 0)
1870            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1871          }
1872        else break;
1873        }
1874      }
1875    
1876  When we hit a repeat whose minimum is zero, we may have to adjust these values  /* If the next thing is itself optional, we have to give up. */
 to take the zero repeat into account. This is implemented by setting them to  
 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual  
 item types that can be repeated set these backoff variables appropriately. */  
1877    
1878  firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;  if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1879      return FALSE;
1880    
1881  /* The variable req_caseopt contains either the REQ_CASELESS value or zero,  /* Now compare the next item with the previous opcode. If the previous is a
1882  according to the current setting of the caseless flag. REQ_CASELESS is a bit  positive single character match, "item" either contains the character or, if
1883  value > 255. It is added into the firstbyte or reqbyte variables to record the  "item" is greater than 127 in utf8 mode, the character's bytes are in
1884  case status of the value. This is used only for ASCII characters. */  utf8_char. */
1885    
 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;  
1886    
1887  /* Switch on next character until the end of the branch */  /* Handle cases when the next item is a character. */
1888    
1889  for (;; ptr++)  if (next >= 0) switch(op_code)
1890    {    {
1891    BOOL negate_class;    case OP_CHAR:
1892    BOOL possessive_quantifier;  #ifdef SUPPORT_UTF8
1893    BOOL is_quantifier;    if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1894    int class_charcount;  #endif
1895    int class_lastchar;    return item != next;
1896    int newoptions;  
1897    int recno;    /* For CHARNC (caseless character) we must check the other case. If we have
1898    int skipbytes;    Unicode property support, we can use it to test the other case of
1899    int subreqbyte;    high-valued characters. */
1900    int subfirstbyte;  
1901    int mclength;    case OP_CHARNC:
1902    #ifdef SUPPORT_UTF8
1903      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1904    #endif
1905      if (item == next) return FALSE;
1906    #ifdef SUPPORT_UTF8
1907      if (utf8)
1908        {
1909        unsigned int othercase;
1910        if (next < 128) othercase = cd->fcc[next]; else
1911    #ifdef SUPPORT_UCP
1912        othercase = _pcre_ucp_othercase((unsigned int)next);
1913    #else
1914        othercase = NOTACHAR;
1915    #endif
1916        return (unsigned int)item != othercase;
1917        }
1918      else
1919    #endif  /* SUPPORT_UTF8 */
1920      return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
1921    
1922      /* For OP_NOT, "item" must be a single-byte character. */
1923    
1924      case OP_NOT:
1925      if (next < 0) return FALSE;  /* Not a character */
1926      if (item == next) return TRUE;
1927      if ((options & PCRE_CASELESS) == 0) return FALSE;
1928    #ifdef SUPPORT_UTF8
1929      if (utf8)
1930        {
1931        unsigned int othercase;
1932        if (next < 128) othercase = cd->fcc[next]; else
1933    #ifdef SUPPORT_UCP
1934        othercase = _pcre_ucp_othercase(next);
1935    #else
1936        othercase = NOTACHAR;
1937    #endif
1938        return (unsigned int)item == othercase;
1939        }
1940      else
1941    #endif  /* SUPPORT_UTF8 */
1942      return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
1943    
1944      case OP_DIGIT:
1945      return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1946    
1947      case OP_NOT_DIGIT:
1948      return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1949    
1950      case OP_WHITESPACE:
1951      return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1952    
1953      case OP_NOT_WHITESPACE:
1954      return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1955    
1956      case OP_WORDCHAR:
1957      return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1958    
1959      case OP_NOT_WORDCHAR:
1960      return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1961    
1962      case OP_HSPACE:
1963      case OP_NOT_HSPACE:
1964      switch(next)
1965        {
1966        case 0x09:
1967        case 0x20:
1968        case 0xa0:
1969        case 0x1680:
1970        case 0x180e:
1971        case 0x2000:
1972        case 0x2001:
1973        case 0x2002:
1974        case 0x2003:
1975        case 0x2004:
1976        case 0x2005:
1977        case 0x2006:
1978        case 0x2007:
1979        case 0x2008:
1980        case 0x2009:
1981        case 0x200A:
1982        case 0x202f:
1983        case 0x205f:
1984        case 0x3000:
1985        return op_code != OP_HSPACE;
1986        default:
1987        return op_code == OP_HSPACE;
1988        }
1989    
1990      case OP_VSPACE:
1991      case OP_NOT_VSPACE:
1992      switch(next)
1993        {
1994        case 0x0a:
1995        case 0x0b:
1996        case 0x0c:
1997        case 0x0d:
1998        case 0x85:
1999        case 0x2028:
2000        case 0x2029:
2001        return op_code != OP_VSPACE;
2002        default:
2003        return op_code == OP_VSPACE;
2004        }
2005    
2006      default:
2007      return FALSE;
2008      }
2009    
2010    
2011    /* Handle the case when the next item is \d, \s, etc. */
2012    
2013    switch(op_code)
2014      {
2015      case OP_CHAR:
2016      case OP_CHARNC:
2017    #ifdef SUPPORT_UTF8
2018      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2019    #endif
2020      switch(-next)
2021        {
2022        case ESC_d:
2023        return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2024    
2025        case ESC_D:
2026        return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2027    
2028        case ESC_s:
2029        return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2030    
2031        case ESC_S:
2032        return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2033    
2034        case ESC_w:
2035        return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2036    
2037        case ESC_W:
2038        return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2039    
2040        case ESC_h:
2041        case ESC_H:
2042        switch(item)
2043          {
2044          case 0x09:
2045          case 0x20:
2046          case 0xa0:
2047          case 0x1680:
2048          case 0x180e:
2049          case 0x2000:
2050          case 0x2001:
2051          case 0x2002:
2052          case 0x2003:
2053          case 0x2004:
2054          case 0x2005:
2055          case 0x2006:
2056          case 0x2007:
2057          case 0x2008:
2058          case 0x2009:
2059          case 0x200A:
2060          case 0x202f:
2061          case 0x205f:
2062          case 0x3000:
2063          return -next != ESC_h;
2064          default:
2065          return -next == ESC_h;
2066          }
2067    
2068        case ESC_v:
2069        case ESC_V:
2070        switch(item)
2071          {
2072          case 0x0a:
2073          case 0x0b:
2074          case 0x0c:
2075          case 0x0d:
2076          case 0x85:
2077          case 0x2028:
2078          case 0x2029:
2079          return -next != ESC_v;
2080          default:
2081          return -next == ESC_v;
2082          }
2083    
2084        default:
2085        return FALSE;
2086        }
2087    
2088      case OP_DIGIT:
2089      return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2090             next == -ESC_h || next == -ESC_v;
2091    
2092      case OP_NOT_DIGIT:
2093      return next == -ESC_d;
2094    
2095      case OP_WHITESPACE:
2096      return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2097    
2098      case OP_NOT_WHITESPACE:
2099      return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2100    
2101      case OP_HSPACE:
2102      return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2103    
2104      case OP_NOT_HSPACE:
2105      return next == -ESC_h;
2106    
2107      /* Can't have \S in here because VT matches \S (Perl anomaly) */
2108      case OP_VSPACE:
2109      return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2110    
2111      case OP_NOT_VSPACE:
2112      return next == -ESC_v;
2113    
2114      case OP_WORDCHAR:
2115      return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2116    
2117      case OP_NOT_WORDCHAR:
2118      return next == -ESC_w || next == -ESC_d;
2119    
2120      default:
2121      return FALSE;
2122      }
2123    
2124    /* Control does not reach here */
2125    }
2126    
2127    
2128    
2129    /*************************************************
2130    *           Compile one branch                   *
2131    *************************************************/
2132    
2133    /* Scan the pattern, compiling it into a vector. If the options are
2134    changed during the branch, the pointer is used to change the external options
2135    bits. This function is used during the pre-compile phase when we are trying
2136    to find out the amount of memory needed, as well as during the real compile
2137    phase. The value of lengthptr distinguishes the two phases.
2138    
2139    Arguments:
2140      optionsptr     pointer to the option bits
2141      codeptr        points to the pointer to the current code point
2142      ptrptr         points to the current pattern pointer
2143      errorcodeptr   points to error code variable
2144      firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2145      reqbyteptr     set to the last literal character required, else < 0
2146      bcptr          points to current branch chain
2147      cd             contains pointers to tables etc.
2148      lengthptr      NULL during the real compile phase
2149                     points to length accumulator during pre-compile phase
2150    
2151    Returns:         TRUE on success
2152                     FALSE, with *errorcodeptr set non-zero on error
2153    */
2154    
2155    static BOOL
2156    compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2157      int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2158      compile_data *cd, int *lengthptr)
2159    {
2160    int repeat_type, op_type;
2161    int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2162    int bravalue = 0;
2163    int greedy_default, greedy_non_default;
2164    int firstbyte, reqbyte;
2165    int zeroreqbyte, zerofirstbyte;
2166    int req_caseopt, reqvary, tempreqvary;
2167    int options = *optionsptr;
2168    int after_manual_callout = 0;
2169    int length_prevgroup = 0;
2170    register int c;
2171    register uschar *code = *codeptr;
2172    uschar *last_code = code;
2173    uschar *orig_code = code;
2174    uschar *tempcode;
2175    BOOL inescq = FALSE;
2176    BOOL groupsetfirstbyte = FALSE;
2177    const uschar *ptr = *ptrptr;
2178    const uschar *tempptr;
2179    uschar *previous = NULL;
2180    uschar *previous_callout = NULL;
2181    uschar *save_hwm = NULL;
2182    uschar classbits[32];
2183    
2184    #ifdef SUPPORT_UTF8
2185    BOOL class_utf8;
2186    BOOL utf8 = (options & PCRE_UTF8) != 0;
2187    uschar *class_utf8data;
2188    uschar utf8_char[6];
2189    #else
2190    BOOL utf8 = FALSE;
2191    uschar *utf8_char = NULL;
2192    #endif
2193    
2194    #ifdef DEBUG
2195    if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2196    #endif
2197    
2198    /* Set up the default and non-default settings for greediness */
2199    
2200    greedy_default = ((options & PCRE_UNGREEDY) != 0);
2201    greedy_non_default = greedy_default ^ 1;
2202    
2203    /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2204    matching encountered yet". It gets changed to REQ_NONE if we hit something that
2205    matches a non-fixed char first char; reqbyte just remains unset if we never
2206    find one.
2207    
2208    When we hit a repeat whose minimum is zero, we may have to adjust these values
2209    to take the zero repeat into account. This is implemented by setting them to
2210    zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2211    item types that can be repeated set these backoff variables appropriately. */
2212    
2213    firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2214    
2215    /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2216    according to the current setting of the caseless flag. REQ_CASELESS is a bit
2217    value > 255. It is added into the firstbyte or reqbyte variables to record the
2218    case status of the value. This is used only for ASCII characters. */
2219    
2220    req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2221    
2222    /* Switch on next character until the end of the branch */
2223    
2224    for (;; ptr++)
2225      {
2226      BOOL negate_class;
2227      BOOL possessive_quantifier;
2228      BOOL is_quantifier;
2229      BOOL is_recurse;
2230      BOOL reset_bracount;
2231      int class_charcount;
2232      int class_lastchar;
2233      int newoptions;
2234      int recno;
2235      int refsign;
2236      int skipbytes;
2237      int subreqbyte;
2238      int subfirstbyte;
2239      int terminator;
2240      int mclength;
2241    uschar mcbuffer[8];    uschar mcbuffer[8];
2242    
2243    /* Next byte in the pattern */    /* Get next byte in the pattern */
2244    
2245    c = *ptr;    c = *ptr;
2246    
2247      /* If we are in the pre-compile phase, accumulate the length used for the
2248      previous cycle of this loop. */
2249    
2250      if (lengthptr != NULL)
2251        {
2252    #ifdef DEBUG
2253        if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2254    #endif
2255        if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2256          {
2257          *errorcodeptr = ERR52;
2258          goto FAILED;
2259          }
2260    
2261        /* There is at least one situation where code goes backwards: this is the
2262        case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2263        the class is simply eliminated. However, it is created first, so we have to
2264        allow memory for it. Therefore, don't ever reduce the length at this point.
2265        */
2266    
2267        if (code < last_code) code = last_code;
2268    
2269        /* Paranoid check for integer overflow */
2270    
2271        if (OFLOW_MAX - *lengthptr < code - last_code)
2272          {
2273          *errorcodeptr = ERR20;
2274          goto FAILED;
2275          }
2276    
2277        *lengthptr += code - last_code;
2278        DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2279    
2280        /* If "previous" is set and it is not at the start of the work space, move
2281        it back to there, in order to avoid filling up the work space. Otherwise,
2282        if "previous" is NULL, reset the current code pointer to the start. */
2283    
2284        if (previous != NULL)
2285          {
2286          if (previous > orig_code)
2287            {
2288            memmove(orig_code, previous, code - previous);
2289            code -= previous - orig_code;
2290            previous = orig_code;
2291            }
2292          }
2293        else code = orig_code;
2294    
2295        /* Remember where this code item starts so we can pick up the length
2296        next time round. */
2297    
2298        last_code = code;
2299        }
2300    
2301      /* In the real compile phase, just check the workspace used by the forward
2302      reference list. */
2303    
2304      else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2305        {
2306        *errorcodeptr = ERR52;
2307        goto FAILED;
2308        }
2309    
2310    /* If in \Q...\E, check for the end; if not, we have a literal */    /* If in \Q...\E, check for the end; if not, we have a literal */
2311    
2312    if (inescq && c != 0)    if (inescq && c != 0)
# Line 1634  for (;; ptr++) Line 2321  for (;; ptr++)
2321        {        {
2322        if (previous_callout != NULL)        if (previous_callout != NULL)
2323          {          {
2324          complete_callout(previous_callout, ptr, cd);          if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2325              complete_callout(previous_callout, ptr, cd);
2326          previous_callout = NULL;          previous_callout = NULL;
2327          }          }
2328        if ((options & PCRE_AUTO_CALLOUT) != 0)        if ((options & PCRE_AUTO_CALLOUT) != 0)
# Line 1655  for (;; ptr++) Line 2343  for (;; ptr++)
2343    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2344         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
2345      {      {
2346      complete_callout(previous_callout, ptr, cd);      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2347          complete_callout(previous_callout, ptr, cd);
2348      previous_callout = NULL;      previous_callout = NULL;
2349      }      }
2350    
# Line 1666  for (;; ptr++) Line 2355  for (;; ptr++)
2355      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2356      if (c == '#')      if (c == '#')
2357        {        {
2358        /* The space before the ; is to avoid a warning on a silly compiler        while (*(++ptr) != 0)
2359        on the Macintosh. */          {
2360        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2361        if (c != 0) continue;   /* Else fall through to handle end of string */          }
2362          if (*ptr != 0) continue;
2363    
2364          /* Else fall through to handle end of string */
2365          c = 0;
2366        }        }
2367      }      }
2368    
# Line 1683  for (;; ptr++) Line 2376  for (;; ptr++)
2376    
2377    switch(c)    switch(c)
2378      {      {
2379      /* The branch terminates at end of string, |, or ). */      /* ===================================================================*/
2380        case 0:                        /* The branch terminates at string end */
2381      case 0:      case '|':                      /* or | or ) */
     case '|':  
2382      case ')':      case ')':
2383      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2384      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2385      *codeptr = code;      *codeptr = code;
2386      *ptrptr = ptr;      *ptrptr = ptr;
2387        if (lengthptr != NULL)
2388          {
2389          if (OFLOW_MAX - *lengthptr < code - last_code)
2390            {
2391            *errorcodeptr = ERR20;
2392            goto FAILED;
2393            }
2394          *lengthptr += code - last_code;   /* To include callout length */
2395          DPRINTF((">> end branch\n"));
2396          }
2397      return TRUE;      return TRUE;
2398    
2399    
2400        /* ===================================================================*/
2401      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2402      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2403    
# Line 1722  for (;; ptr++) Line 2426  for (;; ptr++)
2426      *code++ = OP_ANY;      *code++ = OP_ANY;
2427      break;      break;
2428    
2429      /* Character classes. If the included characters are all < 255 in value, we  
2430      build a 32-byte bitmap of the permitted characters, except in the special      /* ===================================================================*/
2431      case where there is only one such character. For negated classes, we build      /* Character classes. If the included characters are all < 256, we build a
2432      the map as usual, then invert it at the end. However, we use a different      32-byte bitmap of the permitted characters, except in the special case
2433      opcode so that data characters > 255 can be handled correctly.      where there is only one such character. For negated classes, we build the
2434        map as usual, then invert it at the end. However, we use a different opcode
2435        so that data characters > 255 can be handled correctly.
2436    
2437      If the class contains characters outside the 0-255 range, a different      If the class contains characters outside the 0-255 range, a different
2438      opcode is compiled. It may optionally have a bit map for characters < 256,      opcode is compiled. It may optionally have a bit map for characters < 256,
# Line 1747  for (;; ptr++) Line 2453  for (;; ptr++)
2453        goto FAILED;        goto FAILED;
2454        }        }
2455    
2456      /* If the first character is '^', set the negation flag and skip it. */      /* If the first character is '^', set the negation flag and skip it. Also,
2457        if the first few characters (either before or after ^) are \Q\E or \E we
2458      if ((c = *(++ptr)) == '^')      skip them too. This makes for compatibility with Perl. */
2459    
2460        negate_class = FALSE;
2461        for (;;)
2462        {        {
       negate_class = TRUE;  
2463        c = *(++ptr);        c = *(++ptr);
2464        }        if (c == '\\')
2465      else          {
2466        {          if (ptr[1] == 'E') ptr++;
2467        negate_class = FALSE;            else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2468        }              else break;
2469            }
2470          else if (!negate_class && c == '^')
2471            negate_class = TRUE;
2472          else break;
2473          }
2474    
2475      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
2476      of just a single character (as long as it's < 256). For higher valued UTF-8      of just a single character (as long as it's < 256). However, for higher
2477      characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
2478    
2479      class_charcount = 0;      class_charcount = 0;
2480      class_lastchar = -1;      class_lastchar = -1;
2481    
2482        /* Initialize the 32-char bit map to all zeros. We build the map in a
2483        temporary bit of memory, in case the class contains only 1 character (less
2484        than 256), because in that case the compiled code doesn't use the bit map.
2485        */
2486    
2487        memset(classbits, 0, 32 * sizeof(uschar));
2488    
2489  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2490      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2491      class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2492  #endif  #endif
2493    
     /* Initialize the 32-char bit map to all zeros. We have to build the  
     map in a temporary bit of store, in case the class contains only 1  
     character (< 256), because in that case the compiled code doesn't use the  
     bit map. */  
   
     memset(classbits, 0, 32 * sizeof(uschar));  
   
2494      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
2495      means that an initial ] is taken as a data character. The first pass      means that an initial ] is taken as a data character. At the start of the
2496      through the regex checked the overall syntax, so we don't need to be very      loop, c contains the first byte of the character. */
     strict here. At the start of the loop, c contains the first byte of the  
     character. */  
2497    
2498      do      if (c != 0) do
2499        {        {
2500          const uschar *oldptr;
2501    
2502  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2503        if (utf8 && c > 127)        if (utf8 && c > 127)
2504          {                           /* Braces are required because the */          {                           /* Braces are required because the */
# Line 1797  for (;; ptr++) Line 2510  for (;; ptr++)
2510    
2511        if (inescq)        if (inescq)
2512          {          {
2513          if (c == '\\' && ptr[1] == 'E')          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2514            {            {
2515            inescq = FALSE;            inescq = FALSE;                   /* Reset literal state */
2516            ptr++;            ptr++;                            /* Skip the 'E' */
2517            continue;            continue;                         /* Carry on with next */
2518            }            }
2519          else goto LONE_SINGLE_CHARACTER;          goto CHECK_RANGE;                   /* Could be range if \E follows */
2520          }          }
2521    
2522        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
# Line 1817  for (;; ptr++) Line 2530  for (;; ptr++)
2530            check_posix_syntax(ptr, &tempptr, cd))            check_posix_syntax(ptr, &tempptr, cd))
2531          {          {
2532          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
2533          int posix_class, i;          int posix_class, taboffset, tabopt;
2534          register const uschar *cbits = cd->cbits;          register const uschar *cbits = cd->cbits;
2535            uschar pbits[32];
2536    
2537          if (ptr[1] != ':')          if (ptr[1] != ':')
2538            {            {
# Line 1847  for (;; ptr++) Line 2561  for (;; ptr++)
2561          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2562            posix_class = 0;            posix_class = 0;
2563    
2564          /* Or into the map we are building up to 3 of the static class          /* We build the bit map for the POSIX class in a chunk of local store
2565          tables, or their negations. The [:blank:] class sets up the same          because we may be adding and subtracting from it, and we don't want to
2566          chars as the [:space:] class (all white space). We remove the vertical          subtract bits that may be in the main map already. At the end we or the
2567          white space chars afterwards. */          result into the bit map that is being built. */
2568    
2569          posix_class *= 3;          posix_class *= 3;
2570          for (i = 0; i < 3; i++)  
2571            /* Copy in the first table (always present) */
2572    
2573            memcpy(pbits, cbits + posix_class_maps[posix_class],
2574              32 * sizeof(uschar));
2575    
2576            /* If there is a second table, add or remove it as required. */
2577    
2578            taboffset = posix_class_maps[posix_class + 1];
2579            tabopt = posix_class_maps[posix_class + 2];
2580    
2581            if (taboffset >= 0)
2582            {            {
2583            BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;            if (tabopt >= 0)
2584            int taboffset = posix_class_maps[posix_class + i];              for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
           if (taboffset < 0) break;  
           if (local_negate)  
             {  
             if (i == 0)  
               for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];  
             else  
               for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];  
             if (blankclass) classbits[1] |= 0x3c;  
             }  
2585            else            else
2586              {              for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];  
             if (blankclass) classbits[1] &= ~0x3c;  
             }  
2587            }            }
2588    
2589            /* Now see if we need to remove any special characters. An option
2590            value of 1 removes vertical space and 2 removes underscore. */
2591    
2592            if (tabopt < 0) tabopt = -tabopt;
2593            if (tabopt == 1) pbits[1] &= ~0x3c;
2594              else if (tabopt == 2) pbits[11] &= 0x7f;
2595    
2596            /* Add the POSIX table or its complement into the main table that is
2597            being built and we are done. */
2598    
2599            if (local_negate)
2600              for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2601            else
2602              for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2603    
2604          ptr = tempptr + 1;          ptr = tempptr + 1;
2605          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
2606          continue;    /* End of POSIX syntax handling */          continue;    /* End of POSIX syntax handling */
2607          }          }
2608    
2609        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
2610        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. The sequence \b is a special
2611        validity in the pre-compiling pass. The sequence \b is a special case.        case. Inside a class (and only there) it is treated as backspace.
2612        Inside a class (and only there) it is treated as backspace. Elsewhere        Elsewhere it marks a word boundary. Other escapes have preset maps ready
2613        it marks a word boundary. Other escapes have preset maps ready to        to 'or' into the one we are building. We assume they have more than one
       or into the one we are building. We assume they have more than one  
2614        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
2615    
2616        if (c == '\\')        if (c == '\\')
2617          {          {
2618          c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2619            if (*errorcodeptr != 0) goto FAILED;
2620    
2621          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */          if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2622          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2623            else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2624          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
2625            {            {
2626            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == '\\' && ptr[2] == 'E')
# Line 1906  for (;; ptr++) Line 2635  for (;; ptr++)
2635            {            {
2636            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
2637            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
2638            switch (-c)  
2639              /* Save time by not doing this in the pre-compile phase. */
2640    
2641              if (lengthptr == NULL) switch (-c)
2642              {              {
2643              case ESC_d:              case ESC_d:
2644              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
# Line 1934  for (;; ptr++) Line 2666  for (;; ptr++)
2666              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2667              continue;              continue;
2668    
2669  #ifdef SUPPORT_UCP              case ESC_E: /* Perl ignores an orphan \E */
             case ESC_p:  
             case ESC_P:  
               {  
               BOOL negated;  
               int property = get_ucp(&ptr, &negated, errorcodeptr);  
               if (property < 0) goto FAILED;  
               class_utf8 = TRUE;  
               *class_utf8data++ = ((-c == ESC_p) != negated)?  
                 XCL_PROP : XCL_NOTPROP;  
               *class_utf8data++ = property;  
               class_charcount -= 2;   /* Not a < 256 character */  
               }  
2670              continue;              continue;
 #endif  
   
             /* Unrecognized escapes are faulted if PCRE is running in its  
             strict mode. By default, for compatibility with Perl, they are  
             treated as literals. */  
2671    
2672              default:              default:    /* Not recognized; fall through */
2673              if ((options & PCRE_EXTRA) != 0)              break;      /* Need "default" setting to stop compiler warning. */
               {  
               *errorcodeptr = ERR7;  
               goto FAILED;  
               }  
             c = *ptr;              /* The final character */  
             class_charcount -= 2;  /* Undo the default count from above */  
2674              }              }
           }  
2675    
2676          /* Fall through if we have a single character (c >= 0). This may be            /* In the pre-compile phase, just do the recognition. */
         > 256 in UTF-8 mode. */  
2677    
2678          }   /* End of backslash handling */            else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2679                       c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2680    
2681              /* We need to deal with \H, \h, \V, and \v in both phases because
2682              they use extra memory. */
2683    
2684              if (-c == ESC_h)
2685                {
2686                SETBIT(classbits, 0x09); /* HT */
2687                SETBIT(classbits, 0x20); /* SPACE */
2688                SETBIT(classbits, 0xa0); /* NBSP */
2689    #ifdef SUPPORT_UTF8
2690                if (utf8)
2691                  {
2692                  class_utf8 = TRUE;
2693                  *class_utf8data++ = XCL_SINGLE;
2694                  class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2695                  *class_utf8data++ = XCL_SINGLE;
2696                  class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2697                  *class_utf8data++ = XCL_RANGE;
2698                  class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2699                  class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2700                  *class_utf8data++ = XCL_SINGLE;
2701                  class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2702                  *class_utf8data++ = XCL_SINGLE;
2703                  class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2704                  *class_utf8data++ = XCL_SINGLE;
2705                  class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2706                  }
2707    #endif
2708                continue;
2709                }
2710    
2711              if (-c == ESC_H)
2712                {
2713                for (c = 0; c < 32; c++)
2714                  {
2715                  int x = 0xff;
2716                  switch (c)
2717                    {
2718                    case 0x09/8: x ^= 1 << (0x09%8); break;
2719                    case 0x20/8: x ^= 1 << (0x20%8); break;
2720                    case 0xa0/8: x ^= 1 << (0xa0%8); break;
2721                    default: break;
2722                    }
2723                  classbits[c] |= x;
2724                  }
2725    
2726    #ifdef SUPPORT_UTF8
2727                if (utf8)
2728                  {
2729                  class_utf8 = TRUE;
2730                  *class_utf8data++ = XCL_RANGE;
2731                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2732                  class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2733                  *class_utf8data++ = XCL_RANGE;
2734                  class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2735                  class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2736                  *class_utf8data++ = XCL_RANGE;
2737                  class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2738                  class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2739                  *class_utf8data++ = XCL_RANGE;
2740                  class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2741                  class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2742                  *class_utf8data++ = XCL_RANGE;
2743                  class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2744                  class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2745                  *class_utf8data++ = XCL_RANGE;
2746                  class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2747                  class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2748                  *class_utf8data++ = XCL_RANGE;
2749                  class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2750                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2751                  }
2752    #endif
2753                continue;
2754                }
2755    
2756              if (-c == ESC_v)
2757                {
2758                SETBIT(classbits, 0x0a); /* LF */
2759                SETBIT(classbits, 0x0b); /* VT */
2760                SETBIT(classbits, 0x0c); /* FF */
2761                SETBIT(classbits, 0x0d); /* CR */
2762                SETBIT(classbits, 0x85); /* NEL */
2763    #ifdef SUPPORT_UTF8
2764                if (utf8)
2765                  {
2766                  class_utf8 = TRUE;
2767                  *class_utf8data++ = XCL_RANGE;
2768                  class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2769                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2770                  }
2771    #endif
2772                continue;
2773                }
2774    
2775              if (-c == ESC_V)
2776                {
2777                for (c = 0; c < 32; c++)
2778                  {
2779                  int x = 0xff;
2780                  switch (c)
2781                    {
2782                    case 0x0a/8: x ^= 1 << (0x0a%8);
2783                                 x ^= 1 << (0x0b%8);
2784                                 x ^= 1 << (0x0c%8);
2785                                 x ^= 1 << (0x0d%8);
2786                                 break;
2787                    case 0x85/8: x ^= 1 << (0x85%8); break;
2788                    default: break;
2789                    }
2790                  classbits[c] |= x;
2791                  }
2792    
2793    #ifdef SUPPORT_UTF8
2794                if (utf8)
2795                  {
2796                  class_utf8 = TRUE;
2797                  *class_utf8data++ = XCL_RANGE;
2798                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2799                  class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2800                  *class_utf8data++ = XCL_RANGE;
2801                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2802                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2803                  }
2804    #endif
2805                continue;
2806                }
2807    
2808              /* We need to deal with \P and \p in both phases. */
2809    
2810    #ifdef SUPPORT_UCP
2811              if (-c == ESC_p || -c == ESC_P)
2812                {
2813                BOOL negated;
2814                int pdata;
2815                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2816                if (ptype < 0) goto FAILED;
2817                class_utf8 = TRUE;
2818                *class_utf8data++ = ((-c == ESC_p) != negated)?
2819                  XCL_PROP : XCL_NOTPROP;
2820                *class_utf8data++ = ptype;
2821                *class_utf8data++ = pdata;
2822                class_charcount -= 2;   /* Not a < 256 character */
2823                continue;
2824                }
2825    #endif
2826              /* Unrecognized escapes are faulted if PCRE is running in its
2827              strict mode. By default, for compatibility with Perl, they are
2828              treated as literals. */
2829    
2830              if ((options & PCRE_EXTRA) != 0)
2831                {
2832                *errorcodeptr = ERR7;
2833                goto FAILED;
2834                }
2835    
2836              class_charcount -= 2;  /* Undo the default count from above */
2837              c = *ptr;              /* Get the final character and fall through */
2838              }
2839    
2840            /* Fall through if we have a single character (c >= 0). This may be
2841            greater than 256 in UTF-8 mode. */
2842    
2843            }   /* End of backslash handling */
2844    
2845        /* A single character may be followed by '-' to form a range. However,        /* A single character may be followed by '-' to form a range. However,
2846        Perl does not permit ']' to be the end of the range. A '-' character        Perl does not permit ']' to be the end of the range. A '-' character
2847        here is treated as a literal. */        at the end is treated as a literal. Perl ignores orphaned \E sequences
2848          entirely. The code for handling \Q and \E is messy. */
2849    
2850          CHECK_RANGE:
2851          while (ptr[1] == '\\' && ptr[2] == 'E')
2852            {
2853            inescq = FALSE;
2854            ptr += 2;
2855            }
2856    
2857          oldptr = ptr;
2858    
2859        if (ptr[1] == '-' && ptr[2] != ']')        if (!inescq && ptr[1] == '-')
2860          {          {
2861          int d;          int d;
2862          ptr += 2;          ptr += 2;
2863            while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2864    
2865            /* If we hit \Q (not followed by \E) at this point, go into escaped
2866            mode. */
2867    
2868            while (*ptr == '\\' && ptr[1] == 'Q')
2869              {
2870              ptr += 2;
2871              if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2872              inescq = TRUE;
2873              break;
2874              }
2875    
2876            if (*ptr == 0 || (!inescq && *ptr == ']'))
2877              {
2878              ptr = oldptr;
2879              goto LONE_SINGLE_CHARACTER;
2880              }
2881    
2882  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2883          if (utf8)          if (utf8)
# Line 1992  for (;; ptr++) Line 2892  for (;; ptr++)
2892          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2893          in such circumstances. */          in such circumstances. */
2894    
2895          if (d == '\\')          if (!inescq && d == '\\')
2896            {            {
2897            const uschar *oldptr = ptr;            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2898            d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);            if (*errorcodeptr != 0) goto FAILED;
2899    
2900            /* \b is backspace; \X is literal X; any other special means the '-'            /* \b is backspace; \X is literal X; \R is literal R; any other
2901            was literal */            special means the '-' was literal */
2902    
2903            if (d < 0)            if (d < 0)
2904              {              {
2905              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = '\b';
2906              else if (d == -ESC_X) d = 'X'; else              else if (d == -ESC_X) d = 'X';
2907                else if (d == -ESC_R) d = 'R'; else
2908                {                {
2909                ptr = oldptr - 2;                ptr = oldptr;
2910                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2911                }                }
2912              }              }
2913            }            }
2914    
2915          /* The check that the two values are in the correct order happens in          /* Check that the two values are in the correct order. Optimize
2916          the pre-pass. Optimize one-character ranges */          one-character ranges */
2917    
2918            if (d < c)
2919              {
2920              *errorcodeptr = ERR8;
2921              goto FAILED;
2922              }
2923    
2924          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2925    
# Line 2033  for (;; ptr++) Line 2940  for (;; ptr++)
2940  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2941            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
2942              {              {
2943              int occ, ocd;              unsigned int occ, ocd;
2944              int cc = c;              unsigned int cc = c;
2945              int origd = d;              unsigned int origd = d;
2946              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
2947                {                {
2948                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= (unsigned int)c &&
2949                      ocd <= (unsigned int)d)
2950                    continue;                          /* Skip embedded ranges */
2951    
2952                if (occ < c  && ocd >= c - 1)        /* Extend the basic range */                if (occ < (unsigned int)c  &&
2953                      ocd >= (unsigned int)c - 1)      /* Extend the basic range */
2954                  {                                  /* if there is overlap,   */                  {                                  /* if there is overlap,   */
2955                  c = occ;                           /* noting that if occ < c */                  c = occ;                           /* noting that if occ < c */
2956                  continue;                          /* we can't have ocd > d  */                  continue;                          /* we can't have ocd > d  */
2957                  }                                  /* because a subrange is  */                  }                                  /* because a subrange is  */
2958                if (ocd > d && occ <= d + 1)         /* always shorter than    */                if (ocd > (unsigned int)d &&
2959                      occ <= (unsigned int)d + 1)      /* always shorter than    */
2960                  {                                  /* the basic range.       */                  {                                  /* the basic range.       */
2961                  d = ocd;                  d = ocd;
2962                  continue;                  continue;
# Line 2093  for (;; ptr++) Line 3004  for (;; ptr++)
3004          ranges that lie entirely within 0-127 when there is UCP support; else          ranges that lie entirely within 0-127 when there is UCP support; else
3005          for partial ranges without UCP support. */          for partial ranges without UCP support. */
3006    
3007          for (; c <= d; c++)          class_charcount += d - c + 1;
3008            class_lastchar = d;
3009    
3010            /* We can save a bit of time by skipping this in the pre-compile. */
3011    
3012            if (lengthptr == NULL) for (; c <= d; c++)
3013            {            {
3014            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
3015            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
# Line 2101  for (;; ptr++) Line 3017  for (;; ptr++)
3017              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
3018              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
3019              }              }
           class_charcount++;                /* in case a one-char range */  
           class_lastchar = c;  
3020            }            }
3021    
3022          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
# Line 2126  for (;; ptr++) Line 3040  for (;; ptr++)
3040  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3041          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
3042            {            {
3043            int chartype;            unsigned int othercase;
3044            int othercase;            if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
           if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 &&  
                othercase > 0)  
3045              {              {
3046              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
3047              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 2154  for (;; ptr++) Line 3066  for (;; ptr++)
3066          }          }
3067        }        }
3068    
3069      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached. This "while" is the end of the "do" above. */
     loop. This "while" is the end of the "do" above. */  
3070    
3071      while ((c = *(++ptr)) != ']' || inescq);      while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3072    
3073        if (c == 0)                          /* Missing terminating ']' */
3074          {
3075          *errorcodeptr = ERR6;
3076          goto FAILED;
3077          }
3078    
3079      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
3080      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
3081      can optimize the negative case only if there were no characters >= 128      can optimize the negative case only if there were no characters >= 128
# Line 2221  for (;; ptr++) Line 3138  for (;; ptr++)
3138    
3139      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
3140      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode. If there are no characters < 256,
3141      we can omit the bitmap. */      we can omit the bitmap in the actual compiled code. */
3142    
3143  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3144      if (class_utf8)      if (class_utf8)
# Line 2231  for (;; ptr++) Line 3148  for (;; ptr++)
3148        code += LINK_SIZE;        code += LINK_SIZE;
3149        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT : 0;
3150    
3151        /* If the map is required, install it, and move on to the end of        /* If the map is required, move up the extra data to make room for it;
3152        the extra data */        otherwise just move the code pointer to the end of the extra data. */
3153    
3154        if (class_charcount > 0)        if (class_charcount > 0)
3155          {          {
3156          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
3157            memmove(code + 32, code, class_utf8data - code);
3158          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
3159          code = class_utf8data;          code = class_utf8data + 32;
         }  
   
       /* If the map is not required, slide down the extra data. */  
   
       else  
         {  
         int len = class_utf8data - (code + 33);  
         memmove(code + 1, code + 33, len);  
         code += len + 1;  
3160          }          }
3161          else code = class_utf8data;
3162    
3163        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
3164    
# Line 2265  for (;; ptr++) Line 3175  for (;; ptr++)
3175      if (negate_class)      if (negate_class)
3176        {        {
3177        *code++ = OP_NCLASS;        *code++ = OP_NCLASS;
3178        for (c = 0; c < 32; c++) code[c] = ~classbits[c];        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3179            for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3180        }        }
3181      else      else
3182        {        {
# Line 2275  for (;; ptr++) Line 3186  for (;; ptr++)
3186      code += 32;      code += 32;
3187      break;      break;
3188    
3189    
3190        /* ===================================================================*/
3191      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3192      has been tested above. */      has been tested above. */
3193    
# Line 2342  for (;; ptr++) Line 3255  for (;; ptr++)
3255        }        }
3256      else repeat_type = greedy_default;      else repeat_type = greedy_default;
3257    
     /* If previous was a recursion, we need to wrap it inside brackets so that  
     it can be replicated if necessary. */  
   
     if (*previous == OP_RECURSE)  
       {  
       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);  
       code += 1 + LINK_SIZE;  
       *previous = OP_BRA;  
       PUT(previous, 1, code - previous);  
       *code = OP_KET;  
       PUT(code, 1, code - previous);  
       code += 1 + LINK_SIZE;  
       }  
   
3258      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
3259      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
3260      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
# Line 2389  for (;; ptr++) Line 3288  for (;; ptr++)
3288          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3289          }          }
3290    
3291          /* If the repetition is unlimited, it pays to see if the next thing on
3292          the line is something that cannot possibly match this character. If so,
3293          automatically possessifying this item gains some performance in the case
3294          where the match fails. */
3295    
3296          if (!possessive_quantifier &&
3297              repeat_max < 0 &&
3298              check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3299                options, cd))
3300            {
3301            repeat_type = 0;    /* Force greedy */
3302            possessive_quantifier = TRUE;
3303            }
3304    
3305        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3306        }        }
3307    
3308      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
3309      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
3310      character repeats by setting opt_type to add a suitable offset into      character repeats by setting opt_type to add a suitable offset into
3311      repeat_type. OP_NOT is currently used only for single-byte chars. */      repeat_type. We can also test for auto-possessification. OP_NOT is
3312        currently used only for single-byte chars. */
3313    
3314      else if (*previous == OP_NOT)      else if (*previous == OP_NOT)
3315        {        {
3316        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3317        c = previous[1];        c = previous[1];
3318          if (!possessive_quantifier &&
3319              repeat_max < 0 &&
3320              check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3321            {
3322            repeat_type = 0;    /* Force greedy */
3323            possessive_quantifier = TRUE;
3324            }
3325        goto OUTPUT_SINGLE_REPEAT;        goto OUTPUT_SINGLE_REPEAT;
3326        }        }
3327    
# Line 2414  for (;; ptr++) Line 3335  for (;; ptr++)
3335      else if (*previous < OP_EODN)      else if (*previous < OP_EODN)
3336        {        {
3337        uschar *oldcode;        uschar *oldcode;
3338        int prop_type;        int prop_type, prop_value;
3339        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3340        c = *previous;        c = *previous;
3341    
3342          if (!possessive_quantifier &&
3343              repeat_max < 0 &&
3344              check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3345            {
3346            repeat_type = 0;    /* Force greedy */
3347            possessive_quantifier = TRUE;
3348            }
3349    
3350        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
3351        prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?        if (*previous == OP_PROP || *previous == OP_NOTPROP)
3352          previous[1] : -1;          {
3353            prop_type = previous[1];
3354            prop_value = previous[2];
3355            }
3356          else prop_type = prop_value = -1;
3357    
3358        oldcode = code;        oldcode = code;
3359        code = previous;                  /* Usually overwrite previous item */        code = previous;                  /* Usually overwrite previous item */
# Line 2454  for (;; ptr++) Line 3387  for (;; ptr++)
3387          }          }
3388    
3389        /* A repeat minimum of 1 is optimized into some special cases. If the        /* A repeat minimum of 1 is optimized into some special cases. If the
3390        maximum is unlimited, we use OP_PLUS. Otherwise, the original item it        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3391        left in place and, if the maximum is greater than 1, we use OP_UPTO with        left in place and, if the maximum is greater than 1, we use OP_UPTO with
3392        one less than the maximum. */        one less than the maximum. */
3393    
# Line 2481  for (;; ptr++) Line 3414  for (;; ptr++)
3414    
3415          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3416          we have to insert the character for the previous code. For a repeated          we have to insert the character for the previous code. For a repeated
3417          Unicode property match, there is an extra byte that defines the          Unicode property match, there are two extra bytes that define the
3418          required property. In UTF-8 mode, long characters have their length in          required property. In UTF-8 mode, long characters have their length in
3419          c, with the 0x80 bit as a flag. */          c, with the 0x80 bit as a flag. */
3420    
# Line 2497  for (;; ptr++) Line 3430  for (;; ptr++)
3430  #endif  #endif
3431              {              {
3432              *code++ = c;              *code++ = c;
3433              if (prop_type >= 0) *code++ = prop_type;              if (prop_type >= 0)
3434                  {
3435                  *code++ = prop_type;
3436                  *code++ = prop_value;
3437                  }
3438              }              }
3439            *code++ = OP_STAR + repeat_type;            *code++ = OP_STAR + repeat_type;
3440            }            }
3441    
3442          /* Else insert an UPTO if the max is greater than the min, again          /* Else insert an UPTO if the max is greater than the min, again
3443          preceded by the character, for the previously inserted code. */          preceded by the character, for the previously inserted code. If the
3444            UPTO is just for 1 instance, we can use QUERY instead. */
3445    
3446          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
3447            {            {
# Line 2516  for (;; ptr++) Line 3454  for (;; ptr++)
3454            else            else
3455  #endif  #endif
3456            *code++ = c;            *code++ = c;
3457            if (prop_type >= 0) *code++ = prop_type;            if (prop_type >= 0)
3458                {
3459                *code++ = prop_type;
3460                *code++ = prop_value;
3461                }
3462            repeat_max -= repeat_min;            repeat_max -= repeat_min;
3463            *code++ = OP_UPTO + repeat_type;  
3464            PUT2INC(code, 0, repeat_max);            if (repeat_max == 1)
3465                {
3466                *code++ = OP_QUERY + repeat_type;
3467                }
3468              else
3469                {
3470                *code++ = OP_UPTO + repeat_type;
3471                PUT2INC(code, 0, repeat_max);
3472                }
3473            }            }
3474          }          }
3475    
# Line 2535  for (;; ptr++) Line 3485  for (;; ptr++)
3485  #endif  #endif
3486        *code++ = c;        *code++ = c;
3487    
3488        /* For a repeated Unicode property match, there is an extra byte that        /* For a repeated Unicode property match, there are two extra bytes that
3489        defines the required property. */        define the required property. */
3490    
3491  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3492        if (prop_type >= 0) *code++ = prop_type;        if (prop_type >= 0)
3493            {
3494            *code++ = prop_type;
3495            *code++ = prop_value;
3496            }
3497  #endif  #endif
3498        }        }
3499    
# Line 2582  for (;; ptr++) Line 3536  for (;; ptr++)
3536      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
3537      cases. */      cases. */
3538    
3539      else if (*previous >= OP_BRA || *previous == OP_ONCE ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3540               *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
3541        {        {
3542        register int i;        register int i;
3543        int ketoffset = 0;        int ketoffset = 0;
3544        int len = code - previous;        int len = code - previous;
3545        uschar *bralink = NULL;        uschar *bralink = NULL;
3546    
3547          /* Repeating a DEFINE group is pointless */
3548    
3549          if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3550            {
3551            *errorcodeptr = ERR55;
3552            goto FAILED;
3553            }
3554    
3555        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
3556        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
3557        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 2624  for (;; ptr++) Line 3586  for (;; ptr++)
3586          /* If the maximum is 1 or unlimited, we just have to stick in the          /* If the maximum is 1 or unlimited, we just have to stick in the
3587          BRAZERO and do no more at this point. However, we do need to adjust          BRAZERO and do no more at this point. However, we do need to adjust
3588          any OP_RECURSE calls inside the group that refer to the group itself or          any OP_RECURSE calls inside the group that refer to the group itself or
3589          any internal group, because the offset is from the start of the whole          any internal or forward referenced group, because the offset is from
3590          regex. Temporarily terminate the pattern while doing this. */          the start of the whole regex. Temporarily terminate the pattern while
3591            doing this. */
3592    
3593          if (repeat_max <= 1)          if (repeat_max <= 1)
3594            {            {
3595            *code = OP_END;            *code = OP_END;
3596            adjust_recurse(previous, 1, utf8, cd);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3597            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3598            code++;            code++;
3599            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2648  for (;; ptr++) Line 3611  for (;; ptr++)
3611            {            {
3612            int offset;            int offset;
3613            *code = OP_END;            *code = OP_END;
3614            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3615            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
3616            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
3617            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2668  for (;; ptr++) Line 3631  for (;; ptr++)
3631        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
3632        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
3633        copies that we need. If we set a first char from the group, and didn't        copies that we need. If we set a first char from the group, and didn't
3634        set a required char, copy the latter from the former. */        set a required char, copy the latter from the former. If there are any
3635          forward reference subroutine calls in the group, there will be entries on
3636          the workspace list; replicate these with an appropriate increment. */
3637    
3638        else        else
3639          {          {
3640          if (repeat_min > 1)          if (repeat_min > 1)
3641            {            {
3642            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;            /* In the pre-compile phase, we don't actually do the replication. We
3643            for (i = 1; i < repeat_min; i++)            just adjust the length as if we had. Do some paranoid checks for
3644              potential integer overflow. */
3645    
3646              if (lengthptr != NULL)
3647                {
3648                int delta = (repeat_min - 1)*length_prevgroup;
3649                if ((double)(repeat_min - 1)*(double)length_prevgroup >
3650                                                                (double)INT_MAX ||
3651                    OFLOW_MAX - *lengthptr < delta)
3652                  {
3653                  *errorcodeptr = ERR20;
3654                  goto FAILED;
3655                  }
3656                *lengthptr += delta;
3657                }
3658    
3659              /* This is compiling for real */
3660    
3661              else
3662              {              {
3663              memcpy(code, previous, len);              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3664              code += len;              for (i = 1; i < repeat_min; i++)
3665                  {
3666                  uschar *hc;
3667                  uschar *this_hwm = cd->hwm;
3668                  memcpy(code, previous, len);
3669                  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3670                    {
3671                    PUT(cd->hwm, 0, GET(hc, 0) + len);
3672                    cd->hwm += LINK_SIZE;
3673                    }
3674                  save_hwm = this_hwm;
3675                  code += len;
3676                  }
3677              }              }
3678            }            }
3679    
3680          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
3681          }          }
3682    
# Line 2688  for (;; ptr++) Line 3684  for (;; ptr++)
3684        the maximum is limited, it replicates the group in a nested fashion,        the maximum is limited, it replicates the group in a nested fashion,
3685        remembering the bracket starts on a stack. In the case of a zero minimum,        remembering the bracket starts on a stack. In the case of a zero minimum,
3686        the first one was set up above. In all cases the repeat_max now specifies        the first one was set up above. In all cases the repeat_max now specifies
3687        the number of additional copies needed. */        the number of additional copies needed. Again, we must remember to
3688          replicate entries on the forward reference list. */
3689    
3690        if (repeat_max >= 0)        if (repeat_max >= 0)
3691          {          {
3692          for (i = repeat_max - 1; i >= 0; i--)          /* In the pre-compile phase, we don't actually do the replication. We
3693            just adjust the length as if we had. For each repetition we must add 1
3694            to the length for BRAZERO and for all but the last repetition we must
3695            add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3696            paranoid checks to avoid integer overflow. */
3697    
3698            if (lengthptr != NULL && repeat_max > 0)
3699              {
3700              int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3701                          2 - 2*LINK_SIZE;   /* Last one doesn't nest */
3702              if ((double)repeat_max *
3703                    (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3704                      > (double)INT_MAX ||
3705                  OFLOW_MAX - *lengthptr < delta)
3706                {
3707                *errorcodeptr = ERR20;
3708                goto FAILED;
3709                }
3710              *lengthptr += delta;
3711              }
3712    
3713            /* This is compiling for real */
3714    
3715            else for (i = repeat_max - 1; i >= 0; i--)
3716            {            {
3717              uschar *hc;
3718              uschar *this_hwm = cd->hwm;
3719    
3720            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
3721    
3722            /* All but the final copy start a new nesting, maintaining the            /* All but the final copy start a new nesting, maintaining the
# Line 2709  for (;; ptr++) Line 3732  for (;; ptr++)
3732              }              }
3733    
3734            memcpy(code, previous, len);            memcpy(code, previous, len);
3735              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3736                {
3737                PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3738                cd->hwm += LINK_SIZE;
3739                }
3740              save_hwm = this_hwm;
3741            code += len;            code += len;
3742            }            }
3743    
# Line 2731  for (;; ptr++) Line 3760  for (;; ptr++)
3760        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
3761        can't just offset backwards from the current code point, because we        can't just offset backwards from the current code point, because we
3762        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
3763        correct offset was computed above. */        correct offset was computed above.
3764    
3765          Then, when we are doing the actual compile phase, check to see whether
3766          this group is a non-atomic one that could match an empty string. If so,
3767          convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3768          that runtime checking can be done. [This check is also applied to
3769          atomic groups at runtime, but in a different way.] */
3770    
3771        else code[-ketoffset] = OP_KETRMAX + repeat_type;        else
3772            {
3773            uschar *ketcode = code - ketoffset;
3774            uschar *bracode = ketcode - GET(ketcode, 1);
3775            *ketcode = OP_KETRMAX + repeat_type;
3776            if (lengthptr == NULL && *bracode != OP_ONCE)
3777              {
3778              uschar *scode = bracode;
3779              do
3780                {
3781                if (could_be_empty_branch(scode, ketcode, utf8))
3782                  {
3783                  *bracode += OP_SBRA - OP_BRA;
3784                  break;
3785                  }
3786                scode += GET(scode, 1);
3787                }
3788              while (*scode == OP_ALT);
3789              }
3790            }
3791        }        }
3792    
3793      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 2744  for (;; ptr++) Line 3798  for (;; ptr++)
3798        goto FAILED;        goto FAILED;
3799        }        }
3800    
3801      /* If the character following a repeat is '+', we wrap the entire repeated      /* If the character following a repeat is '+', or if certain optimization
3802      item inside OP_ONCE brackets. This is just syntactic sugar, taken from      tests above succeeded, possessive_quantifier is TRUE. For some of the
3803      Sun's Java package. The repeated item starts at tempcode, not at previous,      simpler opcodes, there is a special alternative opcode for this. For
3804      which might be the first part of a string whose (former) last char we      anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3805      repeated. However, we don't support '+' after a greediness '?'. */      The '+' notation is just syntactic sugar, taken from Sun's Java package,
3806        but the special opcodes can optimize it a bit. The repeated item starts at
3807        tempcode, not at previous, which might be the first part of a string whose
3808        (former) last char we repeated.
3809    
3810        Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3811        an 'upto' may follow. We skip over an 'exact' item, and then test the
3812        length of what remains before proceeding. */
3813    
3814      if (possessive_quantifier)      if (possessive_quantifier)
3815        {        {
3816        int len = code - tempcode;        int len;
3817        memmove(tempcode + 1+LINK_SIZE, tempcode, len);        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3818        code += 1 + LINK_SIZE;            *tempcode == OP_NOTEXACT)
3819        len += 1 + LINK_SIZE;          tempcode += _pcre_OP_lengths[*tempcode];
3820        tempcode[0] = OP_ONCE;        len = code - tempcode;
3821        *code++ = OP_KET;        if (len > 0) switch (*tempcode)
3822        PUTINC(code, 0, len);          {
3823        PUT(tempcode, 1, len);          case OP_STAR:  *tempcode = OP_POSSTAR; break;
3824            case OP_PLUS:  *tempcode = OP_POSPLUS; break;
3825            case OP_QUERY: *tempcode = OP_POSQUERY; break;
3826            case OP_UPTO:  *tempcode = OP_POSUPTO; break;
3827    
3828            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
3829            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
3830            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3831            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
3832    
3833            case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
3834            case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
3835            case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3836            case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
3837    
3838            default:
3839            memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3840            code += 1 + LINK_SIZE;
3841            len += 1 + LINK_SIZE;
3842            tempcode[0] = OP_ONCE;
3843            *code++ = OP_KET;
3844            PUTINC(code, 0, len);
3845            PUT(tempcode, 1, len);
3846            break;
3847            }
3848        }        }
3849    
3850      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 2772  for (;; ptr++) Line 3857  for (;; ptr++)
3857      break;      break;
3858    
3859    
3860      /* Start of nested bracket sub-expression, or comment or lookahead or      /* ===================================================================*/
3861      lookbehind or option setting or condition. First deal with special things      /* Start of nested parenthesized sub-expression, or comment or lookahead or
3862      that can come after a bracket; all are introduced by ?, and the appearance      lookbehind or option setting or condition or all the other extended
3863      of any of them means that this is not a referencing group. They were      parenthesis forms. First deal with the specials; all are introduced by ?,
3864      checked for validity in the first pass over the string, so we don't have to      and the appearance of any of them means that this is not a capturing
3865      check for syntax errors here.  */      group. */
3866    
3867      case '(':      case '(':
3868      newoptions = options;      newoptions = options;
3869      skipbytes = 0;      skipbytes = 0;
3870        bravalue = OP_CBRA;
3871        save_hwm = cd->hwm;
3872        reset_bracount = FALSE;
3873    
3874      if (*(++ptr) == '?')      if (*(++ptr) == '?')
3875        {        {
3876        int set, unset;        int i, set, unset, namelen;
3877        int *optset;        int *optset;
3878          const uschar *name;
3879          uschar *slot;
3880    
3881        switch (*(++ptr))        switch (*(++ptr))
3882          {          {
3883          case '#':                 /* Comment; skip to ket */          case '#':                 /* Comment; skip to ket */
3884          ptr++;          ptr++;
3885          while (*ptr != ')') ptr++;          while (*ptr != 0 && *ptr != ')') ptr++;
3886            if (*ptr == 0)
3887              {
3888              *errorcodeptr = ERR18;
3889              goto FAILED;
3890              }
3891          continue;          continue;
3892    
3893          case ':':                 /* Non-extracting bracket */  
3894            /* ------------------------------------------------------------ */
3895            case '|':                 /* Reset capture count for each branch */
3896            reset_bracount = TRUE;
3897            /* Fall through */
3898    
3899            /* ------------------------------------------------------------ */
3900            case ':':                 /* Non-capturing bracket */
3901          bravalue = OP_BRA;          bravalue = OP_BRA;
3902          ptr++;          ptr++;
3903          break;          break;
3904    
3905    
3906            /* ------------------------------------------------------------ */
3907          case '(':          case '(':
3908          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
3909    
3910          /* Condition to test for recursion */          /* A condition can be an assertion, a number (referring to a numbered
3911            group), a name (referring to a named group), or 'R', referring to
3912            recursion. R<digits> and R&name are also permitted for recursion tests.
3913    
3914            There are several syntaxes for testing a named group: (?(name)) is used
3915            by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3916    
3917            There are two unfortunate ambiguities, caused by history. (a) 'R' can
3918            be the recursive thing or the name 'R' (and similarly for 'R' followed
3919            by digits), and (b) a number could be a name that consists of digits.
3920            In both cases, we look for a name first; if not found, we try the other
3921            cases. */
3922    
3923            /* For conditions that are assertions, check the syntax, and then exit
3924            the switch. This will take control down to where bracketed groups,
3925            including assertions, are processed. */
3926    
3927            if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3928              break;
3929    
3930            /* Most other conditions use OP_CREF (a couple change to OP_RREF
3931            below), and all need to skip 3 bytes at the start of the group. */
3932    
3933            code[1+LINK_SIZE] = OP_CREF;
3934            skipbytes = 3;
3935            refsign = -1;
3936    
3937            /* Check for a test for recursion in a named group. */
3938    
3939          if (ptr[1] == 'R')          if (ptr[1] == 'R' && ptr[2] == '&')
3940            {            {
3941            code[1+LINK_SIZE] = OP_CREF;            terminator = -1;
3942            PUT2(code, 2+LINK_SIZE, CREF_RECURSE);            ptr += 2;
3943            skipbytes = 3;            code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
           ptr += 3;  
3944            }            }
3945    
3946          /* Condition to test for a numbered subpattern match. We know that          /* Check for a test for a named group's having been set, using the Perl
3947          if a digit follows ( then there will just be digits until ) because          syntax (?(<name>) or (?('name') */
         the syntax was checked in the first pass. */  
3948    
3949          else if ((digitab[ptr[1]] && ctype_digit) != 0)          else if (ptr[1] == '<')
3950            {            {
3951            int condref;                 /* Don't amalgamate; some compilers */            terminator = '>';
           condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */  
           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';  
           if (condref == 0)  
             {  
             *errorcodeptr = ERR35;  
             goto FAILED;  
             }  
3952            ptr++;            ptr++;
           code[1+LINK_SIZE] = OP_CREF;  
           PUT2(code, 2+LINK_SIZE, condref);  
           skipbytes = 3;  
3953            }            }
3954          /* For conditions that are assertions, we just fall through, having          else if (ptr[1] == '\'')
3955          set bravalue above. */            {
3956          break;            terminator = '\'';
3957              ptr++;
3958          case '=':                 /* Positive lookahead */            }
3959          bravalue = OP_ASSERT;          else
3960          ptr++;            {
3961          break;            terminator = 0;
3962              if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3963              }
3964    
3965          case '!':                 /* Negative lookahead */          /* We now expect to read a name; anything else is an error */
         bravalue = OP_ASSERT_NOT;  
         ptr++;  
         break;  
3966    
3967          case '<':                 /* Lookbehinds */          if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
         switch (*(++ptr))  
3968            {            {
3969            case '=':               /* Positive lookbehind */            ptr += 1;  /* To get the right offset */
3970            bravalue = OP_ASSERTBACK;            *errorcodeptr = ERR28;
3971            ptr++;            goto FAILED;
3972            break;            }
3973    
3974            case '!':               /* Negative lookbehind */          /* Read the name, but also get it as a number if it's all digits */
3975            bravalue = OP_ASSERTBACK_NOT;  
3976            recno = 0;
3977            name = ++ptr;
3978            while ((cd->ctypes[*ptr] & ctype_word) != 0)
3979              {
3980              if (recno >= 0)
3981                recno = ((digitab[*ptr] & ctype_digit) != 0)?
3982                  recno * 10 + *ptr - '0' : -1;
3983            ptr++;            ptr++;
           break;  
3984            }            }
3985          break;          namelen = ptr - name;
3986    
3987          case '>':                 /* One-time brackets */          if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3988          bravalue = OP_ONCE;            {
3989          ptr++;            ptr--;      /* Error offset */
3990          break;            *errorcodeptr = ERR26;
3991              goto FAILED;
3992              }
3993    
3994          case 'C':                 /* Callout - may be followed by digits; */          /* Do no further checking in the pre-compile phase. */
3995          previous_callout = code;  /* Save for later completion */  
3996          after_manual_callout = 1; /* Skip one item before completing */          if (lengthptr != NULL) break;
3997          *code++ = OP_CALLOUT;     /* Already checked that the terminating */  
3998            {                       /* closing parenthesis is present. */          /* In the real compile we do the work of looking for the actual
3999            int n = 0;          reference. If the string started with "+" or "-" we require the rest to
4000            while ((digitab[*(++ptr)] & ctype_digit) != 0)          be digits, in which case recno will be set. */
4001              n = n * 10 + *ptr - '0';  
4002            if (n > 255)          if (refsign > 0)
4003              {
4004              if (recno <= 0)
4005              {              {
4006              *errorcodeptr = ERR38;              *errorcodeptr = ERR58;
4007              goto FAILED;              goto FAILED;
4008              }              }
4009            *code++ = n;            if (refsign == '-')
4010            PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */              {
4011            PUT(code, LINK_SIZE, 0);                    /* Default length */              recno = cd->bracount - recno + 1;
4012            code += 2 * LINK_SIZE;              if (recno <= 0)
4013                  {
4014                  *errorcodeptr = ERR15;
4015                  goto FAILED;
4016                  }
4017                }
4018              else recno += cd->bracount;
4019              PUT2(code, 2+LINK_SIZE, recno);
4020              break;
4021            }            }
         previous = NULL;  
         continue;  
4022    
4023          case 'P':                 /* Named subpattern handling */          /* Otherwise (did not start with "+" or "-"), start by looking for the
4024          if (*(++ptr) == '<')      /* Definition */          name. */
4025    
4026            slot = cd->name_table;
4027            for (i = 0; i < cd->names_found; i++)
4028            {            {
4029            int i, namelen;            if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4030            uschar *slot = cd->name_table;            slot += cd->name_entry_size;
4031            const uschar *name;     /* Don't amalgamate; some compilers */            }
           name = ++ptr;           /* grumble at autoincrement in declaration */  
4032    
4033            while (*ptr++ != '>');          /* Found a previous named subpattern */
           namelen = ptr - name - 1;  
4034    
4035            for (i = 0; i < cd->names_found; i++)          if (i < cd->names_found)
4036              {            {
4037              int crc = memcmp(name, slot+2, namelen);            recno = GET2(slot, 0);
4038              if (crc == 0)            PUT2(code, 2+LINK_SIZE, recno);
4039                {            }
4040                if (slot[2+namelen] == 0)  
4041                  {          /* Search the pattern for a forward reference */
4042                  *errorcodeptr = ERR43;  
4043                  goto FAILED;          else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4044                  }                          (options & PCRE_EXTENDED) != 0)) > 0)
4045                crc = -1;             /* Current name is substring */            {
4046                }            PUT2(code, 2+LINK_SIZE, i);
4047              if (crc < 0)            }
4048    
4049            /* If terminator == 0 it means that the name followed directly after
4050            the opening parenthesis [e.g. (?(abc)...] and in this case there are
4051            some further alternatives to try. For the cases where terminator != 0
4052            [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4053            now checked all the possibilities, so give an error. */
4054    
4055            else if (terminator != 0)
4056              {
4057              *errorcodeptr = ERR15;
4058              goto FAILED;
4059              }
4060    
4061            /* Check for (?(R) for recursion. Allow digits after R to specify a
4062            specific group number. */
4063    
4064            else if (*name == 'R')
4065              {
4066              recno = 0;
4067              for (i = 1; i < namelen; i++)
4068                {
4069                if ((digitab[name[i]] & ctype_digit) == 0)
4070                {                {
4071                memmove(slot + cd->name_entry_size, slot,                *errorcodeptr = ERR15;
4072                  (cd->names_found - i) * cd->name_entry_size);                goto FAILED;
               break;  
4073                }                }
4074              slot += cd->name_entry_size;              recno = recno * 10 + name[i] - '0';
4075                }
4076              if (recno == 0) recno = RREF_ANY;
4077              code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
4078              PUT2(code, 2+LINK_SIZE, recno);
4079              }
4080    
4081            /* Similarly, check for the (?(DEFINE) "condition", which is always
4082            false. */
4083    
4084            else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4085              {
4086              code[1+LINK_SIZE] = OP_DEF;
4087              skipbytes = 1;
4088              }
4089    
4090            /* Check for the "name" actually being a subpattern number. */
4091    
4092            else if (recno > 0)
4093              {
4094              PUT2(code, 2+LINK_SIZE, recno);
4095              }
4096    
4097            /* Either an unidentified subpattern, or a reference to (?(0) */
4098    
4099            else
4100              {
4101              *errorcodeptr = (recno == 0)? ERR35: ERR15;
4102              goto FAILED;
4103              }
4104            break;
4105    
4106    
4107            /* ------------------------------------------------------------ */
4108            case '=':                 /* Positive lookahead */
4109            bravalue = OP_ASSERT;
4110            ptr++;
4111            break;
4112    
4113    
4114            /* ------------------------------------------------------------ */
4115            case '!':                 /* Negative lookahead */
4116            bravalue = OP_ASSERT_NOT;
4117            ptr++;
4118            break;
4119    
4120    
4121            /* ------------------------------------------------------------ */
4122            case '<':                 /* Lookbehind or named define */
4123            switch (ptr[1])
4124              {
4125              case '=':               /* Positive lookbehind */
4126              bravalue = OP_ASSERTBACK;
4127              ptr += 2;
4128              break;
4129    
4130              case '!':               /* Negative lookbehind */
4131              bravalue = OP_ASSERTBACK_NOT;
4132              ptr += 2;
4133              break;
4134    
4135              default:                /* Could be name define, else bad */
4136              if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4137              ptr++;                  /* Correct offset for error */
4138              *errorcodeptr = ERR24;
4139              goto FAILED;
4140              }
4141            break;
4142    
4143    
4144            /* ------------------------------------------------------------ */
4145            case '>':                 /* One-time brackets */
4146            bravalue = OP_ONCE;
4147            ptr++;
4148            break;
4149    
4150    
4151            /* ------------------------------------------------------------ */
4152            case 'C':                 /* Callout - may be followed by digits; */
4153            previous_callout = code;  /* Save for later completion */
4154            after_manual_callout = 1; /* Skip one item before completing */
4155            *code++ = OP_CALLOUT;
4156              {
4157              int n = 0;
4158              while ((digitab[*(++ptr)] & ctype_digit) != 0)
4159                n = n * 10 + *ptr - '0';
4160              if (*ptr != ')')
4161                {
4162                *errorcodeptr = ERR39;
4163                goto FAILED;
4164                }
4165              if (n > 255)
4166                {
4167                *errorcodeptr = ERR38;
4168                goto FAILED;
4169              }              }
4170              *code++ = n;
4171              PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
4172              PUT(code, LINK_SIZE, 0);                    /* Default length */
4173              code += 2 * LINK_SIZE;
4174              }
4175            previous = NULL;
4176            continue;
4177    
4178    
4179            PUT2(slot, 0, *brackets + 1);          /* ------------------------------------------------------------ */
4180            memcpy(slot + 2, name, namelen);          case 'P':                 /* Python-style named subpattern handling */
4181            slot[2+namelen] = 0;          if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
4182            cd->names_found++;            {
4183            goto NUMBERED_GROUP;            is_recurse = *ptr == '>';
4184              terminator = ')';
4185              goto NAMED_REF_OR_RECURSE;
4186              }
4187            else if (*ptr != '<')    /* Test for Python-style definition */
4188              {
4189              *errorcodeptr = ERR41;
4190              goto FAILED;
4191            }            }
4192            /* Fall through to handle (?P< as (?< is handled */
4193    
4194          if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */  
4195            /* ------------------------------------------------------------ */
4196            DEFINE_NAME:    /* Come here from (?< handling */
4197            case '\'':
4198            {            {
4199            int i, namelen;            terminator = (*ptr == '<')? '>' : '\'';
4200            int type = *ptr++;            name = ++ptr;
           const uschar *name = ptr;  
           uschar *slot = cd->name_table;  
4201    
4202            while (*ptr != ')') ptr++;            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4203            namelen = ptr - name;            namelen = ptr - name;
4204    
4205              /* In the pre-compile phase, just do a syntax check. */
4206    
4207              if (lengthptr != NULL)
4208                {
4209                if (*ptr != terminator)
4210                  {
4211                  *errorcodeptr = ERR42;
4212                  goto FAILED;
4213                  }
4214                if (cd->names_found >= MAX_NAME_COUNT)
4215                  {
4216                  *errorcodeptr = ERR49;
4217                  goto FAILED;
4218                  }
4219                if (namelen + 3 > cd->name_entry_size)
4220                  {
4221                  cd->name_entry_size = namelen + 3;
4222                  if (namelen > MAX_NAME_SIZE)
4223                    {
4224                    *errorcodeptr = ERR48;
4225                    goto FAILED;
4226                    }
4227                  }
4228                }
4229    
4230              /* In the real compile, create the entry in the table */
4231    
4232              else
4233                {
4234                slot = cd->name_table;
4235                for (i = 0; i < cd->names_found; i++)
4236                  {
4237                  int crc = memcmp(name, slot+2, namelen);
4238                  if (crc == 0)
4239                    {
4240                    if (slot[2+namelen] == 0)
4241                      {
4242                      if ((options & PCRE_DUPNAMES) == 0)
4243                        {
4244                        *errorcodeptr = ERR43;
4245                        goto FAILED;
4246                        }
4247                      }
4248                    else crc = -1;      /* Current name is substring */
4249                    }
4250                  if (crc < 0)
4251                    {
4252                    memmove(slot + cd->name_entry_size, slot,
4253                      (cd->names_found - i) * cd->name_entry_size);
4254                    break;
4255                    }
4256                  slot += cd->name_entry_size;
4257                  }
4258    
4259                PUT2(slot, 0, cd->bracount + 1);
4260                memcpy(slot + 2, name, namelen);
4261                slot[2+namelen] = 0;
4262                }
4263              }
4264    
4265            /* In both cases, count the number of names we've encountered. */
4266    
4267            ptr++;                    /* Move past > or ' */
4268            cd->names_found++;
4269            goto NUMBERED_GROUP;
4270    
4271    
4272            /* ------------------------------------------------------------ */
4273            case '&':                 /* Perl recursion/subroutine syntax */
4274            terminator = ')';
4275            is_recurse = TRUE;
4276            /* Fall through */
4277    
4278            /* We come here from the Python syntax above that handles both
4279            references (?P=name) and recursion (?P>name), as well as falling
4280            through from the Perl recursion syntax (?&name). */
4281    
4282            NAMED_REF_OR_RECURSE:
4283            name = ++ptr;
4284            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4285            namelen = ptr - name;
4286    
4287            /* In the pre-compile phase, do a syntax check and set a dummy
4288            reference number. */
4289    
4290            if (lengthptr != NULL)
4291              {
4292              if (*ptr != terminator)
4293                {
4294                *errorcodeptr = ERR42;
4295                goto FAILED;
4296                }
4297              if (namelen > MAX_NAME_SIZE)
4298                {
4299                *errorcodeptr = ERR48;
4300                goto FAILED;
4301                }
4302              recno = 0;
4303              }
4304    
4305            /* In the real compile, seek the name in the table */
4306    
4307            else
4308              {
4309              slot = cd->name_table;
4310            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
4311              {              {
4312              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4313              slot += cd->name_entry_size;              slot += cd->name_entry_size;
4314              }              }
4315            if (i >= cd->names_found)  
4316              if (i < cd->names_found)         /* Back reference */
4317                {
4318                recno = GET2(slot, 0);
4319                }
4320              else if ((recno =                /* Forward back reference */
4321                        find_parens(ptr, cd->bracount, name, namelen,
4322                          (options & PCRE_EXTENDED) != 0)) <= 0)
4323              {              {
4324              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
4325              goto FAILED;              goto FAILED;
4326              }              }
4327              }
4328    
4329            recno = GET2(slot, 0);          /* In both phases, we can now go to the code that handles numerical
4330            recursion or backreferences. */
           if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */  
   
           /* Back reference */  
4331    
4332            previous = code;          if (is_recurse) goto HANDLE_RECURSION;
4333            *code++ = OP_REF;            else goto HANDLE_REFERENCE;
           PUT2INC(code, 0, recno);  
           cd->backref_map |= (recno < 32)? (1 << recno) : 1;  
           if (recno > cd->top_backref) cd->top_backref = recno;  
           continue;  
           }  
4334    
         /* Should never happen */  
         break;  
4335    
4336          case 'R':                 /* Pattern recursion */          /* ------------------------------------------------------------ */
4337            case 'R':                 /* Recursion */
4338          ptr++;                    /* Same as (?0)      */          ptr++;                    /* Same as (?0)      */
4339          /* Fall through */          /* Fall through */
4340    
         /* Recursion or "subroutine" call */  
4341    
4342          case '0': case '1': case '2': case '3': case '4':          /* ------------------------------------------------------------ */
4343          case '5': case '6': case '7': case '8': case '9':          case '-': case '+':
4344            case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4345            case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4346            {            {
4347            const uschar *called;            const uschar *called;
4348    
4349              if ((refsign = *ptr) == '+') ptr++;
4350              else if (refsign == '-')
4351                {
4352                if ((digitab[ptr[1]] & ctype_digit) == 0)
4353                  goto OTHER_CHAR_AFTER_QUERY;
4354                ptr++;
4355                }
4356    
4357            recno = 0;            recno = 0;
4358            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4359              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4360    
4361              if (*ptr != ')')
4362                {
4363                *errorcodeptr = ERR29;
4364                goto FAILED;
4365                }
4366    
4367              if (refsign == '-')
4368                {
4369                if (recno == 0)
4370                  {
4371                  *errorcodeptr = ERR58;
4372                  goto FAILED;
4373                  }
4374                recno = cd->bracount - recno + 1;
4375                if (recno <= 0)
4376                  {
4377                  *errorcodeptr = ERR15;
4378                  goto FAILED;
4379                  }
4380                }
4381              else if (refsign == '+')
4382                {
4383                if (recno == 0)
4384                  {
4385                  *errorcodeptr = ERR58;
4386                  goto FAILED;
4387                  }
4388                recno += cd->bracount;
4389                }
4390    
4391            /* Come here from code above that handles a named recursion */            /* Come here from code above that handles a named recursion */
4392    
4393            HANDLE_RECURSION:            HANDLE_RECURSION:
4394    
4395            previous = code;            previous = code;
4396              called = cd->start_code;
4397    
4398            /* Find the bracket that is being referenced. Temporarily end the            /* When we are actually compiling, find the bracket that is being
4399            regex in case it doesn't exist. */            referenced. Temporarily end the regex in case it doesn't exist before
4400              this point. If we end up with a forward reference, first check that
4401            *code = OP_END;            the bracket does occur later so we can give the error (and position)
4402            called = (recno == 0)?            now. Then remember this forward reference in the workspace so it can
4403              cd->start_code : find_bracket(cd->start_code, utf8, recno);            be filled in at the end. */
4404    
4405            if (called == NULL)            if (lengthptr == NULL)
4406              {              {
4407              *errorcodeptr = ERR15;              *code = OP_END;
4408              goto FAILED;              if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
             }  
4409    
4410            /* If the subpattern is still open, this is a recursive call. We              /* Forward reference */
           check to see if this is a left recursion that could loop for ever,  
           and diagnose that case. */  
4411    
4412            if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))              if (called == NULL)
4413              {                {
4414              *errorcodeptr = ERR40;                if (find_parens(ptr, cd->bracount, NULL, recno,
4415              goto FAILED;                     (options & PCRE_EXTENDED) != 0) < 0)
4416                    {
4417                    *errorcodeptr = ERR15;
4418                    goto FAILED;
4419                    }
4420                  called = cd->start_code + recno;
4421                  PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4422                  }
4423    
4424                /* If not a forward reference, and the subpattern is still open,
4425                this is a recursive call. We check to see if this is a left
4426                recursion that could loop for ever, and diagnose that case. */
4427    
4428                else if (GET(called, 1) == 0 &&
4429                         could_be_empty(called, code, bcptr, utf8))
4430                  {
4431                  *errorcodeptr = ERR40;
4432                  goto FAILED;
4433                  }
4434              }              }
4435    
4436            /* Insert the recursion/subroutine item */            /* Insert the recursion/subroutine item, automatically wrapped inside
4437              "once" brackets. Set up a "previous group" length so that a
4438              subsequent quantifier will work. */
4439    
4440              *code = OP_ONCE;
4441              PUT(code, 1, 2 + 2*LINK_SIZE);
4442              code += 1 + LINK_SIZE;
4443    
4444            *code = OP_RECURSE;            *code = OP_RECURSE;
4445            PUT(code, 1, called - cd->start_code);            PUT(code, 1, called - cd->start_code);
4446            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
4447    
4448              *code = OP_KET;
4449              PUT(code, 1, 2 + 2*LINK_SIZE);
4450              code += 1 + LINK_SIZE;
4451    
4452              length_prevgroup = 3 + 3*LINK_SIZE;
4453            }            }
4454    
4455            /* Can't determine a first byte now */
4456    
4457            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4458          continue;          continue;
4459    
         /* Character after (? not specially recognized */  
4460    
4461          default:                  /* Option setting */          /* ------------------------------------------------------------ */
4462            default:              /* Other characters: check option setting */
4463            OTHER_CHAR_AFTER_QUERY:
4464          set = unset = 0;          set = unset = 0;
4465          optset = &set;          optset = &set;
4466    
# Line 3027  for (;; ptr++) Line 4470  for (;; ptr++)
4470              {              {
4471              case '-': optset = &unset; break;              case '-': optset = &unset; break;
4472    
4473                case 'J':    /* Record that it changed in the external options */
4474                *optset |= PCRE_DUPNAMES;
4475                cd->external_options |= PCRE_JCHANGED;
4476                break;
4477    
4478              case 'i': *optset |= PCRE_CASELESS; break;              case 'i': *optset |= PCRE_CASELESS; break;
4479              case 'm': *optset |= PCRE_MULTILINE; break;              case 'm': *optset |= PCRE_MULTILINE; break;
4480              case 's': *optset |= PCRE_DOTALL; break;              case 's': *optset |= PCRE_DOTALL; break;
4481              case 'x': *optset |= PCRE_EXTENDED; break;              case 'x': *optset |= PCRE_EXTENDED; break;
4482              case 'U': *optset |= PCRE_UNGREEDY; break;              case 'U': *optset |= PCRE_UNGREEDY; break;
4483              case 'X': *optset |= PCRE_EXTRA; break;              case 'X': *optset |= PCRE_EXTRA; break;
4484    
4485                default:  *errorcodeptr = ERR12;
4486                          ptr--;    /* Correct the offset */
4487                          goto FAILED;
4488              }              }
4489            }            }
4490    
# Line 3041  for (;; ptr++) Line 4493  for (;; ptr++)
4493          newoptions = (options | set) & (~unset);          newoptions = (options | set) & (~unset);
4494    
4495          /* If the options ended with ')' this is not the start of a nested          /* If the options ended with ')' this is not the start of a nested
4496          group with option changes, so the options change at this level. Compile          group with option changes, so the options change at this level. If this
4497          code to change the ims options if this setting actually changes any of          item is right at the start of the pattern, the options can be
4498          them. We also pass the new setting back so that it can be put at the          abstracted and made external in the pre-compile phase, and ignored in
4499          start of any following branches, and when this group ends (if we are in          the compile phase. This can be helpful when matching -- for instance in
4500          a group), a resetting item can be compiled.          caseless checking of required bytes.
4501    
4502          Note that if this item is right at the start of the pattern, the          If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4503          options will have been abstracted and made global, so there will be no          definitely *not* at the start of the pattern because something has been
4504          change to compile. */          compiled. In the pre-compile phase, however, the code pointer can have
4505            that value after the start, because it gets reset as code is discarded
4506            during the pre-compile. However, this can happen only at top level - if
4507            we are within parentheses, the starting BRA will still be present. At
4508            any parenthesis level, the length value can be used to test if anything
4509            has been compiled at that level. Thus, a test for both these conditions
4510            is necessary to ensure we correctly detect the start of the pattern in
4511            both phases.
4512    
4513            If we are not at the pattern start, compile code to change the ims
4514            options if this setting actually changes any of them. We also pass the
4515            new setting back so that it can be put at the start of any following
4516            branches, and when this group ends (if we are in a group), a resetting
4517            item can be compiled. */
4518    
4519          if (*ptr == ')')          if (*ptr == ')')
4520            {            {
4521            if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))            if (code == cd->start_code + 1 + LINK_SIZE &&
4522                   (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4523              {              {
4524              *code++ = OP_OPT;              cd->external_options = newoptions;
4525              *code++ = newoptions & PCRE_IMS;              options = newoptions;
4526              }              }
4527             else
4528                {
4529                if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4530                  {
4531                  *code++ = OP_OPT;
4532                  *code++ = newoptions & PCRE_IMS;
4533                  }
4534    
4535            /* Change options at this level, and pass them back for use              /* Change options at this level, and pass them back for use
4536            in subsequent branches. Reset the greedy defaults and the case              in subsequent branches. Reset the greedy defaults and the case
4537            value for firstbyte and reqbyte. */              value for firstbyte and reqbyte. */
4538    
4539            *optionsptr = options = newoptions;              *optionsptr = options = newoptions;
4540            greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4541            greedy_non_default = greedy_default ^ 1;              greedy_non_default = greedy_default ^ 1;
4542            req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;              req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4543                }
4544    
4545            previous = NULL;       /* This item can't be repeated */            previous = NULL;       /* This item can't be repeated */
4546            continue;              /* It is complete */            continue;              /* It is complete */
# Line 3079  for (;; ptr++) Line 4553  for (;; ptr++)
4553    
4554          bravalue = OP_BRA;          bravalue = OP_BRA;
4555          ptr++;          ptr++;
4556          }          }     /* End of switch for character following (? */
4557        }        }       /* End of (? handling */
4558    
4559      /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become      /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4560      non-capturing and behave like (?:...) brackets */      all unadorned brackets become non-capturing and behave like (?:...)
4561        brackets. */
4562    
4563      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4564        {        {
4565        bravalue = OP_BRA;        bravalue = OP_BRA;
4566        }        }
4567    
4568      /* Else we have a referencing group; adjust the opcode. If the bracket      /* Else we have a capturing group. */
     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and  
     arrange for the true number to follow later, in an OP_BRANUMBER item. */  
4569    
4570      else      else
4571        {        {
4572        NUMBERED_GROUP:        NUMBERED_GROUP:
4573        if (++(*brackets) > EXTRACT_BASIC_MAX)        cd->bracount += 1;
4574          {        PUT2(code, 1+LINK_SIZE, cd->bracount);
4575          bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;        skipbytes = 2;
         code[1+LINK_SIZE] = OP_BRANUMBER;  
         PUT2(code, 2+LINK_SIZE, *brackets);  
         skipbytes = 3;  
         }  
       else bravalue = OP_BRA + *brackets;  
4576        }        }
4577    
4578      /* Process nested bracketed re. Assertions may not be repeated, but other      /* Process nested bracketed regex. Assertions may not be repeated, but
4579      kinds can be. We copy code into a non-register variable in order to be able      other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4580      to pass its address because some compilers complain otherwise. Pass in a      non-register variable in order to be able to pass its address because some
4581      new setting for the ims options if they have changed. */      compilers complain otherwise. Pass in a new setting for the ims options if
4582        they have changed. */
4583    
4584      previous = (bravalue >= OP_ONCE)? code : NULL;      previous = (bravalue >= OP_ONCE)? code : NULL;
4585      *code = bravalue;      *code = bravalue;
4586      tempcode = code;      tempcode = code;
4587      tempreqvary = cd->req_varyopt;     /* Save value before bracket */      tempreqvary = cd->req_varyopt;     /* Save value before bracket */
4588        length_prevgroup = 0;              /* Initialize for pre-compile phase */
4589    
4590      if (!compile_regex(      if (!compile_regex(
4591           newoptions,                   /* The complete new option state */           newoptions,                   /* The complete new option state */
4592           options & PCRE_IMS,           /* The previous ims option state */           options & PCRE_IMS,           /* The previous ims option state */
          brackets,                     /* Extracting bracket count */  
4593           &tempcode,                    /* Where to put code (updated) */           &tempcode,                    /* Where to put code (updated) */
4594           &ptr,                         /* Input pointer (updated) */           &ptr,                         /* Input pointer (updated) */
4595           errorcodeptr,                 /* Where to put an error message */           errorcodeptr,                 /* Where to put an error message */
4596           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
4597            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4598           skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */           reset_bracount,               /* True if (?| group */
4599             skipbytes,                    /* Skip over bracket number */
4600           &subfirstbyte,                /* For possible first char */           &subfirstbyte,                /* For possible first char */
4601           &subreqbyte,                  /* For possible last char */           &subreqbyte,                  /* For possible last char */
4602           bcptr,                        /* Current branch chain */           bcptr,                        /* Current branch chain */
4603           cd))                          /* Tables block */           cd,                           /* Tables block */
4604             (lengthptr == NULL)? NULL :   /* Actual compile phase */
4605               &length_prevgroup           /* Pre-compile phase */
4606             ))
4607        goto FAILED;        goto FAILED;
4608    
4609      /* At the end of compiling, code is still pointing to the start of the      /* At the end of compiling, code is still pointing to the start of the
# Line 3139  for (;; ptr++) Line 4612  for (;; ptr++)
4612      is on the bracket. */      is on the bracket. */
4613    
4614      /* If this is a conditional bracket, check that there are no more than      /* If this is a conditional bracket, check that there are no more than
4615      two branches in the group. */      two branches in the group, or just one if it's a DEFINE group. We do this
4616        in the real compile phase, not in the pre-pass, where the whole group may
4617        not be available. */
4618    
4619      else if (bravalue == OP_COND)      if (bravalue == OP_COND && lengthptr == NULL)
4620        {        {
4621        uschar *tc = code;        uschar *tc = code;
4622        condcount = 0;        int condcount = 0;
4623    
4624        do {        do {
4625           condcount++;           condcount++;
# Line 3152  for (;; ptr++) Line 4627  for (;; ptr++)
4627           }           }
4628        while (*tc != OP_KET);        while (*tc != OP_KET);
4629    
4630        if (condcount > 2)        /* A DEFINE group is never obeyed inline (the "condition" is always
4631          false). It must have only one branch. */
4632    
4633          if (code[LINK_SIZE+1] == OP_DEF)
4634          {          {
4635          *errorcodeptr = ERR27;          if (condcount > 1)
4636          goto FAILED;            {
4637              *errorcodeptr = ERR54;
4638              goto FAILED;
4639              }
4640            bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
4641            }
4642    
4643          /* A "normal" conditional group. If there is just one branch, we must not
4644          make use of its firstbyte or reqbyte, because this is equivalent to an
4645          empty second branch. */
4646    
4647          else
4648            {
4649            if (condcount > 2)
4650              {
4651              *errorcodeptr = ERR27;
4652              goto FAILED;
4653              }
4654            if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4655          }          }
4656          }
4657    
4658        /* Error if hit end of pattern */
4659    
4660        if (*ptr != ')')
4661          {
4662          *errorcodeptr = ERR14;
4663          goto FAILED;
4664          }
4665    
4666        /* If there is just one branch, we must not make use of its firstbyte or      /* In the pre-compile phase, update the length by the length of the nested
4667        reqbyte, because this is equivalent to an empty second branch. */      group, less the brackets at either end. Then reduce the compiled code to
4668        just the brackets so that it doesn't use much memory if it is duplicated by
4669        a quantifier. */
4670    
4671        if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;      if (lengthptr != NULL)
4672          {
4673          if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4674            {
4675            *errorcodeptr = ERR20;
4676            goto FAILED;
4677            }
4678          *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4679          code++;
4680          PUTINC(code, 0, 1 + LINK_SIZE);
4681          *code++ = OP_KET;
4682          PUTINC(code, 0, 1 + LINK_SIZE);
4683        }        }
4684    
4685      /* Handle updating of the required and first characters. Update for normal      /* Otherwise update the main code pointer to the end of the group. */
4686      brackets of all kinds, and conditions with two branches (see code above).  
4687      If the bracket is followed by a quantifier with zero repeat, we have to      else code = tempcode;
4688      back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the  
4689      main loop so that they can be accessed for the back off. */      /* For a DEFINE group, required and first character settings are not