/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory | Revision Log | View Patch

revision 85 by nigel, Sat Feb 24 21:41:13 2007 UTC | revision 180 by ph10, Wed Jun 13 10:59:18 2007 UTC
# Line 6  Line 6 
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language.
7    
8                         Written by Philip Hazel                         Written by Philip Hazel
9             Copyright (c) 1997-2005 University of Cambridge             Copyright (c) 1997-2007 University of Cambridge
10    
11  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  supporting internal functions that are not used by other modules. */  supporting internal functions that are not used by other modules. */
43    
44    
45    #define NLBLOCK cd             /* Block containing newline information */
46    #define PSSTART start_pattern  /* Field containing processed string start */
47    #define PSEND   end_pattern    /* Field containing processed string end */
48    
49    
50  #include "pcre_internal.h"  #include "pcre_internal.h"
51    
52    
# Line 53  used by pcretest. DEBUG is not defined w Line 58  used by pcretest. DEBUG is not defined w
58  #endif  #endif
59    
60    
61    /* Macro for setting individual bits in class bitmaps. */
62    
63    #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
64    
65    
66  /*************************************************  /*************************************************
67  *      Code parameters and static tables         *  *      Code parameters and static tables         *
68  *************************************************/  *************************************************/
69    
70  /* Maximum number of items on the nested bracket stacks at compile time. This  /* This value specifies the size of stack workspace that is used during the
71  applies to the nesting of all kinds of parentheses. It does not limit  first pre-compile phase that determines how much memory is required. The regex
72  un-nested, non-capturing parentheses. This number can be made bigger if  is partly compiled into this space, but the compiled parts are discarded as
73  necessary - it is used to dimension one int and one unsigned char vector at  soon as they can be, so that hopefully there will never be an overrun. The code
74  compile time. */  does, however, check for an overrun. The largest amount I've seen used is 218,
75    so this number is very generous.
76    
77    The same workspace is used during the second, actual compile phase for
78    remembering forward references to groups so that they can be filled in at the
79    end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
80    is 4 there is plenty of room. */
81    
82  #define BRASTACK_SIZE 200  #define COMPILE_WORK_SIZE (4096)
83    
84    
85  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
# Line 72  are simple data values; negative values Line 87  are simple data values; negative values
87  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
88  is invalid. */  is invalid. */
89    
90  #if !EBCDIC   /* This is the "normal" table for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
91  static const short int escapes[] = {  static const short int escapes[] = {
92       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */       0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
93       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */       0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
94     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
95       0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */  -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
96  -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */  -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
97  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */  -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
98     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */     '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
99       0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */  -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
100  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */  -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
101       0,      0, -ESC_z                                            /* x - z */       0,      0, -ESC_z                                            /* x - z */
102  };  };
103    
104  #else         /* This is the "abnormal" table for EBCDIC systems */  #else           /* This is the "abnormal" table for EBCDIC systems */
105  static const short int escapes[] = {  static const short int escapes[] = {
106  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',  /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
107  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,  /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
# Line 96  static const short int escapes[] = { Line 111  static const short int escapes[] = {
111  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
112  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',  /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
113  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,  /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
114  /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,  /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
115  /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,  /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
116  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,  /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
117  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,  /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
118  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,  /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
119  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
120  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
121  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
122  /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
123  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,  /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,
124  /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
125  /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
126  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
127  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,  /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
128  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0  /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
# Line 116  static const short int escapes[] = { Line 131  static const short int escapes[] = {
131    
132    
133  /* Tables of names of POSIX character classes and their lengths. The list is  /* Tables of names of POSIX character classes and their lengths. The list is
134  terminated by a zero length entry. The first three must be alpha, upper, lower,  terminated by a zero length entry. The first three must be alpha, lower, upper,
135  as this is assumed for handling case independence. */  as this is assumed for handling case independence. */
136    
137  static const char *const posix_names[] = {  static const char *const posix_names[] = {
# Line 127  static const char *const posix_names[] = Line 142  static const char *const posix_names[] =
142  static const uschar posix_name_lengths[] = {  static const uschar posix_name_lengths[] = {
143    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
144    
145  /* Table of class bit maps for each POSIX class; up to three may be combined  /* Table of class bit maps for each POSIX class. Each class is formed from a
146  to form the class. The table for [:blank:] is dynamically modified to remove  base map, with an optional addition or removal of another map. Then, for some
147  the vertical space characters. */  classes, there is some additional tweaking: for [:blank:] the vertical space
148    characters are removed, and for [:alpha:] and [:alnum:] the underscore
149    character is removed. The triples in the table consist of the base map offset,
150    second map offset or -1 if no second map, and a non-negative value for map
151    addition or a negative value for map subtraction (if there are two maps). The
152    absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
153    remove vertical space characters, 2 => remove underscore. */
154    
155  static const int posix_class_maps[] = {  static const int posix_class_maps[] = {
156    cbit_lower, cbit_upper, -1,             /* alpha */    cbit_word,  cbit_digit, -2,             /* alpha */
157    cbit_lower, -1,         -1,             /* lower */    cbit_lower, -1,          0,             /* lower */
158    cbit_upper, -1,         -1,             /* upper */    cbit_upper, -1,          0,             /* upper */
159    cbit_digit, cbit_lower, cbit_upper,     /* alnum */    cbit_word,  -1,          2,             /* alnum - word without underscore */
160    cbit_print, cbit_cntrl, -1,             /* ascii */    cbit_print, cbit_cntrl,  0,             /* ascii */
161    cbit_space, -1,         -1,             /* blank - a GNU extension */    cbit_space, -1,          1,             /* blank - a GNU extension */
162    cbit_cntrl, -1,         -1,             /* cntrl */    cbit_cntrl, -1,          0,             /* cntrl */
163    cbit_digit, -1,         -1,             /* digit */    cbit_digit, -1,          0,             /* digit */
164    cbit_graph, -1,         -1,             /* graph */    cbit_graph, -1,          0,             /* graph */
165    cbit_print, -1,         -1,             /* print */    cbit_print, -1,          0,             /* print */
166    cbit_punct, -1,         -1,             /* punct */    cbit_punct, -1,          0,             /* punct */
167    cbit_space, -1,         -1,             /* space */    cbit_space, -1,          0,             /* space */
168    cbit_word,  -1,         -1,             /* word - a Perl extension */    cbit_word,  -1,          0,             /* word - a Perl extension */
169    cbit_xdigit,-1,         -1              /* xdigit */    cbit_xdigit,-1,          0              /* xdigit */
170  };  };
171    
172    
173    #define STRING(a)  # a
174    #define XSTRING(s) STRING(s)
175    
176  /* The texts of compile-time error messages. These are "char *" because they  /* The texts of compile-time error messages. These are "char *" because they
177  are passed to the outside world. */  are passed to the outside world. Do not ever re-use any error number, because
178    they are documented. Always add a new error instead. Messages marked DEAD below
179    are no longer used. */
180    
181  static const char *error_texts[] = {  static const char *error_texts[] = {
182    "no error",    "no error",
# Line 165  static const char *error_texts[] = { Line 191  static const char *error_texts[] = {
191    "range out of order in character class",    "range out of order in character class",
192    "nothing to repeat",    "nothing to repeat",
193    /* 10 */    /* 10 */
194    "operand of unlimited repeat could match the empty string",    "operand of unlimited repeat could match the empty string",  /** DEAD **/
195    "internal error: unexpected repeat",    "internal error: unexpected repeat",
196    "unrecognized character after (?",    "unrecognized character after (?",
197    "POSIX named classes are supported only within a class",    "POSIX named classes are supported only within a class",
# Line 175  static const char *error_texts[] = { Line 201  static const char *error_texts[] = {
201    "erroffset passed as NULL",    "erroffset passed as NULL",
202    "unknown option bit(s) set",    "unknown option bit(s) set",
203    "missing ) after comment",    "missing ) after comment",
204    "parentheses nested too deeply",    "parentheses nested too deeply",  /** DEAD **/
205    /* 20 */    /* 20 */
206    "regular expression too large",    "regular expression too large",
207    "failed to get memory",    "failed to get memory",
# Line 184  static const char *error_texts[] = { Line 210  static const char *error_texts[] = {
210    "unrecognized character after (?<",    "unrecognized character after (?<",
211    /* 25 */    /* 25 */
212    "lookbehind assertion is not fixed length",    "lookbehind assertion is not fixed length",
213    "malformed number after (?(",    "malformed number or name after (?(",
214    "conditional group contains more than two branches",    "conditional group contains more than two branches",
215    "assertion expected after (?(",    "assertion expected after (?(",
216    "(?R or (?digits must be followed by )",    "(?R or (?[+-]digits must be followed by )",
217    /* 30 */    /* 30 */
218    "unknown POSIX class name",    "unknown POSIX class name",
219    "POSIX collating elements are not supported",    "POSIX collating elements are not supported",
220    "this version of PCRE is not compiled with PCRE_UTF8 support",    "this version of PCRE is not compiled with PCRE_UTF8 support",
221    "spare error",    "spare error",  /** DEAD **/
222    "character value in \\x{...} sequence is too large",    "character value in \\x{...} sequence is too large",
223    /* 35 */    /* 35 */
224    "invalid condition (?(0)",    "invalid condition (?(0)",
# Line 203  static const char *error_texts[] = { Line 229  static const char *error_texts[] = {
229    /* 40 */    /* 40 */
230    "recursive call could loop indefinitely",    "recursive call could loop indefinitely",
231    "unrecognized character after (?P",    "unrecognized character after (?P",
232    "syntax error after (?P",    "syntax error in subpattern name (missing terminator)",
233    "two named groups have the same name",    "two named subpatterns have the same name",
234    "invalid UTF-8 string",    "invalid UTF-8 string",
235    /* 45 */    /* 45 */
236    "support for \\P, \\p, and \\X has not been compiled",    "support for \\P, \\p, and \\X has not been compiled",
237    "malformed \\P or \\p sequence",    "malformed \\P or \\p sequence",
238    "unknown property name after \\P or \\p"    "unknown property name after \\P or \\p",
239      "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
240      "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
241      /* 50 */
242      "repeated subpattern is too long",
243      "octal value is greater than \\377 (not in UTF-8 mode)",
244      "internal error: overran compiling workspace",
245      "internal error: previously-checked referenced subpattern not found",
246      "DEFINE group contains more than one branch",
247      /* 55 */
248      "repeating a DEFINE group is not allowed",
249      "inconsistent NEWLINE options",
250      "\\g is not followed by a braced name or an optionally braced non-zero number",
251      "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
252  };  };
253    
254    
# Line 229  For convenience, we use the same bit def Line 268  For convenience, we use the same bit def
268    
269  Then we can use ctype_digit and ctype_xdigit in the code. */  Then we can use ctype_digit and ctype_xdigit in the code. */
270    
271  #if !EBCDIC    /* This is the "normal" case, for ASCII systems */  #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
272  static const unsigned char digitab[] =  static const unsigned char digitab[] =
273    {    {
274    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
# Line 265  static const unsigned char digitab[] = Line 304  static const unsigned char digitab[] =
304    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
305    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
306    
307  #else          /* This is the "abnormal" case, for EBCDIC systems */  #else           /* This is the "abnormal" case, for EBCDIC systems */
308  static const unsigned char digitab[] =  static const unsigned char digitab[] =
309    {    {
310    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
# Line 279  static const unsigned char digitab[] = Line 318  static const unsigned char digitab[] =
318    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
319    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
320    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
321    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88-     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
322    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
323    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
324    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
# Line 313  static const unsigned char ebcdic_charta Line 352  static const unsigned char ebcdic_charta
352    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
353    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */    0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
354    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
355    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88-  */    0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
356    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
357    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */    0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
358    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
# Line 340  static const unsigned char ebcdic_charta Line 379  static const unsigned char ebcdic_charta
379  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
380    
381  static BOOL  static BOOL
382    compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
383      int *, int *, branch_chain *, compile_data *);      int *, int *, branch_chain *, compile_data *, int *);
384    
385    
386    
# Line 351  static BOOL Line 390  static BOOL
390    
391  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
392  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
393  encodes one of the more complicated things such as \d. When UTF-8 is enabled,  encodes one of the more complicated things such as \d. A backreference to group
394  a positive value greater than 255 may be returned. On entry, ptr is pointing at  n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
395  the \. On exit, it is on the final character of the escape sequence.  UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
396    ptr is pointing at the \. On exit, it is on the final character of the escape
397    sequence.
398    
399  Arguments:  Arguments:
400    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
# Line 371  static int Line 412  static int
412  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,  check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
413    int options, BOOL isclass)    int options, BOOL isclass)
414  {  {
415  const uschar *ptr = *ptrptr;  BOOL utf8 = (options & PCRE_UTF8) != 0;
416    const uschar *ptr = *ptrptr + 1;
417  int c, i;  int c, i;
418    
419    GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
420    ptr--;                            /* Set pointer back to the last byte */
421    
422  /* If backslash is at the end of the pattern, it's an error. */  /* If backslash is at the end of the pattern, it's an error. */
423    
 c = *(++ptr);  
424  if (c == 0) *errorcodeptr = ERR1;  if (c == 0) *errorcodeptr = ERR1;
425    
426  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in  /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
427  a table. A non-zero result is something that can be returned immediately.  a table. A non-zero result is something that can be returned immediately.
428  Otherwise further processing may be required. */  Otherwise further processing may be required. */
429    
430  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
431  else if (c < '0' || c > 'z') {}                           /* Not alphameric */  else if (c < '0' || c > 'z') {}                           /* Not alphameric */
432  else if ((i = escapes[c - '0']) != 0) c = i;  else if ((i = escapes[c - '0']) != 0) c = i;
433    
434  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
435  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */  else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
436  else if ((i = escapes[c - 0x48]) != 0)  c = i;  else if ((i = escapes[c - 0x48]) != 0)  c = i;
437  #endif  #endif
# Line 397  else if ((i = escapes[c - 0x48]) != 0) Line 441  else if ((i = escapes[c - 0x48]) != 0)
441  else  else
442    {    {
443    const uschar *oldptr;    const uschar *oldptr;
444      BOOL braced, negated;
445    
446    switch (c)    switch (c)
447      {      {
448      /* A number of Perl escapes are not handled by PCRE. We give an explicit      /* A number of Perl escapes are not handled by PCRE. We give an explicit
# Line 410  else Line 456  else
456      *errorcodeptr = ERR37;      *errorcodeptr = ERR37;
457      break;      break;
458    
459        /* \g must be followed by a number, either plain or braced. If positive, it
460        is an absolute backreference. If negative, it is a relative backreference.
461        This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
462        reference to a named group. This is part of Perl's movement towards a
463        unified syntax for back references. As this is synonymous with \k{name}, we
464        fudge it up by pretending it really was \k. */
465    
466        case 'g':
467        if (ptr[1] == '{')
468          {
469          const uschar *p;
470          for (p = ptr+2; *p != 0 && *p != '}'; p++)
471            if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
472          if (*p != 0 && *p != '}')
473            {
474            c = -ESC_k;
475            break;
476            }
477          braced = TRUE;
478          ptr++;
479          }
480        else braced = FALSE;
481    
482        if (ptr[1] == '-')
483          {
484          negated = TRUE;
485          ptr++;
486          }
487        else negated = FALSE;
488    
489        c = 0;
490        while ((digitab[ptr[1]] & ctype_digit) != 0)
491          c = c * 10 + *(++ptr) - '0';
492    
493        if (c == 0 || (braced && *(++ptr) != '}'))
494          {
495          *errorcodeptr = ERR57;
496          return 0;
497          }
498    
499        if (negated)
500          {
501          if (c > bracount)
502            {
503            *errorcodeptr = ERR15;
504            return 0;
505            }
506          c = bracount - (c - 1);
507          }
508    
509        c = -(ESC_REF + c);
510        break;
511    
512      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
513      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
514      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 451  else Line 550  else
550        }        }
551    
552      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
553      larger first octal digit. */      larger first octal digit. The original code used just to take the least
554        significant 8 bits of octal numbers (I think this is what early Perls used
555        to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
556        than 3 octal digits. */
557    
558      case '0':      case '0':
559      c -= '0';      c -= '0';
560      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')      while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
561          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
562      c &= 255;     /* Take least significant 8 bits */      if (!utf8 && c > 255) *errorcodeptr = ERR51;
563      break;      break;
564    
565      /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number      /* \x is complicated. \x{ddd} is a character number which can be greater
566      which can be greater than 0xff, but only if the ddd are hex digits. */      than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
567        treated as a data character. */
568    
569      case 'x':      case 'x':
570  #ifdef SUPPORT_UTF8      if (ptr[1] == '{')
     if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)  
571        {        {
572        const uschar *pt = ptr + 2;        const uschar *pt = ptr + 2;
573        register int count = 0;        int count = 0;
574    
575        c = 0;        c = 0;
576        while ((digitab[*pt] & ctype_xdigit) != 0)        while ((digitab[*pt] & ctype_xdigit) != 0)
577          {          {
578          int cc = *pt++;          register int cc = *pt++;
579            if (c == 0 && cc == '0') continue;     /* Leading zeroes */
580          count++;          count++;
581  #if !EBCDIC    /* ASCII coding */  
582    #ifndef EBCDIC  /* ASCII coding */
583          if (cc >= 'a') cc -= 32;               /* Convert to upper case */          if (cc >= 'a') cc -= 32;               /* Convert to upper case */
584          c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
585  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
586          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */          if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
587          c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));          c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
588  #endif  #endif
589          }          }
590    
591        if (*pt == '}')        if (*pt == '}')
592          {          {
593          if (c < 0 || count > 8) *errorcodeptr = ERR34;          if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
594          ptr = pt;          ptr = pt;
595          break;          break;
596          }          }
597    
598        /* If the sequence of hex digits does not end with '}', then we don't        /* If the sequence of hex digits does not end with '}', then we don't
599        recognize this construct; fall through to the normal \x handling. */        recognize this construct; fall through to the normal \x handling. */
600        }        }
 #endif  
601    
602      /* Read just a single hex char */      /* Read just a single-byte hex-defined char */
603    
604      c = 0;      c = 0;
605      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
606        {        {
607        int cc;                               /* Some compilers don't like ++ */        int cc;                               /* Some compilers don't like ++ */
608        cc = *(++ptr);                        /* in initializers */        cc = *(++ptr);                        /* in initializers */
609  #if !EBCDIC    /* ASCII coding */  #ifndef EBCDIC  /* ASCII coding */
610        if (cc >= 'a') cc -= 32;              /* Convert to upper case */        if (cc >= 'a') cc -= 32;              /* Convert to upper case */
611        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
612  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
613        if (cc <= 'z') cc += 64;              /* Convert to upper case */        if (cc <= 'z') cc += 64;              /* Convert to upper case */
614        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
615  #endif  #endif
616        }        }
617      break;      break;
618    
619      /* Other special escapes not starting with a digit are straightforward */      /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
620        This coding is ASCII-specific, but then the whole concept of \cx is
621        ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
622    
623      case 'c':      case 'c':
624      c = *(++ptr);      c = *(++ptr);
# Line 520  else Line 628  else
628        return 0;        return 0;
629        }        }
630    
631      /* A letter is upper-cased; then the 0x40 bit is flipped. This coding  #ifndef EBCDIC  /* ASCII coding */
     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.  
     (However, an EBCDIC equivalent has now been added.) */  
   
 #if !EBCDIC    /* ASCII coding */  
632      if (c >= 'a' && c <= 'z') c -= 32;      if (c >= 'a' && c <= 'z') c -= 32;
633      c ^= 0x40;      c ^= 0x40;
634  #else          /* EBCDIC coding */  #else           /* EBCDIC coding */
635      if (c >= 'a' && c <= 'z') c += 64;      if (c >= 'a' && c <= 'z') c += 64;
636      c ^= 0xC0;      c ^= 0xC0;
637  #endif  #endif
# Line 569  escape sequence. Line 673  escape sequence.
673  Argument:  Argument:
674    ptrptr         points to the pattern position pointer    ptrptr         points to the pattern position pointer
675    negptr         points to a boolean that is set TRUE for negation else FALSE    negptr         points to a boolean that is set TRUE for negation else FALSE
676      dptr           points to an int that is set to the detailed property value
677    errorcodeptr   points to the error code variable    errorcodeptr   points to the error code variable
678    
679  Returns:     value from ucp_type_table, or -1 for an invalid type  Returns:         type value from ucp_type_table, or -1 for an invalid type
680  */  */
681    
682  static int  static int
683  get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)  get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
684  {  {
685  int c, i, bot, top;  int c, i, bot, top;
686  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
687  char name[4];  char name[32];
688    
689  c = *(++ptr);  c = *(++ptr);
690  if (c == 0) goto ERROR_RETURN;  if (c == 0) goto ERROR_RETURN;
691    
692  *negptr = FALSE;  *negptr = FALSE;
693    
694  /* \P or \p can be followed by a one- or two-character name in {}, optionally  /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
695  preceded by ^ for negation. */  negation. */
696    
697  if (c == '{')  if (c == '{')
698    {    {
# Line 596  if (c == '{') Line 701  if (c == '{')
701      *negptr = TRUE;      *negptr = TRUE;
702      ptr++;      ptr++;
703      }      }
704    for (i = 0; i <= 2; i++)    for (i = 0; i < sizeof(name) - 1; i++)
705      {      {
706      c = *(++ptr);      c = *(++ptr);
707      if (c == 0) goto ERROR_RETURN;      if (c == 0) goto ERROR_RETURN;
708      if (c == '}') break;      if (c == '}') break;
709      name[i] = c;      name[i] = c;
710      }      }
711    if (c !='}')   /* Try to distinguish error cases */    if (c !='}') goto ERROR_RETURN;
     {  
     while (*(++ptr) != 0 && *ptr != '}');  
     if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;  
     }  
712    name[i] = 0;    name[i] = 0;
713    }    }
714    
# Line 628  top = _pcre_utt_size; Line 729  top = _pcre_utt_size;
729    
730  while (bot < top)  while (bot < top)
731    {    {
732    i = (bot + top)/2;    i = (bot + top) >> 1;
733    c = strcmp(name, _pcre_utt[i].name);    c = strcmp(name, _pcre_utt[i].name);
734    if (c == 0) return _pcre_utt[i].value;    if (c == 0)
735        {
736        *dptr = _pcre_utt[i].value;
737        return _pcre_utt[i].type;
738        }
739    if (c > 0) bot = i + 1; else top = i;    if (c > 0) bot = i + 1; else top = i;
740    }    }
741    
 UNKNOWN_RETURN:  
742  *errorcodeptr = ERR47;  *errorcodeptr = ERR47;
743  *ptrptr = ptr;  *ptrptr = ptr;
744  return -1;  return -1;
# Line 750  return p; Line 854  return p;
854    
855    
856  /*************************************************  /*************************************************
857    *       Find forward referenced subpattern       *
858    *************************************************/
859    
860    /* This function scans along a pattern's text looking for capturing
861    subpatterns, and counting them. If it finds a named pattern that matches the
862    name it is given, it returns its number. Alternatively, if the name is NULL, it
863    returns when it reaches a given numbered subpattern. This is used for forward
864    references to subpatterns. We know that if (?P< is encountered, the name will
865    be terminated by '>' because that is checked in the first pass.
866    
867    Arguments:
868      ptr          current position in the pattern
869      count        current count of capturing parens so far encountered
870      name         name to seek, or NULL if seeking a numbered subpattern
871      lorn         name length, or subpattern number if name is NULL
872      xmode        TRUE if we are in /x mode
873    
874    Returns:       the number of the named or numbered subpattern, or -1 if not found
875    */
876    
877    static int
878    find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
879      BOOL xmode)
880    {
881    const uschar *thisname;
882    
883    for (; *ptr != 0; ptr++)
884      {
885      int term;
886    
887      /* Skip over backslashed characters and also entire \Q...\E */
888    
889      if (*ptr == '\\')
890        {
891        if (*(++ptr) == 0) return -1;
892        if (*ptr == 'Q') for (;;)
893          {
894          while (*(++ptr) != 0 && *ptr != '\\');
895          if (*ptr == 0) return -1;
896          if (*(++ptr) == 'E') break;
897          }
898        continue;
899        }
900    
901      /* Skip over character classes */
902    
903      if (*ptr == '[')
904        {
905        while (*(++ptr) != ']')
906          {
907          if (*ptr == '\\')
908            {
909            if (*(++ptr) == 0) return -1;
910            if (*ptr == 'Q') for (;;)
911              {
912              while (*(++ptr) != 0 && *ptr != '\\');
913              if (*ptr == 0) return -1;
914              if (*(++ptr) == 'E') break;
915              }
916            continue;
917            }
918          }
919        continue;
920        }
921    
922      /* Skip comments in /x mode */
923    
924      if (xmode && *ptr == '#')
925        {
926        while (*(++ptr) != 0 && *ptr != '\n');
927        if (*ptr == 0) return -1;
928        continue;
929        }
930    
931      /* An opening parens must now be a real metacharacter */
932    
933      if (*ptr != '(') continue;
934      if (ptr[1] != '?')
935        {
936        count++;
937        if (name == NULL && count == lorn) return count;
938        continue;
939        }
940    
941      ptr += 2;
942      if (*ptr == 'P') ptr++;                      /* Allow optional P */
943    
944      /* We have to disambiguate (?<! and (?<= from (?<name> */
945    
946      if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
947           *ptr != '\'')
948        continue;
949    
950      count++;
951    
952      if (name == NULL && count == lorn) return count;
953      term = *ptr++;
954      if (term == '<') term = '>';
955      thisname = ptr;
956      while (*ptr != term) ptr++;
957      if (name != NULL && lorn == ptr - thisname &&
958          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
959        return count;
960      }
961    
962    return -1;
963    }
964    
965    
966    
967    /*************************************************
968  *      Find first significant op code            *  *      Find first significant op code            *
969  *************************************************/  *************************************************/
970    
# Line 798  for (;;) Line 1013  for (;;)
1013    
1014      case OP_CALLOUT:      case OP_CALLOUT:
1015      case OP_CREF:      case OP_CREF:
1016      case OP_BRANUMBER:      case OP_RREF:
1017        case OP_DEF:
1018      code += _pcre_OP_lengths[*code];      code += _pcre_OP_lengths[*code];
1019      break;      break;
1020    
# Line 843  for (;;) Line 1059  for (;;)
1059    {    {
1060    int d;    int d;
1061    register int op = *cc;    register int op = *cc;
   if (op >= OP_BRA) op = OP_BRA;  
1062    
1063    switch (op)    switch (op)
1064      {      {
1065        case OP_CBRA:
1066      case OP_BRA:      case OP_BRA:
1067      case OP_ONCE:      case OP_ONCE:
1068      case OP_COND:      case OP_COND:
1069      d = find_fixedlength(cc, options);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1070      if (d < 0) return d;      if (d < 0) return d;
1071      branchlength += d;      branchlength += d;
1072      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 885  for (;;) Line 1101  for (;;)
1101      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1102    
1103      case OP_REVERSE:      case OP_REVERSE:
     case OP_BRANUMBER:  
1104      case OP_CREF:      case OP_CREF:
1105        case OP_RREF:
1106        case OP_DEF:
1107      case OP_OPT:      case OP_OPT:
1108      case OP_CALLOUT:      case OP_CALLOUT:
1109      case OP_SOD:      case OP_SOD:
# Line 904  for (;;) Line 1121  for (;;)
1121    
1122      case OP_CHAR:      case OP_CHAR:
1123      case OP_CHARNC:      case OP_CHARNC:
1124        case OP_NOT:
1125      branchlength++;      branchlength++;
1126      cc += 2;      cc += 2;
1127  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 937  for (;;) Line 1155  for (;;)
1155    
1156      case OP_PROP:      case OP_PROP:
1157      case OP_NOTPROP:      case OP_NOTPROP:
1158      cc++;      cc += 2;
1159      /* Fall through */      /* Fall through */
1160    
1161      case OP_NOT_DIGIT:      case OP_NOT_DIGIT:
# Line 1018  Returns:      pointer to the opcode for Line 1236  Returns:      pointer to the opcode for
1236  static const uschar *  static const uschar *
1237  find_bracket(const uschar *code, BOOL utf8, int number)  find_bracket(const uschar *code, BOOL utf8, int number)
1238  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1239  for (;;)  for (;;)
1240    {    {
1241    register int c = *code;    register int c = *code;
1242    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1243    else if (c > OP_BRA)  
1244      /* XCLASS is used for classes that cannot be represented just by a bit
1245      map. This includes negated single high-valued characters. The length in
1246      the table is zero; the actual length is stored in the compiled code. */
1247    
1248      if (c == OP_XCLASS) code += GET(code, 1);
1249    
1250      /* Handle capturing bracket */
1251    
1252      else if (c == OP_CBRA)
1253      {      {
1254      int n = c - OP_BRA;      int n = GET2(code, 1+LINK_SIZE);
     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);  
1255      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
1256      code += _pcre_OP_lengths[OP_BRA];      code += _pcre_OP_lengths[c];
1257      }      }
1258    
1259      /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1260      a multi-byte character. The length in the table is a minimum, so we have to
1261      arrange to skip the extra bytes. */
1262    
1263    else    else
1264      {      {
1265      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
1266  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1267      if (utf8) switch(c)      if (utf8) switch(c)
1268        {        {
1269        case OP_CHAR:        case OP_CHAR:
# Line 1051  for (;;) Line 1271  for (;;)
1271        case OP_EXACT:        case OP_EXACT:
1272        case OP_UPTO:        case OP_UPTO:
1273        case OP_MINUPTO:        case OP_MINUPTO:
1274          case OP_POSUPTO:
1275        case OP_STAR:        case OP_STAR:
1276        case OP_MINSTAR:        case OP_MINSTAR:
1277          case OP_POSSTAR:
1278        case OP_PLUS:        case OP_PLUS:
1279        case OP_MINPLUS:        case OP_MINPLUS:
1280          case OP_POSPLUS:
1281        case OP_QUERY:        case OP_QUERY:
1282        case OP_MINQUERY:        case OP_MINQUERY:
1283        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1284        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1285        break;        break;
1286        }        }
1287  #endif  #endif
# Line 1092  Returns:      pointer to the opcode for Line 1308  Returns:      pointer to the opcode for
1308  static const uschar *  static const uschar *
1309  find_recurse(const uschar *code, BOOL utf8)  find_recurse(const uschar *code, BOOL utf8)
1310  {  {
 #ifndef SUPPORT_UTF8  
 utf8 = utf8;               /* Stop pedantic compilers complaining */  
 #endif  
   
1311  for (;;)  for (;;)
1312    {    {
1313    register int c = *code;    register int c = *code;
1314    if (c == OP_END) return NULL;    if (c == OP_END) return NULL;
1315    else if (c == OP_RECURSE) return code;    if (c == OP_RECURSE) return code;
1316    else if (c > OP_BRA)  
1317      {    /* XCLASS is used for classes that cannot be represented just by a bit
1318      code += _pcre_OP_lengths[OP_BRA];    map. This includes negated single high-valued characters. The length in
1319      }    the table is zero; the actual length is stored in the compiled code. */
1320    
1321      if (c == OP_XCLASS) code += GET(code, 1);
1322    
1323      /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1324      that are followed by a character may be followed by a multi-byte character.
1325      The length in the table is a minimum, so we have to arrange to skip the extra
1326      bytes. */
1327    
1328    else    else
1329      {      {
1330      code += _pcre_OP_lengths[c];      code += _pcre_OP_lengths[c];
   
1331  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
   
     /* In UTF-8 mode, opcodes that are followed by a character may be followed  
     by a multi-byte character. The length in the table is a minimum, so we have  
     to scan along to skip the extra bytes. All opcodes are less than 128, so we  
     can use relatively efficient code. */  
   
1332      if (utf8) switch(c)      if (utf8) switch(c)
1333        {        {
1334        case OP_CHAR:        case OP_CHAR:
# Line 1123  for (;;) Line 1336  for (;;)
1336        case OP_EXACT:        case OP_EXACT:
1337        case OP_UPTO:        case OP_UPTO:
1338        case OP_MINUPTO:        case OP_MINUPTO:
1339          case OP_POSUPTO:
1340        case OP_STAR:        case OP_STAR:
1341        case OP_MINSTAR:        case OP_MINSTAR:
1342          case OP_POSSTAR:
1343        case OP_PLUS:        case OP_PLUS:
1344        case OP_MINPLUS:        case OP_MINPLUS:
1345          case OP_POSPLUS:
1346        case OP_QUERY:        case OP_QUERY:
1347        case OP_MINQUERY:        case OP_MINQUERY:
1348        while ((*code & 0xc0) == 0x80) code++;        case OP_POSQUERY:
1349        break;        if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
   
       /* XCLASS is used for classes that cannot be represented just by a bit  
       map. This includes negated single high-valued characters. The length in  
       the table is zero; the actual length is stored in the compiled code. */  
   
       case OP_XCLASS:  
       code += GET(code, 1) + 1;  
1350        break;        break;
1351        }        }
1352  #endif  #endif
# Line 1152  for (;;) Line 1361  for (;;)
1361  *************************************************/  *************************************************/
1362    
1363  /* This function scans through a branch of a compiled pattern to see whether it  /* This function scans through a branch of a compiled pattern to see whether it
1364  can match the empty string or not. It is called only from could_be_empty()  can match the empty string or not. It is called from could_be_empty()
1365  below. Note that first_significant_code() skips over assertions. If we hit an  below and from compile_branch() when checking for an unlimited repeat of a
1366  unclosed bracket, we return "empty" - this means we've struck an inner bracket  group that can match nothing. Note that first_significant_code() skips over
1367  whose current branch will already have been scanned.  assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1368    struck an inner bracket whose current branch will already have been scanned.
1369    
1370  Arguments:  Arguments:
1371    code        points to start of search    code        points to start of search
# Line 1169  static BOOL Line 1379  static BOOL
1379  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)  could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1380  {  {
1381  register int c;  register int c;
1382  for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1383       code < endcode;       code < endcode;
1384       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1385    {    {
# Line 1177  for (code = first_significant_code(code Line 1387  for (code = first_significant_code(code
1387    
1388    c = *code;    c = *code;
1389    
1390    if (c >= OP_BRA)    /* Groups with zero repeats can of course be empty; skip them. */
1391    
1392      if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1393        {
1394        code += _pcre_OP_lengths[c];
1395        do code += GET(code, 1); while (*code == OP_ALT);
1396        c = *code;
1397        continue;
1398        }
1399    
1400      /* For other groups, scan the branches. */
1401    
1402      if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1403      {      {
1404      BOOL empty_branch;      BOOL empty_branch;
1405      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 1193  for (code = first_significant_code(code Line 1415  for (code = first_significant_code(code
1415        }        }
1416      while (*code == OP_ALT);      while (*code == OP_ALT);
1417      if (!empty_branch) return FALSE;   /* All branches are non-empty */      if (!empty_branch) return FALSE;   /* All branches are non-empty */
     code += 1 + LINK_SIZE;  
1418      c = *code;      c = *code;
1419        continue;
1420      }      }
1421    
1422    else switch (c)    /* Handle the other opcodes */
1423    
1424      switch (c)
1425      {      {
1426      /* Check for quantifiers after a class */      /* Check for quantifiers after a class */
1427    
# Line 1253  for (code = first_significant_code(code Line 1477  for (code = first_significant_code(code
1477      case OP_NOT:      case OP_NOT:
1478      case OP_PLUS:      case OP_PLUS:
1479      case OP_MINPLUS:      case OP_MINPLUS:
1480        case OP_POSPLUS:
1481      case OP_EXACT:      case OP_EXACT:
1482      case OP_NOTPLUS:      case OP_NOTPLUS:
1483      case OP_NOTMINPLUS:      case OP_NOTMINPLUS:
1484        case OP_NOTPOSPLUS:
1485      case OP_NOTEXACT:      case OP_NOTEXACT:
1486      case OP_TYPEPLUS:      case OP_TYPEPLUS:
1487      case OP_TYPEMINPLUS:      case OP_TYPEMINPLUS:
1488        case OP_TYPEPOSPLUS:
1489      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1490      return FALSE;      return FALSE;
1491    
# Line 1270  for (code = first_significant_code(code Line 1497  for (code = first_significant_code(code
1497      case OP_ALT:      case OP_ALT:
1498      return TRUE;      return TRUE;
1499    
1500      /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be      /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1501      followed by a multibyte character */      MINUPTO, and POSUPTO may be followed by a multibyte character */
1502    
1503  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1504      case OP_STAR:      case OP_STAR:
1505      case OP_MINSTAR:      case OP_MINSTAR:
1506        case OP_POSSTAR:
1507      case OP_QUERY:      case OP_QUERY:
1508      case OP_MINQUERY:      case OP_MINQUERY:
1509        case OP_POSQUERY:
1510      case OP_UPTO:      case OP_UPTO:
1511      case OP_MINUPTO:      case OP_MINUPTO:
1512        case OP_POSUPTO:
1513      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;      if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1514      break;      break;
1515  #endif  #endif
# Line 1397  earlier groups that are outside the curr Line 1627  earlier groups that are outside the curr
1627  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before  optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1628  it, after it has been compiled. This means that any OP_RECURSE items within it  it, after it has been compiled. This means that any OP_RECURSE items within it
1629  that refer to the group itself or any contained groups have to have their  that refer to the group itself or any contained groups have to have their
1630  offsets adjusted. That is the job of this function. Before it is called, the  offsets adjusted. That is one of the jobs of this function. Before it is called,
1631  partially compiled regex must be temporarily terminated with OP_END.  the partially compiled regex must be temporarily terminated with OP_END.
1632    
1633    This function has been extended with the possibility of forward references for
1634    recursions and subroutine calls. It must also check the list of such references
1635    for the group we are dealing with. If it finds that one of the recursions in
1636    the current group is on this list, it adjusts the offset in the list, not the
1637    value in the reference (which is a group number).
1638    
1639  Arguments:  Arguments:
1640    group      points to the start of the group    group      points to the start of the group
1641    adjust     the amount by which the group is to be moved    adjust     the amount by which the group is to be moved
1642    utf8       TRUE in UTF-8 mode    utf8       TRUE in UTF-8 mode
1643    cd         contains pointers to tables etc.    cd         contains pointers to tables etc.
1644      save_hwm   the hwm forward reference pointer at the start of the group
1645    
1646  Returns:     nothing  Returns:     nothing
1647  */  */
1648    
1649  static void  static void
1650  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)  adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1651      uschar *save_hwm)
1652  {  {
1653  uschar *ptr = group;  uschar *ptr = group;
1654  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1655    {    {
1656    int offset = GET(ptr, 1);    int offset;
1657    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);    uschar *hc;
1658    
1659      /* See if this recursion is on the forward reference list. If so, adjust the
1660      reference. */
1661    
1662      for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1663        {
1664        offset = GET(hc, 0);
1665        if (cd->start_code + offset == ptr + 1)
1666          {
1667          PUT(hc, 0, offset + adjust);
1668          break;
1669          }
1670        }
1671    
1672      /* Otherwise, adjust the recursion offset if it's after the start of this
1673      group. */
1674    
1675      if (hc >= cd->hwm)
1676        {
1677        offset = GET(ptr, 1);
1678        if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1679        }
1680    
1681    ptr += 1 + LINK_SIZE;    ptr += 1 + LINK_SIZE;
1682    }    }
1683  }  }
# Line 1495  Yield:        TRUE when range returned; Line 1756  Yield:        TRUE when range returned;
1756  */  */
1757    
1758  static BOOL  static BOOL
1759  get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)  get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1760      unsigned int *odptr)
1761  {  {
1762  int c, chartype, othercase, next;  unsigned int c, othercase, next;
1763    
1764  for (c = *cptr; c <= d; c++)  for (c = *cptr; c <= d; c++)
1765    {    { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
   if (_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0)  
     break;  
   }  
1766    
1767  if (c > d) return FALSE;  if (c > d) return FALSE;
1768    
# Line 1512  next = othercase + 1; Line 1771  next = othercase + 1;
1771    
1772  for (++c; c <= d; c++)  for (++c; c <= d; c++)
1773    {    {
1774    if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_L ||    if (_pcre_ucp_othercase(c) != next) break;
         othercase != next)  
     break;  
1775    next++;    next++;
1776    }    }
1777    
# Line 1526  return TRUE; Line 1783  return TRUE;
1783  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
1784    
1785    
1786    
1787  /*************************************************  /*************************************************
1788  *           Compile one branch                   *  *     Check if auto-possessifying is possible    *
1789  *************************************************/  *************************************************/
1790    
1791  /* Scan the pattern, compiling it into the code vector. If the options are  /* This function is called for unlimited repeats of certain items, to see
1792  changed during the branch, the pointer is used to change the external options  whether the next thing could possibly match the repeated item. If not, it makes
1793  bits.  sense to automatically possessify the repeated item.
1794    
1795  Arguments:  Arguments:
1796    optionsptr     pointer to the option bits    op_code       the repeated op code
1797    brackets       points to number of extracting brackets used    item          data for this item, depends on the opcode
1798    codeptr        points to the pointer to the current code point    utf8          TRUE in UTF-8 mode
1799    ptrptr         points to the current pattern pointer    utf8_char     used for utf8 character bytes, NULL if not relevant
1800    errorcodeptr   points to error code variable    ptr           next character in pattern
1801    firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)    options       options bits
1802    reqbyteptr     set to the last literal character required, else < 0    cd            contains pointers to tables etc.
   bcptr          points to current branch chain  
   cd             contains pointers to tables etc.  
1803    
1804  Returns:         TRUE on success  Returns:        TRUE if possessifying is wanted
                  FALSE, with *errorcodeptr set non-zero on error  
1805  */  */
1806    
1807  static BOOL  static BOOL
1808  compile_branch(int *optionsptr, int *brackets, uschar **codeptr,  check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1809    const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,    const uschar *ptr, int options, compile_data *cd)
   int *reqbyteptr, branch_chain *bcptr, compile_data *cd)  
1810  {  {
1811  int repeat_type, op_type;  int next;
1812  int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */  
1813  int bravalue = 0;  /* Skip whitespace and comments in extended mode */
 int greedy_default, greedy_non_default;  
 int firstbyte, reqbyte;  
 int zeroreqbyte, zerofirstbyte;  
 int req_caseopt, reqvary, tempreqvary;  
 int condcount = 0;  
 int options = *optionsptr;  
 int after_manual_callout = 0;  
 register int c;  
 register uschar *code = *codeptr;  
 uschar *tempcode;  
 BOOL inescq = FALSE;  
 BOOL groupsetfirstbyte = FALSE;  
 const uschar *ptr = *ptrptr;  
 const uschar *tempptr;  
 uschar *previous = NULL;  
 uschar *previous_callout = NULL;  
 uschar classbits[32];  
1814    
1815    if ((options & PCRE_EXTENDED) != 0)
1816      {
1817      for (;;)
1818        {
1819        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1820        if (*ptr == '#')
1821          {
1822          while (*(++ptr) != 0)
1823            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1824          }
1825        else break;
1826        }
1827      }
1828    
1829    /* If the next item is one that we can handle, get its value. A non-negative
1830    value is a character, a negative value is an escape value. */
1831    
1832    if (*ptr == '\\')
1833      {
1834      int temperrorcode = 0;
1835      next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1836      if (temperrorcode != 0) return FALSE;
1837      ptr++;    /* Point after the escape sequence */
1838      }
1839    
1840    else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1841      {
1842  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1843  BOOL class_utf8;    if (utf8) { GETCHARINC(next, ptr); } else
 BOOL utf8 = (options & PCRE_UTF8) != 0;  
 uschar *class_utf8data;  
 uschar utf8_char[6];  
 #else  
 BOOL utf8 = FALSE;  
1844  #endif  #endif
1845      next = *ptr++;
1846      }
1847    
1848  /* Set up the default and non-default settings for greediness */  else return FALSE;
1849    
1850  greedy_default = ((options & PCRE_UNGREEDY) != 0);  /* Skip whitespace and comments in extended mode */
 greedy_non_default = greedy_default ^ 1;  
1851    
1852  /* Initialize no first byte, no required byte. REQ_UNSET means "no char  if ((options & PCRE_EXTENDED) != 0)
1853  matching encountered yet". It gets changed to REQ_NONE if we hit something that    {
1854  matches a non-fixed char first char; reqbyte just remains unset if we never    for (;;)
1855  find one.      {
1856        while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1857        if (*ptr == '#')
1858          {
1859          while (*(++ptr) != 0)
1860            if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1861          }
1862        else break;
1863        }
1864      }
1865    
1866  When we hit a repeat whose minimum is zero, we may have to adjust these values  /* If the next thing is itself optional, we have to give up. */
 to take the zero repeat into account. This is implemented by setting them to  
 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual  
 item types that can be repeated set these backoff variables appropriately. */  
1867    
1868  firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;  if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1869      return FALSE;
1870    
1871  /* The variable req_caseopt contains either the REQ_CASELESS value or zero,  /* Now compare the next item with the previous opcode. If the previous is a
1872  according to the current setting of the caseless flag. REQ_CASELESS is a bit  positive single character match, "item" either contains the character or, if
1873  value > 255. It is added into the firstbyte or reqbyte variables to record the  "item" is greater than 127 in utf8 mode, the character's bytes are in
1874  case status of the value. This is used only for ASCII characters. */  utf8_char. */
1875    
 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;  
1876    
1877  /* Switch on next character until the end of the branch */  /* Handle cases when the next item is a character. */
1878    
1879  for (;; ptr++)  if (next >= 0) switch(op_code)
1880    {    {
1881    BOOL negate_class;    case OP_CHAR:
1882    BOOL possessive_quantifier;  #ifdef SUPPORT_UTF8
1883    BOOL is_quantifier;    if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1884    int class_charcount;  #endif
1885    int class_lastchar;    return item != next;
   int newoptions;  
   int recno;  
   int skipbytes;  
   int subreqbyte;  
   int subfirstbyte;  
   int mclength;  
   uschar mcbuffer[8];  
   
   /* Next byte in the pattern */  
   
   c = *ptr;  
1886    
1887    /* If in \Q...\E, check for the end; if not, we have a literal */    /* For CHARNC (caseless character) we must check the other case. If we have
1888      Unicode property support, we can use it to test the other case of
1889      high-valued characters. */
1890    
1891    if (inescq && c != 0)    case OP_CHARNC:
1892      {  #ifdef SUPPORT_UTF8
1893      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1894    #endif
1895      if (item == next) return FALSE;
1896    #ifdef SUPPORT_UTF8
1897      if (utf8)
1898        {
1899        unsigned int othercase;
1900        if (next < 128) othercase = cd->fcc[next]; else
1901    #ifdef SUPPORT_UCP
1902        othercase = _pcre_ucp_othercase((unsigned int)next);
1903    #else
1904        othercase = NOTACHAR;
1905    #endif
1906        return (unsigned int)item != othercase;
1907        }
1908      else
1909    #endif  /* SUPPORT_UTF8 */
1910      return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
1911    
1912      /* For OP_NOT, "item" must be a single-byte character. */
1913    
1914      case OP_NOT:
1915      if (next < 0) return FALSE;  /* Not a character */
1916      if (item == next) return TRUE;
1917      if ((options & PCRE_CASELESS) == 0) return FALSE;
1918    #ifdef SUPPORT_UTF8
1919      if (utf8)
1920        {
1921        unsigned int othercase;
1922        if (next < 128) othercase = cd->fcc[next]; else
1923    #ifdef SUPPORT_UCP
1924        othercase = _pcre_ucp_othercase(next);
1925    #else
1926        othercase = NOTACHAR;
1927    #endif
1928        return (unsigned int)item == othercase;
1929        }
1930      else
1931    #endif  /* SUPPORT_UTF8 */
1932      return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
1933    
1934      case OP_DIGIT:
1935      return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1936    
1937      case OP_NOT_DIGIT:
1938      return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1939    
1940      case OP_WHITESPACE:
1941      return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1942    
1943      case OP_NOT_WHITESPACE:
1944      return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1945    
1946      case OP_WORDCHAR:
1947      return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1948    
1949      case OP_NOT_WORDCHAR:
1950      return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1951    
1952      case OP_HSPACE:
1953      case OP_NOT_HSPACE:
1954      switch(next)
1955        {
1956        case 0x09:
1957        case 0x20:
1958        case 0xa0:
1959        case 0x1680:
1960        case 0x180e:
1961        case 0x2000:
1962        case 0x2001:
1963        case 0x2002:
1964        case 0x2003:
1965        case 0x2004:
1966        case 0x2005:
1967        case 0x2006:
1968        case 0x2007:
1969        case 0x2008:
1970        case 0x2009:
1971        case 0x200A:
1972        case 0x202f:
1973        case 0x205f:
1974        case 0x3000:
1975        return op_code != OP_HSPACE;
1976        default:
1977        return op_code == OP_HSPACE;
1978        }
1979    
1980      case OP_VSPACE:
1981      case OP_NOT_VSPACE:
1982      switch(next)
1983        {
1984        case 0x0a:
1985        case 0x0b:
1986        case 0x0c:
1987        case 0x0d:
1988        case 0x85:
1989        case 0x2028:
1990        case 0x2029:
1991        return op_code != OP_VSPACE;
1992        default:
1993        return op_code == OP_VSPACE;
1994        }
1995    
1996      default:
1997      return FALSE;
1998      }
1999    
2000    
2001    /* Handle the case when the next item is \d, \s, etc. */
2002    
2003    switch(op_code)
2004      {
2005      case OP_CHAR:
2006      case OP_CHARNC:
2007    #ifdef SUPPORT_UTF8
2008      if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2009    #endif
2010      switch(-next)
2011        {
2012        case ESC_d:
2013        return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2014    
2015        case ESC_D:
2016        return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2017    
2018        case ESC_s:
2019        return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2020    
2021        case ESC_S:
2022        return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2023    
2024        case ESC_w:
2025        return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2026    
2027        case ESC_W:
2028        return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2029    
2030        case ESC_h:
2031        case ESC_H:
2032        switch(item)
2033          {
2034          case 0x09:
2035          case 0x20:
2036          case 0xa0:
2037          case 0x1680:
2038          case 0x180e:
2039          case 0x2000:
2040          case 0x2001:
2041          case 0x2002:
2042          case 0x2003:
2043          case 0x2004:
2044          case 0x2005:
2045          case 0x2006:
2046          case 0x2007:
2047          case 0x2008:
2048          case 0x2009:
2049          case 0x200A:
2050          case 0x202f:
2051          case 0x205f:
2052          case 0x3000:
2053          return -next != ESC_h;
2054          default:
2055          return -next == ESC_h;
2056          }
2057    
2058        case ESC_v:
2059        case ESC_V:
2060        switch(item)
2061          {
2062          case 0x0a:
2063          case 0x0b:
2064          case 0x0c:
2065          case 0x0d:
2066          case 0x85:
2067          case 0x2028:
2068          case 0x2029:
2069          return -next != ESC_v;
2070          default:
2071          return -next == ESC_v;
2072          }
2073    
2074        default:
2075        return FALSE;
2076        }
2077    
2078      case OP_DIGIT:
2079      return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2080             next == -ESC_h || next == -ESC_v;
2081    
2082      case OP_NOT_DIGIT:
2083      return next == -ESC_d;
2084    
2085      case OP_WHITESPACE:
2086      return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2087    
2088      case OP_NOT_WHITESPACE:
2089      return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2090    
2091      case OP_HSPACE:
2092      return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2093    
2094      case OP_NOT_HSPACE:
2095      return next == -ESC_h;
2096    
2097      /* Can't have \S in here because VT matches \S (Perl anomaly) */
2098      case OP_VSPACE:
2099      return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2100    
2101      case OP_NOT_VSPACE:
2102      return next == -ESC_v;
2103    
2104      case OP_WORDCHAR:
2105      return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2106    
2107      case OP_NOT_WORDCHAR:
2108      return next == -ESC_w || next == -ESC_d;
2109    
2110      default:
2111      return FALSE;
2112      }
2113    
2114    /* Control does not reach here */
2115    }
2116    
2117    
2118    
2119    /*************************************************
2120    *           Compile one branch                   *
2121    *************************************************/
2122    
2123    /* Scan the pattern, compiling it into a vector. If the options are
2124    changed during the branch, the pointer is used to change the external options
2125    bits. This function is used during the pre-compile phase when we are trying
2126    to find out the amount of memory needed, as well as during the real compile
2127    phase. The value of lengthptr distinguishes the two phases.
2128    
2129    Arguments:
2130      optionsptr     pointer to the option bits
2131      codeptr        points to the pointer to the current code point
2132      ptrptr         points to the current pattern pointer
2133      errorcodeptr   points to error code variable
2134      firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2135      reqbyteptr     set to the last literal character required, else < 0
2136      bcptr          points to current branch chain
2137      cd             contains pointers to tables etc.
2138      lengthptr      NULL during the real compile phase
2139                     points to length accumulator during pre-compile phase
2140    
2141    Returns:         TRUE on success
2142                     FALSE, with *errorcodeptr set non-zero on error
2143    */
2144    
2145    static BOOL
2146    compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2147      int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2148      compile_data *cd, int *lengthptr)
2149    {
2150    int repeat_type, op_type;
2151    int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2152    int bravalue = 0;
2153    int greedy_default, greedy_non_default;
2154    int firstbyte, reqbyte;
2155    int zeroreqbyte, zerofirstbyte;
2156    int req_caseopt, reqvary, tempreqvary;
2157    int options = *optionsptr;
2158    int after_manual_callout = 0;
2159    int length_prevgroup = 0;
2160    register int c;
2161    register uschar *code = *codeptr;
2162    uschar *last_code = code;
2163    uschar *orig_code = code;
2164    uschar *tempcode;
2165    BOOL inescq = FALSE;
2166    BOOL groupsetfirstbyte = FALSE;
2167    const uschar *ptr = *ptrptr;
2168    const uschar *tempptr;
2169    uschar *previous = NULL;
2170    uschar *previous_callout = NULL;
2171    uschar *save_hwm = NULL;
2172    uschar classbits[32];
2173    
2174    #ifdef SUPPORT_UTF8
2175    BOOL class_utf8;
2176    BOOL utf8 = (options & PCRE_UTF8) != 0;
2177    uschar *class_utf8data;
2178    uschar utf8_char[6];
2179    #else
2180    BOOL utf8 = FALSE;
2181    uschar *utf8_char = NULL;
2182    #endif
2183    
2184    #ifdef DEBUG
2185    if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2186    #endif
2187    
2188    /* Set up the default and non-default settings for greediness */
2189    
2190    greedy_default = ((options & PCRE_UNGREEDY) != 0);
2191    greedy_non_default = greedy_default ^ 1;
2192    
2193    /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2194    matching encountered yet". It gets changed to REQ_NONE if we hit something that
2195    matches a non-fixed char first char; reqbyte just remains unset if we never
2196    find one.
2197    
2198    When we hit a repeat whose minimum is zero, we may have to adjust these values
2199    to take the zero repeat into account. This is implemented by setting them to
2200    zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2201    item types that can be repeated set these backoff variables appropriately. */
2202    
2203    firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2204    
2205    /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2206    according to the current setting of the caseless flag. REQ_CASELESS is a bit
2207    value > 255. It is added into the firstbyte or reqbyte variables to record the
2208    case status of the value. This is used only for ASCII characters. */
2209    
2210    req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2211    
2212    /* Switch on next character until the end of the branch */
2213    
2214    for (;; ptr++)
2215      {
2216      BOOL negate_class;
2217      BOOL possessive_quantifier;
2218      BOOL is_quantifier;
2219      BOOL is_recurse;
2220      BOOL reset_bracount;
2221      int class_charcount;
2222      int class_lastchar;
2223      int newoptions;
2224      int recno;
2225      int refsign;
2226      int skipbytes;
2227      int subreqbyte;
2228      int subfirstbyte;
2229      int terminator;
2230      int mclength;
2231      uschar mcbuffer[8];
2232    
2233      /* Get next byte in the pattern */
2234    
2235      c = *ptr;
2236    
2237      /* If we are in the pre-compile phase, accumulate the length used for the
2238      previous cycle of this loop. */
2239    
2240      if (lengthptr != NULL)
2241        {
2242    #ifdef DEBUG
2243        if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2244    #endif
2245        if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2246          {
2247          *errorcodeptr = ERR52;
2248          goto FAILED;
2249          }
2250    
2251        /* There is at least one situation where code goes backwards: this is the
2252        case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2253        the class is simply eliminated. However, it is created first, so we have to
2254        allow memory for it. Therefore, don't ever reduce the length at this point.
2255        */
2256    
2257        if (code < last_code) code = last_code;
2258        *lengthptr += code - last_code;
2259        DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2260    
2261        /* If "previous" is set and it is not at the start of the work space, move
2262        it back to there, in order to avoid filling up the work space. Otherwise,
2263        if "previous" is NULL, reset the current code pointer to the start. */
2264    
2265        if (previous != NULL)
2266          {
2267          if (previous > orig_code)
2268            {
2269            memmove(orig_code, previous, code - previous);
2270            code -= previous - orig_code;
2271            previous = orig_code;
2272            }
2273          }
2274        else code = orig_code;
2275    
2276        /* Remember where this code item starts so we can pick up the length
2277        next time round. */
2278    
2279        last_code = code;
2280        }
2281    
2282      /* In the real compile phase, just check the workspace used by the forward
2283      reference list. */
2284    
2285      else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2286        {
2287        *errorcodeptr = ERR52;
2288        goto FAILED;
2289        }
2290    
2291      /* If in \Q...\E, check for the end; if not, we have a literal */
2292    
2293      if (inescq && c != 0)
2294        {
2295      if (c == '\\' && ptr[1] == 'E')      if (c == '\\' && ptr[1] == 'E')
2296        {        {
2297        inescq = FALSE;        inescq = FALSE;
# Line 1643  for (;; ptr++) Line 2302  for (;; ptr++)
2302        {        {
2303        if (previous_callout != NULL)        if (previous_callout != NULL)
2304          {          {
2305          complete_callout(previous_callout, ptr, cd);          if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2306              complete_callout(previous_callout, ptr, cd);
2307          previous_callout = NULL;          previous_callout = NULL;
2308          }          }
2309        if ((options & PCRE_AUTO_CALLOUT) != 0)        if ((options & PCRE_AUTO_CALLOUT) != 0)
# Line 1664  for (;; ptr++) Line 2324  for (;; ptr++)
2324    if (!is_quantifier && previous_callout != NULL &&    if (!is_quantifier && previous_callout != NULL &&
2325         after_manual_callout-- <= 0)         after_manual_callout-- <= 0)
2326      {      {
2327      complete_callout(previous_callout, ptr, cd);      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2328          complete_callout(previous_callout, ptr, cd);
2329      previous_callout = NULL;      previous_callout = NULL;
2330      }      }
2331    
# Line 1675  for (;; ptr++) Line 2336  for (;; ptr++)
2336      if ((cd->ctypes[c] & ctype_space) != 0) continue;      if ((cd->ctypes[c] & ctype_space) != 0) continue;
2337      if (c == '#')      if (c == '#')
2338        {        {
2339        /* The space before the ; is to avoid a warning on a silly compiler        while (*(++ptr) != 0)
2340        on the Macintosh. */          {
2341        while ((c = *(++ptr)) != 0 && c != NEWLINE) ;          if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2342        if (c != 0) continue;   /* Else fall through to handle end of string */          }
2343          if (*ptr != 0) continue;
2344    
2345          /* Else fall through to handle end of string */
2346          c = 0;
2347        }        }
2348      }      }
2349    
# Line 1692  for (;; ptr++) Line 2357  for (;; ptr++)
2357    
2358    switch(c)    switch(c)
2359      {      {
2360      /* The branch terminates at end of string, |, or ). */      /* ===================================================================*/
2361        case 0:                        /* The branch terminates at string end */
2362      case 0:      case '|':                      /* or | or ) */
     case '|':  
2363      case ')':      case ')':
2364      *firstbyteptr = firstbyte;      *firstbyteptr = firstbyte;
2365      *reqbyteptr = reqbyte;      *reqbyteptr = reqbyte;
2366      *codeptr = code;      *codeptr = code;
2367      *ptrptr = ptr;      *ptrptr = ptr;
2368        if (lengthptr != NULL)
2369          {
2370          *lengthptr += code - last_code;   /* To include callout length */
2371          DPRINTF((">> end branch\n"));
2372          }
2373      return TRUE;      return TRUE;
2374    
2375    
2376        /* ===================================================================*/
2377      /* Handle single-character metacharacters. In multiline mode, ^ disables      /* Handle single-character metacharacters. In multiline mode, ^ disables
2378      the setting of any following char as a first character. */      the setting of any following char as a first character. */
2379    
# Line 1731  for (;; ptr++) Line 2402  for (;; ptr++)
2402      *code++ = OP_ANY;      *code++ = OP_ANY;
2403      break;      break;
2404    
2405      /* Character classes. If the included characters are all < 255 in value, we  
2406      build a 32-byte bitmap of the permitted characters, except in the special      /* ===================================================================*/
2407      case where there is only one such character. For negated classes, we build      /* Character classes. If the included characters are all < 256, we build a
2408      the map as usual, then invert it at the end. However, we use a different      32-byte bitmap of the permitted characters, except in the special case
2409      opcode so that data characters > 255 can be handled correctly.      where there is only one such character. For negated classes, we build the
2410        map as usual, then invert it at the end. However, we use a different opcode
2411        so that data characters > 255 can be handled correctly.
2412    
2413      If the class contains characters outside the 0-255 range, a different      If the class contains characters outside the 0-255 range, a different
2414      opcode is compiled. It may optionally have a bit map for characters < 256,      opcode is compiled. It may optionally have a bit map for characters < 256,
# Line 1769  for (;; ptr++) Line 2442  for (;; ptr++)
2442        }        }
2443    
2444      /* Keep a count of chars with values < 256 so that we can optimize the case      /* Keep a count of chars with values < 256 so that we can optimize the case
2445      of just a single character (as long as it's < 256). For higher valued UTF-8      of just a single character (as long as it's < 256). However, for higher
2446      characters, we don't yet do any optimization. */      valued UTF-8 characters, we don't yet do any optimization. */
2447    
2448      class_charcount = 0;      class_charcount = 0;
2449      class_lastchar = -1;      class_lastchar = -1;
2450    
2451        /* Initialize the 32-char bit map to all zeros. We build the map in a
2452        temporary bit of memory, in case the class contains only 1 character (less
2453        than 256), because in that case the compiled code doesn't use the bit map.
2454        */
2455    
2456        memset(classbits, 0, 32 * sizeof(uschar));
2457    
2458  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2459      class_utf8 = FALSE;                       /* No chars >= 256 */      class_utf8 = FALSE;                       /* No chars >= 256 */
2460      class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */      class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2461  #endif  #endif
2462    
     /* Initialize the 32-char bit map to all zeros. We have to build the  
     map in a temporary bit of store, in case the class contains only 1  
     character (< 256), because in that case the compiled code doesn't use the  
     bit map. */  
   
     memset(classbits, 0, 32 * sizeof(uschar));  
   
2463      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
2464      means that an initial ] is taken as a data character. The first pass      means that an initial ] is taken as a data character. At the start of the
2465      through the regex checked the overall syntax, so we don't need to be very      loop, c contains the first byte of the character. */
     strict here. At the start of the loop, c contains the first byte of the  
     character. */  
2466    
2467      do      if (c != 0) do
2468        {        {
2469          const uschar *oldptr;
2470    
2471  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2472        if (utf8 && c > 127)        if (utf8 && c > 127)
2473          {                           /* Braces are required because the */          {                           /* Braces are required because the */
# Line 1806  for (;; ptr++) Line 2479  for (;; ptr++)
2479    
2480        if (inescq)        if (inescq)
2481          {          {
2482          if (c == '\\' && ptr[1] == 'E')          if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2483            {            {
2484            inescq = FALSE;            inescq = FALSE;                   /* Reset literal state */
2485            ptr++;            ptr++;                            /* Skip the 'E' */
2486            continue;            continue;                         /* Carry on with next */
2487            }            }
2488          else goto LONE_SINGLE_CHARACTER;          goto CHECK_RANGE;                   /* Could be range if \E follows */
2489          }          }
2490    
2491        /* Handle POSIX class names. Perl allows a negation extension of the        /* Handle POSIX class names. Perl allows a negation extension of the
# Line 1826  for (;; ptr++) Line 2499  for (;; ptr++)
2499            check_posix_syntax(ptr, &tempptr, cd))            check_posix_syntax(ptr, &tempptr, cd))
2500          {          {
2501          BOOL local_negate = FALSE;          BOOL local_negate = FALSE;
2502          int posix_class, i;          int posix_class, taboffset, tabopt;
2503          register const uschar *cbits = cd->cbits;          register const uschar *cbits = cd->cbits;
2504            uschar pbits[32];
2505    
2506          if (ptr[1] != ':')          if (ptr[1] != ':')
2507            {            {
# Line 1856  for (;; ptr++) Line 2530  for (;; ptr++)
2530          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2531            posix_class = 0;            posix_class = 0;
2532    
2533          /* Or into the map we are building up to 3 of the static class          /* We build the bit map for the POSIX class in a chunk of local store
2534          tables, or their negations. The [:blank:] class sets up the same          because we may be adding and subtracting from it, and we don't want to
2535          chars as the [:space:] class (all white space). We remove the vertical          subtract bits that may be in the main map already. At the end we or the
2536          white space chars afterwards. */          result into the bit map that is being built. */
2537    
2538          posix_class *= 3;          posix_class *= 3;
2539          for (i = 0; i < 3; i++)  
2540            /* Copy in the first table (always present) */
2541    
2542            memcpy(pbits, cbits + posix_class_maps[posix_class],
2543              32 * sizeof(uschar));
2544    
2545            /* If there is a second table, add or remove it as required. */
2546    
2547            taboffset = posix_class_maps[posix_class + 1];
2548            tabopt = posix_class_maps[posix_class + 2];
2549    
2550            if (taboffset >= 0)
2551            {            {
2552            BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;            if (tabopt >= 0)
2553            int taboffset = posix_class_maps[posix_class + i];              for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
           if (taboffset < 0) break;  
           if (local_negate)  
             {  
             if (i == 0)  
               for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];  
             else  
               for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];  
             if (blankclass) classbits[1] |= 0x3c;  
             }  
2554            else            else
2555              {              for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];  
             if (blankclass) classbits[1] &= ~0x3c;  
             }  
2556            }            }
2557    
2558            /* Now see if we need to remove any special characters. An option
2559            value of 1 removes vertical space and 2 removes underscore. */
2560    
2561            if (tabopt < 0) tabopt = -tabopt;
2562            if (tabopt == 1) pbits[1] &= ~0x3c;
2563              else if (tabopt == 2) pbits[11] &= 0x7f;
2564    
2565            /* Add the POSIX table or its complement into the main table that is
2566            being built and we are done. */
2567    
2568            if (local_negate)
2569              for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2570            else
2571              for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2572    
2573          ptr = tempptr + 1;          ptr = tempptr + 1;
2574          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */          class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
2575          continue;    /* End of POSIX syntax handling */          continue;    /* End of POSIX syntax handling */
2576          }          }
2577    
2578        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
2579        of the specials, which just set a flag. Escaped items are checked for        of the specials, which just set a flag. The sequence \b is a special
2580        validity in the pre-compiling pass. The sequence \b is a special case.        case. Inside a class (and only there) it is treated as backspace.
2581        Inside a class (and only there) it is treated as backspace. Elsewhere        Elsewhere it marks a word boundary. Other escapes have preset maps ready
2582        it marks a word boundary. Other escapes have preset maps ready to        to or into the one we are building. We assume they have more than one
       or into the one we are building. We assume they have more than one  
2583        character in them, so set class_charcount bigger than one. */        character in them, so set class_charcount bigger than one. */
2584    
2585        if (c == '\\')        if (c == '\\')
2586          {          {
2587          c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2588            if (*errorcodeptr != 0) goto FAILED;
2589    
2590          if (-c == ESC_b) c = '\b';       /* \b is backslash in a class */          if (-c == ESC_b) c = '\b';       /* \b is backslash in a class */
2591          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */          else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2592            else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2593          else if (-c == ESC_Q)            /* Handle start of quoted string */          else if (-c == ESC_Q)            /* Handle start of quoted string */
2594            {            {
2595            if (ptr[1] == '\\' && ptr[2] == 'E')            if (ptr[1] == '\\' && ptr[2] == 'E')
# Line 1915  for (;; ptr++) Line 2604  for (;; ptr++)
2604            {            {
2605            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
2606            class_charcount += 2;     /* Greater than 1 is what matters */            class_charcount += 2;     /* Greater than 1 is what matters */
2607            switch (-c)  
2608              /* Save time by not doing this in the pre-compile phase. */
2609    
2610              if (lengthptr == NULL) switch (-c)
2611              {              {
2612              case ESC_d:              case ESC_d:
2613              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
# Line 1943  for (;; ptr++) Line 2635  for (;; ptr++)
2635              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2636              continue;              continue;
2637    
2638  #ifdef SUPPORT_UCP              case ESC_E: /* Perl ignores an orphan \E */
2639              case ESC_p:              continue;
2640              case ESC_P:  
2641                default:    /* Not recognized; fall through */
2642                break;      /* Need "default" setting to stop compiler warning. */
2643                }
2644    
2645              /* In the pre-compile phase, just do the recognition. */
2646    
2647              else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2648                       c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2649    
2650              /* We need to deal with \H, \h, \V, and \v in both phases because
2651              they use extra memory. */
2652    
2653              if (-c == ESC_h)
2654                {
2655                SETBIT(classbits, 0x09); /* VT */
2656                SETBIT(classbits, 0x20); /* SPACE */
2657                SETBIT(classbits, 0xa0); /* NSBP */
2658    #ifdef SUPPORT_UTF8
2659                if (utf8)
2660                {                {
               BOOL negated;  
               int property = get_ucp(&ptr, &negated, errorcodeptr);  
               if (property < 0) goto FAILED;  
2661                class_utf8 = TRUE;                class_utf8 = TRUE;
2662                *class_utf8data++ = ((-c == ESC_p) != negated)?                *class_utf8data++ = XCL_SINGLE;
2663                  XCL_PROP : XCL_NOTPROP;                class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2664                *class_utf8data++ = property;                *class_utf8data++ = XCL_SINGLE;
2665                class_charcount -= 2;   /* Not a < 256 character */                class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2666                  *class_utf8data++ = XCL_RANGE;
2667                  class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2668                  class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2669                  *class_utf8data++ = XCL_SINGLE;
2670                  class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2671                  *class_utf8data++ = XCL_SINGLE;
2672                  class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2673                  *class_utf8data++ = XCL_SINGLE;
2674                  class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2675                }                }
             continue;  
2676  #endif  #endif
2677                continue;
2678                }
2679    
2680              /* Unrecognized escapes are faulted if PCRE is running in its            if (-c == ESC_H)
2681              strict mode. By default, for compatibility with Perl, they are              {
2682              treated as literals. */              for (c = 0; c < 32; c++)
2683                  {
2684                  int x = 0xff;
2685                  switch (c)
2686                    {
2687                    case 0x09/8: x ^= 1 << (0x09%8); break;
2688                    case 0x20/8: x ^= 1 << (0x20%8); break;
2689                    case 0xa0/8: x ^= 1 << (0xa0%8); break;
2690                    default: break;
2691                    }
2692                  classbits[c] |= x;
2693                  }
2694    
2695              default:  #ifdef SUPPORT_UTF8
2696              if ((options & PCRE_EXTRA) != 0)              if (utf8)
2697                {                {
2698                *errorcodeptr = ERR7;                class_utf8 = TRUE;
2699                goto FAILED;                *class_utf8data++ = XCL_RANGE;
2700                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2701                  class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2702                  *class_utf8data++ = XCL_RANGE;
2703                  class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2704                  class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2705                  *class_utf8data++ = XCL_RANGE;
2706                  class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2707                  class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2708                  *class_utf8data++ = XCL_RANGE;
2709                  class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2710                  class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2711                  *class_utf8data++ = XCL_RANGE;
2712                  class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2713                  class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2714                  *class_utf8data++ = XCL_RANGE;
2715                  class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2716                  class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2717                  *class_utf8data++ = XCL_RANGE;
2718                  class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2719                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2720                }                }
2721              c = *ptr;              /* The final character */  #endif
2722              class_charcount -= 2;  /* Undo the default count from above */              continue;
2723              }              }
           }  
2724    
2725          /* Fall through if we have a single character (c >= 0). This may be            if (-c == ESC_v)
2726          > 256 in UTF-8 mode. */              {
2727                SETBIT(classbits, 0x0a); /* LF */
2728                SETBIT(classbits, 0x0b); /* VT */
2729                SETBIT(classbits, 0x0c); /* FF */
2730                SETBIT(classbits, 0x0d); /* CR */
2731                SETBIT(classbits, 0x85); /* NEL */
2732    #ifdef SUPPORT_UTF8
2733                if (utf8)
2734                  {
2735                  class_utf8 = TRUE;
2736                  *class_utf8data++ = XCL_RANGE;
2737                  class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2738                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2739                  }
2740    #endif
2741                continue;
2742                }
2743    
2744              if (-c == ESC_V)
2745                {
2746                for (c = 0; c < 32; c++)
2747                  {
2748                  int x = 0xff;
2749                  switch (c)
2750                    {
2751                    case 0x0a/8: x ^= 1 << (0x0a%8);
2752                                 x ^= 1 << (0x0b%8);
2753                                 x ^= 1 << (0x0c%8);
2754                                 x ^= 1 << (0x0d%8);
2755                                 break;
2756                    case 0x85/8: x ^= 1 << (0x85%8); break;
2757                    default: break;
2758                    }
2759                  classbits[c] |= x;
2760                  }
2761    
2762    #ifdef SUPPORT_UTF8
2763                if (utf8)
2764                  {
2765                  class_utf8 = TRUE;
2766                  *class_utf8data++ = XCL_RANGE;
2767                  class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2768                  class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2769                  *class_utf8data++ = XCL_RANGE;
2770                  class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2771                  class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2772                  }
2773    #endif
2774                continue;
2775                }
2776    
2777              /* We need to deal with \P and \p in both phases. */
2778    
2779    #ifdef SUPPORT_UCP
2780              if (-c == ESC_p || -c == ESC_P)
2781                {
2782                BOOL negated;
2783                int pdata;
2784                int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2785                if (ptype < 0) goto FAILED;
2786                class_utf8 = TRUE;
2787                *class_utf8data++ = ((-c == ESC_p) != negated)?
2788                  XCL_PROP : XCL_NOTPROP;
2789                *class_utf8data++ = ptype;
2790                *class_utf8data++ = pdata;
2791                class_charcount -= 2;   /* Not a < 256 character */
2792                continue;
2793                }
2794    #endif
2795              /* Unrecognized escapes are faulted if PCRE is running in its
2796              strict mode. By default, for compatibility with Perl, they are
2797              treated as literals. */
2798    
2799              if ((options & PCRE_EXTRA) != 0)
2800                {
2801                *errorcodeptr = ERR7;
2802                goto FAILED;
2803                }
2804    
2805              class_charcount -= 2;  /* Undo the default count from above */
2806              c = *ptr;              /* Get the final character and fall through */
2807              }
2808    
2809            /* Fall through if we have a single character (c >= 0). This may be
2810            greater than 256 in UTF-8 mode. */
2811    
2812          }   /* End of backslash handling */          }   /* End of backslash handling */
2813    
2814        /* A single character may be followed by '-' to form a range. However,        /* A single character may be followed by '-' to form a range. However,
2815        Perl does not permit ']' to be the end of the range. A '-' character        Perl does not permit ']' to be the end of the range. A '-' character
2816        here is treated as a literal. */        at the end is treated as a literal. Perl ignores orphaned \E sequences
2817          entirely. The code for handling \Q and \E is messy. */
2818    
2819          CHECK_RANGE:
2820          while (ptr[1] == '\\' && ptr[2] == 'E')
2821            {
2822            inescq = FALSE;
2823            ptr += 2;
2824            }
2825    
2826        if (ptr[1] == '-' && ptr[2] != ']')        oldptr = ptr;
2827    
2828          if (!inescq && ptr[1] == '-')
2829          {          {
2830          int d;          int d;
2831          ptr += 2;          ptr += 2;
2832            while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2833    
2834            /* If we hit \Q (not followed by \E) at this point, go into escaped
2835            mode. */
2836    
2837            while (*ptr == '\\' && ptr[1] == 'Q')
2838              {
2839              ptr += 2;
2840              if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2841              inescq = TRUE;
2842              break;
2843              }
2844    
2845            if (*ptr == 0 || (!inescq && *ptr == ']'))
2846              {
2847              ptr = oldptr;
2848              goto LONE_SINGLE_CHARACTER;
2849              }
2850    
2851  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2852          if (utf8)          if (utf8)
# Line 2001  for (;; ptr++) Line 2861  for (;; ptr++)
2861          not any of the other escapes. Perl 5.6 treats a hyphen as a literal          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2862          in such circumstances. */          in such circumstances. */
2863    
2864          if (d == '\\')          if (!inescq && d == '\\')
2865            {            {
2866            const uschar *oldptr = ptr;            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2867            d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);            if (*errorcodeptr != 0) goto FAILED;
2868    
2869            /* \b is backspace; \X is literal X; any other special means the '-'            /* \b is backspace; \X is literal X; \R is literal R; any other
2870            was literal */            special means the '-' was literal */
2871    
2872            if (d < 0)            if (d < 0)
2873              {              {
2874              if (d == -ESC_b) d = '\b';              if (d == -ESC_b) d = '\b';
2875              else if (d == -ESC_X) d = 'X'; else              else if (d == -ESC_X) d = 'X';
2876                else if (d == -ESC_R) d = 'R'; else
2877                {                {
2878                ptr = oldptr - 2;                ptr = oldptr;
2879                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2880                }                }
2881              }              }
2882            }            }
2883    
2884          /* The check that the two values are in the correct order happens in          /* Check that the two values are in the correct order. Optimize
2885          the pre-pass. Optimize one-character ranges */          one-character ranges */
2886    
2887            if (d < c)
2888              {
2889              *errorcodeptr = ERR8;
2890              goto FAILED;
2891              }
2892    
2893          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */          if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2894    
# Line 2042  for (;; ptr++) Line 2909  for (;; ptr++)
2909  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2910            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
2911              {              {
2912              int occ, ocd;              unsigned int occ, ocd;
2913              int cc = c;              unsigned int cc = c;
2914              int origd = d;              unsigned int origd = d;
2915              while (get_othercase_range(&cc, origd, &occ, &ocd))              while (get_othercase_range(&cc, origd, &occ, &ocd))
2916                {                {
2917                if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */                if (occ >= (unsigned int)c &&
2918                      ocd <= (unsigned int)d)
2919                    continue;                          /* Skip embedded ranges */
2920    
2921                if (occ < c  && ocd >= c - 1)        /* Extend the basic range */                if (occ < (unsigned int)c  &&
2922                      ocd >= (unsigned int)c - 1)      /* Extend the basic range */
2923                  {                                  /* if there is overlap,   */                  {                                  /* if there is overlap,   */
2924                  c = occ;                           /* noting that if occ < c */                  c = occ;                           /* noting that if occ < c */
2925                  continue;                          /* we can't have ocd > d  */                  continue;                          /* we can't have ocd > d  */
2926                  }                                  /* because a subrange is  */                  }                                  /* because a subrange is  */
2927                if (ocd > d && occ <= d + 1)         /* always shorter than    */                if (ocd > (unsigned int)d &&
2928                      occ <= (unsigned int)d + 1)      /* always shorter than    */
2929                  {                                  /* the basic range.       */                  {                                  /* the basic range.       */
2930                  d = ocd;                  d = ocd;
2931                  continue;                  continue;
# Line 2102  for (;; ptr++) Line 2973  for (;; ptr++)
2973          ranges that lie entirely within 0-127 when there is UCP support; else          ranges that lie entirely within 0-127 when there is UCP support; else
2974          for partial ranges without UCP support. */          for partial ranges without UCP support. */
2975    
2976          for (; c <= d; c++)          class_charcount += d - c + 1;
2977            class_lastchar = d;
2978    
2979            /* We can save a bit of time by skipping this in the pre-compile. */
2980    
2981            if (lengthptr == NULL) for (; c <= d; c++)
2982            {            {
2983            classbits[c/8] |= (1 << (c&7));            classbits[c/8] |= (1 << (c&7));
2984            if ((options & PCRE_CASELESS) != 0)            if ((options & PCRE_CASELESS) != 0)
# Line 2110  for (;; ptr++) Line 2986  for (;; ptr++)
2986              int uc = cd->fcc[c];           /* flip case */              int uc = cd->fcc[c];           /* flip case */
2987              classbits[uc/8] |= (1 << (uc&7));              classbits[uc/8] |= (1 << (uc&7));
2988              }              }
           class_charcount++;                /* in case a one-char range */  
           class_lastchar = c;  
2989            }            }
2990    
2991          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
# Line 2135  for (;; ptr++) Line 3009  for (;; ptr++)
3009  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3010          if ((options & PCRE_CASELESS) != 0)          if ((options & PCRE_CASELESS) != 0)
3011            {            {
3012            int chartype;            unsigned int othercase;
3013            int othercase;            if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
           if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 &&  
                othercase > 0)  
3014              {              {
3015              *class_utf8data++ = XCL_SINGLE;              *class_utf8data++ = XCL_SINGLE;
3016              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);              class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
# Line 2163  for (;; ptr++) Line 3035  for (;; ptr++)
3035          }          }
3036        }        }
3037    
3038      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached. This "while" is the end of the "do" above. */
3039      loop. This "while" is the end of the "do" above. */  
3040        while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3041    
3042      while ((c = *(++ptr)) != ']' || inescq);      if (c == 0)                          /* Missing terminating ']' */
3043          {
3044          *errorcodeptr = ERR6;
3045          goto FAILED;
3046          }
3047    
3048      /* If class_charcount is 1, we saw precisely one character whose value is      /* If class_charcount is 1, we saw precisely one character whose value is
3049      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we      less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
# Line 2230  for (;; ptr++) Line 3107  for (;; ptr++)
3107    
3108      /* If there are characters with values > 255, we have to compile an      /* If there are characters with values > 255, we have to compile an
3109      extended class, with its own opcode. If there are no characters < 256,      extended class, with its own opcode. If there are no characters < 256,
3110      we can omit the bitmap. */      we can omit the bitmap in the actual compiled code. */
3111    
3112  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3113      if (class_utf8)      if (class_utf8)
# Line 2240  for (;; ptr++) Line 3117  for (;; ptr++)
3117        code += LINK_SIZE;        code += LINK_SIZE;
3118        *code = negate_class? XCL_NOT : 0;        *code = negate_class? XCL_NOT : 0;
3119    
3120        /* If the map is required, install it, and move on to the end of        /* If the map is required, move up the extra data to make room for it;
3121        the extra data */        otherwise just move the code pointer to the end of the extra data. */
3122    
3123        if (class_charcount > 0)        if (class_charcount > 0)
3124          {          {
3125          *code++ |= XCL_MAP;          *code++ |= XCL_MAP;
3126            memmove(code + 32, code, class_utf8data - code);
3127          memcpy(code, classbits, 32);          memcpy(code, classbits, 32);
3128          code = class_utf8data;          code = class_utf8data + 32;
         }  
   
       /* If the map is not required, slide down the extra data. */  
   
       else  
         {  
         int len = class_utf8data - (code + 33);  
         memmove(code + 1, code + 33, len);  
         code += len + 1;  
3129          }          }
3130          else code = class_utf8data;
3131    
3132        /* Now fill in the complete length of the item */        /* Now fill in the complete length of the item */
3133    
# Line 2274  for (;; ptr++) Line 3144  for (;; ptr++)
3144      if (negate_class)      if (negate_class)
3145        {        {
3146        *code++ = OP_NCLASS;        *code++ = OP_NCLASS;
3147        for (c = 0; c < 32; c++) code[c] = ~classbits[c];        if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3148            for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3149        }        }
3150      else      else
3151        {        {
# Line 2284  for (;; ptr++) Line 3155  for (;; ptr++)
3155      code += 32;      code += 32;
3156      break;      break;
3157    
3158    
3159        /* ===================================================================*/
3160      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this      /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3161      has been tested above. */      has been tested above. */
3162    
# Line 2351  for (;; ptr++) Line 3224  for (;; ptr++)
3224        }        }
3225      else repeat_type = greedy_default;      else repeat_type = greedy_default;
3226    
     /* If previous was a recursion, we need to wrap it inside brackets so that  
     it can be replicated if necessary. */  
   
     if (*previous == OP_RECURSE)  
       {  
       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);  
       code += 1 + LINK_SIZE;  
       *previous = OP_BRA;  
       PUT(previous, 1, code - previous);  
       *code = OP_KET;  
       PUT(code, 1, code - previous);  
       code += 1 + LINK_SIZE;  
       }  
   
3227      /* If previous was a character match, abolish the item and generate a      /* If previous was a character match, abolish the item and generate a
3228      repeat item instead. If a char item has a minimum of more than one, ensure      repeat item instead. If a char item has a minimum of more than one, ensure
3229      that it is set in reqbyte - it might not be if a sequence such as x{3} is      that it is set in reqbyte - it might not be if a sequence such as x{3} is
# Line 2398  for (;; ptr++) Line 3257  for (;; ptr++)
3257          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3258          }          }
3259    
3260          /* If the repetition is unlimited, it pays to see if the next thing on
3261          the line is something that cannot possibly match this character. If so,
3262          automatically possessifying this item gains some performance in the case
3263          where the match fails. */
3264    
3265          if (!possessive_quantifier &&
3266              repeat_max < 0 &&
3267              check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3268                options, cd))
3269            {
3270            repeat_type = 0;    /* Force greedy */
3271            possessive_quantifier = TRUE;
3272            }
3273    
3274        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3275        }        }
3276    
3277      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
3278      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
3279      character repeats by setting opt_type to add a suitable offset into      character repeats by setting opt_type to add a suitable offset into
3280      repeat_type. OP_NOT is currently used only for single-byte chars. */      repeat_type. We can also test for auto-possessification. OP_NOT is
3281        currently used only for single-byte chars. */
3282    
3283      else if (*previous == OP_NOT)      else if (*previous == OP_NOT)
3284        {        {
3285        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3286        c = previous[1];        c = previous[1];
3287          if (!possessive_quantifier &&
3288              repeat_max < 0 &&
3289              check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3290            {
3291            repeat_type = 0;    /* Force greedy */
3292            possessive_quantifier = TRUE;
3293            }
3294        goto OUTPUT_SINGLE_REPEAT;        goto OUTPUT_SINGLE_REPEAT;
3295        }        }
3296    
# Line 2423  for (;; ptr++) Line 3304  for (;; ptr++)
3304      else if (*previous < OP_EODN)      else if (*previous < OP_EODN)
3305        {        {
3306        uschar *oldcode;        uschar *oldcode;
3307        int prop_type;        int prop_type, prop_value;
3308        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3309        c = *previous;        c = *previous;
3310    
3311          if (!possessive_quantifier &&
3312              repeat_max < 0 &&
3313              check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3314            {
3315            repeat_type = 0;    /* Force greedy */
3316            possessive_quantifier = TRUE;
3317            }
3318    
3319        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
3320        prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?        if (*previous == OP_PROP || *previous == OP_NOTPROP)
3321          previous[1] : -1;          {
3322            prop_type = previous[1];
3323            prop_value = previous[2];
3324            }
3325          else prop_type = prop_value = -1;
3326    
3327        oldcode = code;        oldcode = code;
3328        code = previous;                  /* Usually overwrite previous item */        code = previous;                  /* Usually overwrite previous item */
# Line 2463  for (;; ptr++) Line 3356  for (;; ptr++)
3356          }          }
3357    
3358        /* A repeat minimum of 1 is optimized into some special cases. If the        /* A repeat minimum of 1 is optimized into some special cases. If the
3359        maximum is unlimited, we use OP_PLUS. Otherwise, the original item it        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3360        left in place and, if the maximum is greater than 1, we use OP_UPTO with        left in place and, if the maximum is greater than 1, we use OP_UPTO with
3361        one less than the maximum. */        one less than the maximum. */
3362    
# Line 2490  for (;; ptr++) Line 3383  for (;; ptr++)
3383    
3384          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3385          we have to insert the character for the previous code. For a repeated          we have to insert the character for the previous code. For a repeated
3386          Unicode property match, there is an extra byte that defines the          Unicode property match, there are two extra bytes that define the
3387          required property. In UTF-8 mode, long characters have their length in          required property. In UTF-8 mode, long characters have their length in
3388          c, with the 0x80 bit as a flag. */          c, with the 0x80 bit as a flag. */
3389    
# Line 2506  for (;; ptr++) Line 3399  for (;; ptr++)
3399  #endif  #endif
3400              {              {
3401              *code++ = c;              *code++ = c;
3402              if (prop_type >= 0) *code++ = prop_type;              if (prop_type >= 0)
3403                  {
3404                  *code++ = prop_type;
3405                  *code++ = prop_value;
3406                  }
3407              }              }
3408            *code++ = OP_STAR + repeat_type;            *code++ = OP_STAR + repeat_type;
3409            }            }
3410    
3411          /* Else insert an UPTO if the max is greater than the min, again          /* Else insert an UPTO if the max is greater than the min, again
3412          preceded by the character, for the previously inserted code. */          preceded by the character, for the previously inserted code. If the
3413            UPTO is just for 1 instance, we can use QUERY instead. */
3414    
3415          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
3416            {            {
# Line 2525  for (;; ptr++) Line 3423  for (;; ptr++)
3423            else            else
3424  #endif  #endif
3425            *code++ = c;            *code++ = c;
3426            if (prop_type >= 0) *code++ = prop_type;            if (prop_type >= 0)
3427                {
3428                *code++ = prop_type;
3429                *code++ = prop_value;
3430                }
3431            repeat_max -= repeat_min;            repeat_max -= repeat_min;
3432            *code++ = OP_UPTO + repeat_type;  
3433            PUT2INC(code, 0, repeat_max);            if (repeat_max == 1)
3434                {
3435                *code++ = OP_QUERY + repeat_type;
3436                }
3437              else
3438                {
3439                *code++ = OP_UPTO + repeat_type;
3440                PUT2INC(code, 0, repeat_max);
3441                }
3442            }            }
3443          }          }
3444    
# Line 2544  for (;; ptr++) Line 3454  for (;; ptr++)
3454  #endif  #endif
3455        *code++ = c;        *code++ = c;
3456    
3457        /* For a repeated Unicode property match, there is an extra byte that        /* For a repeated Unicode property match, there are two extra bytes that
3458        defines the required property. */        define the required property. */
3459    
3460  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
3461        if (prop_type >= 0) *code++ = prop_type;        if (prop_type >= 0)
3462            {
3463            *code++ = prop_type;
3464            *code++ = prop_value;
3465            }
3466  #endif  #endif
3467        }        }
3468    
# Line 2591  for (;; ptr++) Line 3505  for (;; ptr++)
3505      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
3506      cases. */      cases. */
3507    
3508      else if (*previous >= OP_BRA || *previous == OP_ONCE ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3509               *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
3510        {        {
3511        register int i;        register int i;
3512        int ketoffset = 0;        int ketoffset = 0;
3513        int len = code - previous;        int len = code - previous;
3514        uschar *bralink = NULL;        uschar *bralink = NULL;
3515    
3516          /* Repeating a DEFINE group is pointless */
3517    
3518          if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3519            {
3520            *errorcodeptr = ERR55;
3521            goto FAILED;
3522            }
3523    
3524          /* This is a paranoid check to stop integer overflow later on */
3525    
3526          if (len > MAX_DUPLENGTH)
3527            {
3528            *errorcodeptr = ERR50;
3529            goto FAILED;
3530            }
3531    
3532        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
3533        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
3534        from the current code pointer. There may be an OP_OPT setting following        from the current code pointer. There may be an OP_OPT setting following
# Line 2633  for (;; ptr++) Line 3563  for (;; ptr++)
3563          /* If the maximum is 1 or unlimited, we just have to stick in the          /* If the maximum is 1 or unlimited, we just have to stick in the
3564          BRAZERO and do no more at this point. However, we do need to adjust          BRAZERO and do no more at this point. However, we do need to adjust
3565          any OP_RECURSE calls inside the group that refer to the group itself or          any OP_RECURSE calls inside the group that refer to the group itself or
3566          any internal group, because the offset is from the start of the whole          any internal or forward referenced group, because the offset is from
3567          regex. Temporarily terminate the pattern while doing this. */          the start of the whole regex. Temporarily terminate the pattern while
3568            doing this. */
3569    
3570          if (repeat_max <= 1)          if (repeat_max <= 1)
3571            {            {
3572            *code = OP_END;            *code = OP_END;
3573            adjust_recurse(previous, 1, utf8, cd);            adjust_recurse(previous, 1, utf8, cd, save_hwm);
3574            memmove(previous+1, previous, len);            memmove(previous+1, previous, len);
3575            code++;            code++;
3576            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2657  for (;; ptr++) Line 3588  for (;; ptr++)
3588            {            {
3589            int offset;            int offset;
3590            *code = OP_END;            *code = OP_END;
3591            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);            adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3592            memmove(previous + 2 + LINK_SIZE, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
3593            code += 2 + LINK_SIZE;            code += 2 + LINK_SIZE;
3594            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
# Line 2677  for (;; ptr++) Line 3608  for (;; ptr++)
3608        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
3609        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
3610        copies that we need. If we set a first char from the group, and didn't        copies that we need. If we set a first char from the group, and didn't
3611        set a required char, copy the latter from the former. */        set a required char, copy the latter from the former. If there are any
3612          forward reference subroutine calls in the group, there will be entries on
3613          the workspace list; replicate these with an appropriate increment. */
3614    
3615        else        else
3616          {          {
3617          if (repeat_min > 1)          if (repeat_min > 1)
3618            {            {
3619            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;            /* In the pre-compile phase, we don't actually do the replication. We
3620            for (i = 1; i < repeat_min; i++)            just adjust the length as if we had. */
3621    
3622              if (lengthptr != NULL)
3623                *lengthptr += (repeat_min - 1)*length_prevgroup;
3624    
3625              /* This is compiling for real */
3626    
3627              else
3628              {              {
3629              memcpy(code, previous, len);              if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3630              code += len;              for (i = 1; i < repeat_min; i++)
3631                  {
3632                  uschar *hc;
3633                  uschar *this_hwm = cd->hwm;
3634                  memcpy(code, previous, len);
3635                  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3636                    {
3637                    PUT(cd->hwm, 0, GET(hc, 0) + len);
3638                    cd->hwm += LINK_SIZE;
3639                    }
3640                  save_hwm = this_hwm;
3641                  code += len;
3642                  }
3643              }              }
3644            }            }
3645    
3646          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
3647          }          }
3648    
# Line 2697  for (;; ptr++) Line 3650  for (;; ptr++)
3650        the maximum is limited, it replicates the group in a nested fashion,        the maximum is limited, it replicates the group in a nested fashion,
3651        remembering the bracket starts on a stack. In the case of a zero minimum,        remembering the bracket starts on a stack. In the case of a zero minimum,
3652        the first one was set up above. In all cases the repeat_max now specifies        the first one was set up above. In all cases the repeat_max now specifies
3653        the number of additional copies needed. */        the number of additional copies needed. Again, we must remember to
3654          replicate entries on the forward reference list. */
3655    
3656        if (repeat_max >= 0)        if (repeat_max >= 0)
3657          {          {
3658          for (i = repeat_max - 1; i >= 0; i--)          /* In the pre-compile phase, we don't actually do the replication. We
3659            just adjust the length as if we had. For each repetition we must add 1
3660            to the length for BRAZERO and for all but the last repetition we must
3661            add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3662    
3663            if (lengthptr != NULL && repeat_max > 0)
3664              *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3665                2 - 2*LINK_SIZE;  /* Last one doesn't nest */
3666    
3667            /* This is compiling for real */
3668    
3669            else for (i = repeat_max - 1; i >= 0; i--)
3670            {            {
3671              uschar *hc;
3672              uschar *this_hwm = cd->hwm;
3673    
3674            *code++ = OP_BRAZERO + repeat_type;            *code++ = OP_BRAZERO + repeat_type;
3675    
3676            /* All but the final copy start a new nesting, maintaining the            /* All but the final copy start a new nesting, maintaining the
# Line 2718  for (;; ptr++) Line 3686  for (;; ptr++)
3686              }              }
3687    
3688            memcpy(code, previous, len);            memcpy(code, previous, len);
3689              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3690                {
3691                PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3692                cd->hwm += LINK_SIZE;
3693                }
3694              save_hwm = this_hwm;
3695            code += len;            code += len;
3696            }            }
3697    
# Line 2740  for (;; ptr++) Line 3714  for (;; ptr++)
3714        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. We
3715        can't just offset backwards from the current code point, because we        can't just offset backwards from the current code point, because we
3716        don't know if there's been an options resetting after the ket. The        don't know if there's been an options resetting after the ket. The
3717        correct offset was computed above. */        correct offset was computed above.
3718    
3719          Then, when we are doing the actual compile phase, check to see whether
3720          this group is a non-atomic one that could match an empty string. If so,
3721          convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3722          that runtime checking can be done. [This check is also applied to
3723          atomic groups at runtime, but in a different way.] */
3724    
3725        else code[-ketoffset] = OP_KETRMAX + repeat_type;        else
3726            {
3727            uschar *ketcode = code - ketoffset;
3728            uschar *bracode = ketcode - GET(ketcode, 1);
3729            *ketcode = OP_KETRMAX + repeat_type;
3730            if (lengthptr == NULL && *bracode != OP_ONCE)
3731              {
3732              uschar *scode = bracode;
3733              do
3734                {
3735                if (could_be_empty_branch(scode, ketcode, utf8))
3736                  {
3737                  *bracode += OP_SBRA - OP_BRA;
3738                  break;
3739                  }
3740                scode += GET(scode, 1);
3741                }
3742              while (*scode == OP_ALT);
3743              }
3744            }
3745        }        }
3746    
3747      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 2753  for (;; ptr++) Line 3752  for (;; ptr++)
3752        goto FAILED;        goto FAILED;
3753        }        }
3754    
3755      /* If the character following a repeat is '+', we wrap the entire repeated      /* If the character following a repeat is '+', or if certain optimization
3756      item inside OP_ONCE brackets. This is just syntactic sugar, taken from      tests above succeeded, possessive_quantifier is TRUE. For some of the
3757      Sun's Java package. The repeated item starts at tempcode, not at previous,      simpler opcodes, there is a special alternative opcode for this. For
3758      which might be the first part of a string whose (former) last char we      anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3759      repeated. However, we don't support '+' after a greediness '?'. */      The '+' notation is just syntactic sugar, taken from Sun's Java package,
3760        but the special opcodes can optimize it a bit. The repeated item starts at
3761        tempcode, not at previous, which might be the first part of a string whose
3762        (former) last char we repeated.
3763    
3764        Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3765        an 'upto' may follow. We skip over an 'exact' item, and then test the
3766        length of what remains before proceeding. */
3767    
3768      if (possessive_quantifier)      if (possessive_quantifier)
3769        {        {
3770        int len = code - tempcode;        int len;
3771        memmove(tempcode + 1+LINK_SIZE, tempcode, len);        if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3772        code += 1 + LINK_SIZE;            *tempcode == OP_NOTEXACT)
3773        len += 1 + LINK_SIZE;          tempcode += _pcre_OP_lengths[*tempcode];
3774        tempcode[0] = OP_ONCE;        len = code - tempcode;
3775        *code++ = OP_KET;        if (len > 0) switch (*tempcode)
3776        PUTINC(code, 0, len);          {
3777        PUT(tempcode, 1, len);          case OP_STAR:  *tempcode = OP_POSSTAR; break;
3778            case OP_PLUS:  *tempcode = OP_POSPLUS; break;
3779            case OP_QUERY: *tempcode = OP_POSQUERY; break;
3780            case OP_UPTO:  *tempcode = OP_POSUPTO; break;
3781    
3782            case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
3783            case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
3784            case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3785            case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
3786    
3787            case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
3788            case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
3789            case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3790            case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
3791    
3792            default:
3793            memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3794            code += 1 + LINK_SIZE;
3795            len += 1 + LINK_SIZE;
3796            tempcode[0] = OP_ONCE;
3797            *code++ = OP_KET;
3798            PUTINC(code, 0, len);
3799            PUT(tempcode, 1, len);
3800            break;
3801            }
3802        }        }
3803    
3804      /* In all cases we no longer have a previous item. We also set the      /* In all cases we no longer have a previous item. We also set the
# Line 2781  for (;; ptr++) Line 3811  for (;; ptr++)
3811      break;      break;
3812    
3813    
3814      /* Start of nested bracket sub-expression, or comment or lookahead or      /* ===================================================================*/
3815      lookbehind or option setting or condition. First deal with special things      /* Start of nested parenthesized sub-expression, or comment or lookahead or
3816      that can come after a bracket; all are introduced by ?, and the appearance      lookbehind or option setting or condition or all the other extended
3817      of any of them means that this is not a referencing group. They were      parenthesis forms. First deal with the specials; all are introduced by ?,
3818      checked for validity in the first pass over the string, so we don't have to      and the appearance of any of them means that this is not a capturing
3819      check for syntax errors here.  */      group. */
3820    
3821      case '(':      case '(':
3822      newoptions = options;      newoptions = options;
3823      skipbytes = 0;      skipbytes = 0;
3824        bravalue = OP_CBRA;
3825        save_hwm = cd->hwm;
3826        reset_bracount = FALSE;
3827    
3828      if (*(++ptr) == '?')      if (*(++ptr) == '?')
3829        {        {
3830        int set, unset;        int i, set, unset, namelen;
3831        int *optset;        int *optset;
3832          const uschar *name;
3833          uschar *slot;
3834    
3835        switch (*(++ptr))        switch (*(++ptr))
3836          {          {
3837          case '#':                 /* Comment; skip to ket */          case '#':                 /* Comment; skip to ket */
3838          ptr++;          ptr++;
3839          while (*ptr != ')') ptr++;          while (*ptr != 0 && *ptr != ')') ptr++;
3840            if (*ptr == 0)
3841              {
3842              *errorcodeptr = ERR18;
3843              goto FAILED;
3844              }
3845          continue;          continue;
3846    
3847          case ':':                 /* Non-extracting bracket */  
3848            /* ------------------------------------------------------------ */
3849            case '|':                 /* Reset capture count for each branch */
3850            reset_bracount = TRUE;
3851            /* Fall through */
3852    
3853            /* ------------------------------------------------------------ */
3854            case ':':                 /* Non-capturing bracket */
3855          bravalue = OP_BRA;          bravalue = OP_BRA;
3856          ptr++;          ptr++;
3857          break;          break;
3858    
3859    
3860            /* ------------------------------------------------------------ */
3861          case '(':          case '(':
3862          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
3863    
3864          /* Condition to test for recursion */          /* A condition can be an assertion, a number (referring to a numbered
3865            group), a name (referring to a named group), or 'R', referring to
3866            recursion. R<digits> and R&name are also permitted for recursion tests.
3867    
3868            There are several syntaxes for testing a named group: (?(name)) is used
3869            by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3870    
3871            There are two unfortunate ambiguities, caused by history. (a) 'R' can
3872            be the recursive thing or the name 'R' (and similarly for 'R' followed
3873            by digits), and (b) a number could be a name that consists of digits.
3874            In both cases, we look for a name first; if not found, we try the other
3875            cases. */
3876    
3877            /* For conditions that are assertions, check the syntax, and then exit
3878            the switch. This will take control down to where bracketed groups,
3879            including assertions, are processed. */
3880    
3881            if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3882              break;
3883    
3884            /* Most other conditions use OP_CREF (a couple change to OP_RREF
3885            below), and all need to skip 3 bytes at the start of the group. */
3886    
3887            code[1+LINK_SIZE] = OP_CREF;
3888            skipbytes = 3;
3889            refsign = -1;
3890    
3891            /* Check for a test for recursion in a named group. */
3892    
3893            if (ptr[1] == 'R' && ptr[2] == '&')
3894              {
3895              terminator = -1;
3896              ptr += 2;
3897              code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
3898              }
3899    
3900            /* Check for a test for a named group's having been set, using the Perl
3901            syntax (?(<name>) or (?('name') */
3902    
3903            else if (ptr[1] == '<')
3904              {
3905              terminator = '>';
3906              ptr++;
3907              }
3908            else if (ptr[1] == '\'')
3909              {
3910              terminator = '\'';
3911              ptr++;
3912              }
3913            else
3914              {
3915              terminator = 0;
3916              if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3917              }
3918    
3919            /* We now expect to read a name; anything else is an error */
3920    
3921            if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3922              {
3923              ptr += 1;  /* To get the right offset */
3924              *errorcodeptr = ERR28;
3925              goto FAILED;
3926              }
3927    
3928            /* Read the name, but also get it as a number if it's all digits */
3929    
3930          if (ptr[1] == 'R')          recno = 0;
3931            name = ++ptr;
3932            while ((cd->ctypes[*ptr] & ctype_word) != 0)
3933              {
3934              if (recno >= 0)
3935                recno = ((digitab[*ptr] & ctype_digit) != 0)?
3936                  recno * 10 + *ptr - '0' : -1;
3937              ptr++;
3938              }
3939            namelen = ptr - name;
3940    
3941            if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3942            {            {
3943            code[1+LINK_SIZE] = OP_CREF;            ptr--;      /* Error offset */
3944            PUT2(code, 2+LINK_SIZE, CREF_RECURSE);            *errorcodeptr = ERR26;
3945            skipbytes = 3;            goto FAILED;
           ptr += 3;  
3946            }            }
3947    
3948          /* Condition to test for a numbered subpattern match. We know that          /* Do no further checking in the pre-compile phase. */
         if a digit follows ( then there will just be digits until ) because  
         the syntax was checked in the first pass. */  
3949    
3950          else if ((digitab[ptr[1]] && ctype_digit) != 0)          if (lengthptr != NULL) break;
3951    
3952            /* In the real compile we do the work of looking for the actual
3953            reference. If the string started with "+" or "-" we require the rest to
3954            be digits, in which case recno will be set. */
3955    
3956            if (refsign > 0)
3957            {            {
3958            int condref;                 /* Don't amalgamate; some compilers */            if (recno <= 0)
           condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */  
           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';  
           if (condref == 0)  
3959              {              {
3960              *errorcodeptr = ERR35;              *errorcodeptr = ERR58;
3961              goto FAILED;              goto FAILED;
3962              }              }
3963            ptr++;            if (refsign == '-')
3964            code[1+LINK_SIZE] = OP_CREF;              {
3965            PUT2(code, 2+LINK_SIZE, condref);              recno = cd->bracount - recno + 1;
3966            skipbytes = 3;              if (recno <= 0)
3967                  {
3968                  *errorcodeptr = ERR15;
3969                  goto FAILED;
3970                  }
3971                }
3972              else recno += cd->bracount;
3973              PUT2(code, 2+LINK_SIZE, recno);
3974              break;
3975              }
3976    
3977            /* Otherwise (did not start with "+" or "-"), start by looking for the
3978            name. */
3979    
3980            slot = cd->name_table;
3981            for (i = 0; i < cd->names_found; i++)
3982              {
3983              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3984              slot += cd->name_entry_size;
3985              }
3986    
3987            /* Found a previous named subpattern */
3988    
3989            if (i < cd->names_found)
3990              {
3991              recno = GET2(slot, 0);
3992              PUT2(code, 2+LINK_SIZE, recno);
3993              }
3994    
3995            /* Search the pattern for a forward reference */
3996    
3997            else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3998                            (options & PCRE_EXTENDED) != 0)) > 0)
3999              {
4000              PUT2(code, 2+LINK_SIZE, i);
4001              }
4002    
4003            /* If terminator == 0 it means that the name followed directly after
4004            the opening parenthesis [e.g. (?(abc)...] and in this case there are
4005            some further alternatives to try. For the cases where terminator != 0
4006            [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4007            now checked all the possibilities, so give an error. */
4008    
4009            else if (terminator != 0)
4010              {
4011              *errorcodeptr = ERR15;
4012              goto FAILED;
4013              }
4014    
4015            /* Check for (?(R) for recursion. Allow digits after R to specify a
4016            specific group number. */
4017    
4018            else if (*name == 'R')
4019              {
4020              recno = 0;
4021              for (i = 1; i < namelen; i++)
4022                {
4023                if ((digitab[name[i]] & ctype_digit) == 0)
4024                  {
4025                  *errorcodeptr = ERR15;
4026                  goto FAILED;
4027                  }
4028                recno = recno * 10 + name[i] - '0';
4029                }
4030              if (recno == 0) recno = RREF_ANY;
4031              code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
4032              PUT2(code, 2+LINK_SIZE, recno);
4033              }
4034    
4035            /* Similarly, check for the (?(DEFINE) "condition", which is always
4036            false. */
4037    
4038            else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4039              {
4040              code[1+LINK_SIZE] = OP_DEF;
4041              skipbytes = 1;
4042              }
4043    
4044            /* Check for the "name" actually being a subpattern number. */
4045    
4046            else if (recno > 0)
4047              {
4048              PUT2(code, 2+LINK_SIZE, recno);
4049              }
4050    
4051            /* Either an unidentified subpattern, or a reference to (?(0) */
4052    
4053            else
4054              {
4055              *errorcodeptr = (recno == 0)? ERR35: ERR15;
4056              goto FAILED;
4057            }            }
         /* For conditions that are assertions, we just fall through, having  
         set bravalue above. */  
4058          break;          break;
4059    
4060    
4061            /* ------------------------------------------------------------ */
4062          case '=':                 /* Positive lookahead */          case '=':                 /* Positive lookahead */
4063          bravalue = OP_ASSERT;          bravalue = OP_ASSERT;
4064          ptr++;          ptr++;
4065          break;          break;
4066    
4067    
4068            /* ------------------------------------------------------------ */
4069          case '!':                 /* Negative lookahead */          case '!':                 /* Negative lookahead */
4070          bravalue = OP_ASSERT_NOT;          bravalue = OP_ASSERT_NOT;
4071          ptr++;          ptr++;
4072          break;          break;
4073    
4074          case '<':                 /* Lookbehinds */  
4075          switch (*(++ptr))          /* ------------------------------------------------------------ */
4076            case '<':                 /* Lookbehind or named define */
4077            switch (ptr[1])
4078            {            {
4079            case '=':               /* Positive lookbehind */            case '=':               /* Positive lookbehind */
4080            bravalue = OP_ASSERTBACK;            bravalue = OP_ASSERTBACK;
4081            ptr++;            ptr += 2;
4082            break;            break;
4083    
4084            case '!':               /* Negative lookbehind */            case '!':               /* Negative lookbehind */
4085            bravalue = OP_ASSERTBACK_NOT;            bravalue = OP_ASSERTBACK_NOT;
4086            ptr++;            ptr += 2;
4087            break;            break;
4088    
4089              default:                /* Could be name define, else bad */
4090              if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4091              ptr++;                  /* Correct offset for error */
4092              *errorcodeptr = ERR24;
4093              goto FAILED;
4094            }            }
4095          break;          break;
4096    
4097    
4098            /* ------------------------------------------------------------ */
4099          case '>':                 /* One-time brackets */          case '>':                 /* One-time brackets */
4100          bravalue = OP_ONCE;          bravalue = OP_ONCE;
4101          ptr++;          ptr++;
4102          break;          break;
4103    
4104    
4105            /* ------------------------------------------------------------ */
4106          case 'C':                 /* Callout - may be followed by digits; */          case 'C':                 /* Callout - may be followed by digits; */
4107          previous_callout = code;  /* Save for later completion */          previous_callout = code;  /* Save for later completion */
4108          after_manual_callout = 1; /* Skip one item before completing */          after_manual_callout = 1; /* Skip one item before completing */
4109          *code++ = OP_CALLOUT;     /* Already checked that the terminating */          *code++ = OP_CALLOUT;
4110            {                       /* closing parenthesis is present. */            {
4111            int n = 0;            int n = 0;
4112            while ((digitab[*(++ptr)] & ctype_digit) != 0)            while ((digitab[*(++ptr)] & ctype_digit) != 0)
4113              n = n * 10 + *ptr - '0';              n = n * 10 + *ptr - '0';
4114              if (*ptr != ')')
4115                {
4116                *errorcodeptr = ERR39;
4117                goto FAILED;
4118                }
4119            if (n > 255)            if (n > 255)
4120              {              {
4121              *errorcodeptr = ERR38;              *errorcodeptr = ERR38;
# Line 2896  for (;; ptr++) Line 4129  for (;; ptr++)
4129          previous = NULL;          previous = NULL;
4130          continue;          continue;
4131    
4132          case 'P':                 /* Named subpattern handling */  
4133          if (*(++ptr) == '<')      /* Definition */          /* ------------------------------------------------------------ */
4134            case 'P':                 /* Python-style named subpattern handling */
4135            if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
4136              {
4137              is_recurse = *ptr == '>';
4138              terminator = ')';
4139              goto NAMED_REF_OR_RECURSE;
4140              }
4141            else if (*ptr != '<')    /* Test for Python-style definition */
4142              {
4143              *errorcodeptr = ERR41;
4144              goto FAILED;
4145              }
4146            /* Fall through to handle (?P< as (?< is handled */
4147    
4148    
4149            /* ------------------------------------------------------------ */
4150            DEFINE_NAME:    /* Come here from (?< handling */
4151            case '\'':
4152            {            {
4153            int i, namelen;            terminator = (*ptr == '<')? '>' : '\'';
4154            uschar *slot = cd->name_table;            name = ++ptr;
4155            const uschar *name;     /* Don't amalgamate; some compilers */  
4156            name = ++ptr;           /* grumble at autoincrement in declaration */            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4157              namelen = ptr - name;
4158    
4159            while (*ptr++ != '>');            /* In the pre-compile phase, just do a syntax check. */
           namelen = ptr - name - 1;  
4160    
4161            for (i = 0; i < cd->names_found; i++)            if (lengthptr != NULL)
4162              {              {
4163              int crc = memcmp(name, slot+2, namelen);              if (*ptr != terminator)
             if (crc == 0)  
4164                {                {
4165                if (slot[2+namelen] == 0)                *errorcodeptr = ERR42;
4166                  goto FAILED;
4167                  }
4168                if (cd->names_found >= MAX_NAME_COUNT)
4169                  {
4170                  *errorcodeptr = ERR49;
4171                  goto FAILED;
4172                  }
4173                if (namelen + 3 > cd->name_entry_size)
4174                  {
4175                  cd->name_entry_size = namelen + 3;
4176                  if (namelen > MAX_NAME_SIZE)
4177                  {                  {
4178                  *errorcodeptr = ERR43;                  *errorcodeptr = ERR48;
4179                  goto FAILED;                  goto FAILED;
4180                  }                  }
               crc = -1;             /* Current name is substring */  
4181                }                }
             if (crc < 0)  
               {  
               memmove(slot + cd->name_entry_size, slot,  
                 (cd->names_found - i) * cd->name_entry_size);  
               break;  
               }  
             slot += cd->name_entry_size;  
4182              }              }
4183    
4184            PUT2(slot, 0, *brackets + 1);            /* In the real compile, create the entry in the table */
4185            memcpy(slot + 2, name, namelen);  
4186            slot[2+namelen] = 0;            else
4187            cd->names_found++;              {
4188            goto NUMBERED_GROUP;              slot = cd->name_table;
4189                for (i = 0; i < cd->names_found; i++)
4190                  {
4191                  int crc = memcmp(name, slot+2, namelen);
4192                  if (crc == 0)
4193                    {
4194                    if (slot[2+namelen] == 0)
4195                      {
4196                      if ((options & PCRE_DUPNAMES) == 0)
4197                        {
4198                        *errorcodeptr = ERR43;
4199                        goto FAILED;
4200                        }
4201                      }
4202                    else crc = -1;      /* Current name is substring */
4203                    }
4204                  if (crc < 0)
4205                    {
4206                    memmove(slot + cd->name_entry_size, slot,
4207                      (cd->names_found - i) * cd->name_entry_size);
4208                    break;
4209                    }
4210                  slot += cd->name_entry_size;
4211                  }
4212    
4213                PUT2(slot, 0, cd->bracount + 1);
4214                memcpy(slot + 2, name, namelen);
4215                slot[2+namelen] = 0;
4216                }
4217            }            }
4218    
4219          if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */          /* In both cases, count the number of names we've encountered. */
4220    
4221            ptr++;                    /* Move past > or ' */
4222            cd->names_found++;
4223            goto NUMBERED_GROUP;
4224    
4225    
4226            /* ------------------------------------------------------------ */
4227            case '&':                 /* Perl recursion/subroutine syntax */
4228            terminator = ')';
4229            is_recurse = TRUE;
4230            /* Fall through */
4231    
4232            /* We come here from the Python syntax above that handles both
4233            references (?P=name) and recursion (?P>name), as well as falling
4234            through from the Perl recursion syntax (?&name). */
4235    
4236            NAMED_REF_OR_RECURSE:
4237            name = ++ptr;
4238            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4239            namelen = ptr - name;
4240    
4241            /* In the pre-compile phase, do a syntax check and set a dummy
4242            reference number. */
4243    
4244            if (lengthptr != NULL)
4245            {            {
4246            int i, namelen;            if (*ptr != terminator)
4247            int type = *ptr++;              {
4248            const uschar *name = ptr;              *errorcodeptr = ERR42;
4249            uschar *slot = cd->name_table;              goto FAILED;
4250                }
4251              if (namelen > MAX_NAME_SIZE)
4252                {
4253                *errorcodeptr = ERR48;
4254                goto FAILED;
4255                }
4256              recno = 0;
4257              }
4258    
4259            while (*ptr != ')') ptr++;          /* In the real compile, seek the name in the table */
           namelen = ptr - name;  
4260    
4261            else
4262              {
4263              slot = cd->name_table;
4264            for (i = 0; i < cd->names_found; i++)            for (i = 0; i < cd->names_found; i++)
4265              {              {
4266              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;              if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4267              slot += cd->name_entry_size;              slot += cd->name_entry_size;
4268              }              }
4269            if (i >= cd->names_found)  
4270              if (i < cd->names_found)         /* Back reference */
4271                {
4272                recno = GET2(slot, 0);
4273                }
4274              else if ((recno =                /* Forward back reference */
4275                        find_parens(ptr, cd->bracount, name, namelen,
4276                          (options & PCRE_EXTENDED) != 0)) <= 0)
4277              {              {
4278              *errorcodeptr = ERR15;              *errorcodeptr = ERR15;
4279              goto FAILED;              goto FAILED;
4280              }              }
4281              }
4282    
4283            recno = GET2(slot, 0);          /* In both phases, we can now go to the code that handles numerical
4284            recursion or backreferences. */
           if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */  
   
           /* Back reference */  
4285    
4286            previous = code;          if (is_recurse) goto HANDLE_RECURSION;
4287            *code++ = OP_REF;            else goto HANDLE_REFERENCE;
           PUT2INC(code, 0, recno);  
           cd->backref_map |= (recno < 32)? (1 << recno) : 1;  
           if (recno > cd->top_backref) cd->top_backref = recno;  
           continue;  
           }  
4288    
         /* Should never happen */  
         break;  
4289    
4290          case 'R':                 /* Pattern recursion */          /* ------------------------------------------------------------ */
4291            case 'R':                 /* Recursion */
4292          ptr++;                    /* Same as (?0)      */          ptr++;                    /* Same as (?0)      */
4293          /* Fall through */          /* Fall through */
4294    
         /* Recursion or "subroutine" call */  
4295    
4296          case '0': case '1': case '2': case '3': case '4':          /* ------------------------------------------------------------ */
4297          case '5': case '6': case '7': case '8': case '9':          case '-': case '+':
4298            case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4299            case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4300            {            {
4301            const uschar *called;            const uschar *called;
4302    
4303              if ((refsign = *ptr) == '+') ptr++;
4304              else if (refsign == '-')
4305                {
4306                if ((digitab[ptr[1]] & ctype_digit) == 0)
4307                  goto OTHER_CHAR_AFTER_QUERY;
4308                ptr++;
4309                }
4310    
4311            recno = 0;            recno = 0;
4312            while((digitab[*ptr] & ctype_digit) != 0)            while((digitab[*ptr] & ctype_digit) != 0)
4313              recno = recno * 10 + *ptr++ - '0';              recno = recno * 10 + *ptr++ - '0';
4314    
4315              if (*ptr != ')')
4316                {
4317                *errorcodeptr = ERR29;
4318                goto FAILED;
4319                }
4320    
4321              if (refsign == '-')
4322                {
4323                if (recno == 0)
4324                  {
4325                  *errorcodeptr = ERR58;
4326                  goto FAILED;
4327                  }
4328                recno = cd->bracount - recno + 1;
4329                if (recno <= 0)
4330                  {
4331                  *errorcodeptr = ERR15;
4332                  goto FAILED;
4333                  }
4334                }
4335              else if (refsign == '+')
4336                {
4337                if (recno == 0)
4338                  {
4339                  *errorcodeptr = ERR58;
4340                  goto FAILED;
4341                  }
4342                recno += cd->bracount;
4343                }
4344    
4345            /* Come here from code above that handles a named recursion */            /* Come here from code above that handles a named recursion */
4346    
4347            HANDLE_RECURSION:            HANDLE_RECURSION:
4348    
4349            previous = code;            previous = code;
4350              called = cd->start_code;
4351    
4352            /* Find the bracket that is being referenced. Temporarily end the            /* When we are actually compiling, find the bracket that is being
4353            regex in case it doesn't exist. */            referenced. Temporarily end the regex in case it doesn't exist before
4354              this point. If we end up with a forward reference, first check that
4355              the bracket does occur later so we can give the error (and position)
4356              now. Then remember this forward reference in the workspace so it can
4357              be filled in at the end. */
4358    
4359            *code = OP_END;            if (lengthptr == NULL)
           called = (recno == 0)?  
             cd->start_code : find_bracket(cd->start_code, utf8, recno);  
   
           if (called == NULL)  
4360              {              {
4361              *errorcodeptr = ERR15;              *code = OP_END;
4362              goto FAILED;              if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
             }  
4363    
4364            /* If the subpattern is still open, this is a recursive call. We              /* Forward reference */
           check to see if this is a left recursion that could loop for ever,  
           and diagnose that case. */  
4365    
4366            if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))              if (called == NULL)
4367              {                {
4368              *errorcodeptr = ERR40;                if (find_parens(ptr, cd->bracount, NULL, recno,
4369              goto FAILED;                     (options & PCRE_EXTENDED) != 0) < 0)
4370                    {
4371                    *errorcodeptr = ERR15;
4372                    goto FAILED;
4373                    }
4374                  called = cd->start_code + recno;
4375                  PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4376                  }
4377    
4378                /* If not a forward reference, and the subpattern is still open,
4379                this is a recursive call. We check to see if this is a left
4380                recursion that could loop for ever, and diagnose that case. */
4381    
4382                else if (GET(called, 1) == 0 &&
4383                         could_be_empty(called, code, bcptr, utf8))
4384                  {
4385                  *errorcodeptr = ERR40;
4386                  goto FAILED;
4387                  }
4388              }              }
4389    
4390            /* Insert the recursion/subroutine item */            /* Insert the recursion/subroutine item, automatically wrapped inside
4391              "once" brackets. Set up a "previous group" length so that a
4392              subsequent quantifier will work. */
4393    
4394              *code = OP_ONCE;
4395              PUT(code, 1, 2 + 2*LINK_SIZE);
4396              code += 1 + LINK_SIZE;
4397    
4398            *code = OP_RECURSE;            *code = OP_RECURSE;
4399            PUT(code, 1, called - cd->start_code);            PUT(code, 1, called - cd->start_code);
4400            code += 1 + LINK_SIZE;            code += 1 + LINK_SIZE;
4401    
4402              *code = OP_KET;
4403              PUT(code, 1, 2 + 2*LINK_SIZE);
4404              code += 1 + LINK_SIZE;
4405    
4406              length_prevgroup = 3 + 3*LINK_SIZE;
4407            }            }
4408    
4409            /* Can't determine a first byte now */
4410    
4411            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4412          continue;          continue;
4413    
         /* Character after (? not specially recognized */  
4414    
4415          default:                  /* Option setting */          /* ------------------------------------------------------------ */
4416            default:              /* Other characters: check option setting */
4417            OTHER_CHAR_AFTER_QUERY:
4418          set = unset = 0;          set = unset = 0;
4419          optset = &set;          optset = &set;
4420    
# Line 3036  for (;; ptr++) Line 4424  for (;; ptr++)
4424              {              {
4425              case '-': optset = &unset; break;              case '-': optset = &unset; break;
4426    
4427                case 'J':    /* Record that it changed in the external options */
4428                *optset |= PCRE_DUPNAMES;
4429                cd->external_options |= PCRE_JCHANGED;
4430                break;
4431    
4432              case 'i': *optset |= PCRE_CASELESS; break;              case 'i': *optset |= PCRE_CASELESS; break;
4433              case 'm': *optset |= PCRE_MULTILINE; break;              case 'm': *optset |= PCRE_MULTILINE; break;
4434              case 's': *optset |= PCRE_DOTALL; break;              case 's': *optset |= PCRE_DOTALL; break;
4435              case 'x': *optset |= PCRE_EXTENDED; break;              case 'x': *optset |= PCRE_EXTENDED; break;
4436              case 'U': *optset |= PCRE_UNGREEDY; break;              case 'U': *optset |= PCRE_UNGREEDY; break;
4437              case 'X': *optset |= PCRE_EXTRA; break;              case 'X': *optset |= PCRE_EXTRA; break;
4438    
4439                default:  *errorcodeptr = ERR12;
4440                          ptr--;    /* Correct the offset */
4441                          goto FAILED;
4442              }              }
4443            }            }
4444    
# Line 3050  for (;; ptr++) Line 4447  for (;; ptr++)
4447          newoptions = (options | set) & (~unset);          newoptions = (options | set) & (~unset);
4448    
4449          /* If the options ended with ')' this is not the start of a nested          /* If the options ended with ')' this is not the start of a nested
4450          group with option changes, so the options change at this level. Compile          group with option changes, so the options change at this level. If this
4451          code to change the ims options if this setting actually changes any of          item is right at the start of the pattern, the options can be
4452          them. We also pass the new setting back so that it can be put at the          abstracted and made external in the pre-compile phase, and ignored in
4453          start of any following branches, and when this group ends (if we are in          the compile phase. This can be helpful when matching -- for instance in
4454          a group), a resetting item can be compiled.          caseless checking of required bytes.
4455    
4456          Note that if this item is right at the start of the pattern, the          If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4457          options will have been abstracted and made global, so there will be no          definitely *not* at the start of the pattern because something has been
4458          change to compile. */          compiled. In the pre-compile phase, however, the code pointer can have
4459            that value after the start, because it gets reset as code is discarded
4460            during the pre-compile. However, this can happen only at top level - if
4461            we are within parentheses, the starting BRA will still be present. At
4462            any parenthesis level, the length value can be used to test if anything
4463            has been compiled at that level. Thus, a test for both these conditions
4464            is necessary to ensure we correctly detect the start of the pattern in
4465            both phases.
4466    
4467            If we are not at the pattern start, compile code to change the ims
4468            options if this setting actually changes any of them. We also pass the
4469            new setting back so that it can be put at the start of any following
4470            branches, and when this group ends (if we are in a group), a resetting
4471            item can be compiled. */
4472    
4473          if (*ptr == ')')          if (*ptr == ')')
4474            {            {
4475            if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))            if (code == cd->start_code + 1 + LINK_SIZE &&
4476                   (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4477              {              {
4478              *code++ = OP_OPT;              cd->external_options = newoptions;
4479              *code++ = newoptions & PCRE_IMS;              options = newoptions;
4480              }              }
4481             else
4482                {
4483                if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4484                  {
4485                  *code++ = OP_OPT;
4486                  *code++ = newoptions & PCRE_IMS;
4487                  }
4488    
4489            /* Change options at this level, and pass them back for use              /* Change options at this level, and pass them back for use
4490            in subsequent branches. Reset the greedy defaults and the case              in subsequent branches. Reset the greedy defaults and the case
4491            value for firstbyte and reqbyte. */              value for firstbyte and reqbyte. */
4492    
4493            *optionsptr = options = newoptions;              *optionsptr = options = newoptions;
4494            greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4495            greedy_non_default = greedy_default ^ 1;              greedy_non_default = greedy_default ^ 1;
4496            req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;              req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4497                }
4498    
4499            previous = NULL;       /* This item can't be repeated */            previous = NULL;       /* This item can't be repeated */
4500            continue;              /* It is complete */            continue;              /* It is complete */
# Line 3088  for (;; ptr++) Line 4507  for (;; ptr++)
4507    
4508          bravalue = OP_BRA;          bravalue = OP_BRA;
4509          ptr++;          ptr++;
4510          }          }     /* End of switch for character following (? */
4511        }        }       /* End of (? handling */
4512    
4513      /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become      /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4514      non-capturing and behave like (?:...) brackets */      all unadorned brackets become non-capturing and behave like (?:...)
4515        brackets. */
4516    
4517      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4518        {        {
4519        bravalue = OP_BRA;        bravalue = OP_BRA;
4520        }        }
4521    
4522      /* Else we have a referencing group; adjust the opcode. If the bracket      /* Else we have a capturing group. */
     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and  
     arrange for the true number to follow later, in an OP_BRANUMBER item. */  
4523    
4524      else      else
4525        {        {
4526        NUMBERED_GROUP:        NUMBERED_GROUP:
4527        if (++(*brackets) > EXTRACT_BASIC_MAX)        cd->bracount += 1;
4528          {        PUT2(code, 1+LINK_SIZE, cd->bracount);
4529          bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;        skipbytes = 2;
         code[1+LINK_SIZE] = OP_BRANUMBER;  
         PUT2(code, 2+LINK_SIZE, *brackets);  
         skipbytes = 3;